diff --git a/.cvsignore b/.cvsignore new file mode 100644 index 0000000000000000000000000000000000000000..b010390540b4a3187de8b78f58263a3e94b10d16 --- /dev/null +++ b/.cvsignore @@ -0,0 +1,41 @@ +.Xrefs +.Xrefs-2.5 +aclocal.m4 +config.log +config.status +config.cache +config.guess +config.sub +configure +Makefile +autoMakefile +autoMakefile.in +.deps +tags +TAGS +lustre*.tar.gz +cscope.files +cscope.out +autom4te-2.53.cache +autom4te.cache +depcomp +compile +.*.cmd +.mergeinfo-* +Rules +missing +mkinstalldirs +install-sh +.depend +.tmp_versions +config.h +config.h.in +Module.symvers +Modules.symvers +stamp-h1 +INSTALL +.pc +patches +series +lustre.spec +acl.size diff --git a/Makefile.in b/Makefile.in index 3ae031de33e8b0d6911f1c0a316308c1055be9d9..84ab4190040aa3da34ce0041741c212415f9c77d 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,4 +1,5 @@ -subdir-m += lnet +subdir-m += @LIBCFS_SUBDIR@ lnet subdir-m += lustre +subdir-m += lustre-iokit @INCLUDE_RULES@ diff --git a/autoMakefile.am b/autoMakefile.am index ab75c70e312c5f2e268d1fc94e88873e3b26c96a..2a9976fd2a459fb08f34283d2ebd97ee0b2a6050 100644 --- a/autoMakefile.am +++ b/autoMakefile.am @@ -1,7 +1,7 @@ -SUBDIRS := @LDISKFS_SUBDIR@ . @LIBSYSIO_SUBDIR@ @SNMP_SUBDIR@ lnet lustre -DIST_SUBDIRS := @SNMP_DIST_SUBDIR@ libsysio ldiskfs lnet lustre -SOURCES_SUBDIRS := @LDISKFS_SUBDIR@ lnet lustre -RPM_SUBDIRS := @LDISKFS_SUBDIR@ +SUBDIRS := @LDISKFS_SUBDIR@ . 
@LIBSYSIO_SUBDIR@ @SNMP_SUBDIR@ @LUSTREIOKIT_SUBDIR@ @LIBCFS_SUBDIR@ lnet lustre +DIST_SUBDIRS := @SNMP_DIST_SUBDIR@ libsysio ldiskfs @LUSTREIOKIT_SUBDIR@ @LIBCFS_SUBDIR@ lnet lustre +SOURCES_SUBDIRS := @LDISKFS_SUBDIR@ @LIBCFS_SUBDIR@ lnet lustre +RPM_SUBDIRS := @LDISKFS_SUBDIR@ @LUSTREIOKIT_SUBDIR@ if LDISKFS_ENABLED EXTRA_SOURCES := @SYMVERFILE@ endif @@ -12,7 +12,9 @@ EXTRA_DIST += config.h.in if LDISKFS_ENABLED @SYMVERFILE@: @LDISKFS_DIR@/@SYMVERFILE@ - cp @LDISKFS_DIR@/@SYMVERFILE@ @SYMVERFILE@ + touch @SYMVERFILE@ + -grep -v ldiskfs @SYMVERFILE@ > @SYMVERFILE@.old + cat @SYMVERFILE@.old @LDISKFS_DIR@/@SYMVERFILE@ > @SYMVERFILE@ endif CSTK=/tmp/checkstack @@ -20,7 +22,7 @@ CSTKO=/tmp/checkstack.orig checkstack: [ -f ${CSTK} -a ! -s ${CSTKO} ] && mv -f ${CSTK} ${CSTKO} || true - { for i in lustre/* lnet/* ; do \ + { for i in lustre/* lnet/* libcfs/*; do \ MOD=$$i/`basename $$i`; \ if [ $$i = "lustre/llite" ]; then MOD=$$i/lustre; fi; \ [ -f $$MOD.ko ] && MOD=$$MOD.ko || MOD=$$MOD.o; \ diff --git a/build/.cvsignore b/build/.cvsignore index 795e13b1899bf9efc3326d058853d6d81fc0ee14..e42cfbd5f9e124ef22ab5fb54d8810abb1f8edc5 100644 --- a/build/.cvsignore +++ b/build/.cvsignore @@ -10,3 +10,4 @@ autoMakefile.in autoMakefile lustre.spec Module.symvers +Modules.symvers diff --git a/build/apidoc.publish b/build/apidoc.publish new file mode 100755 index 0000000000000000000000000000000000000000..69ed1cd5213c38350b030a04c2c51115ea52dee9 --- /dev/null +++ b/build/apidoc.publish @@ -0,0 +1,79 @@ +#! /bin/bash + +#set -x + +OPTVAL=`getopt -o -b:l:du:v -n 'apidoc.publish' -- "$@"` + +# Note the quotes around `$OPTVAL': they are essential! 
+eval set -- "$OPTVAL" + +if [ -d lustre/CVS ] ;then + if [ -r lustre/CVS/Tag ] ;then + branch=$(cut -c2- lustre/CVS/Tag) + else + branch=HEAD + fi +else + branch='' +fi + +verbose=0 +label='' +user=$USER + +while true ;do + case "$1" in + -b) + branch="$2" + shift 2 + ;; + -l) + label="$label$2" + shift 2 + ;; + -u) + user="$2" + shift 2 + ;; + -d) + label="$label$(date '+%Y.%m.%d')" + shift + ;; + -v) + verbose=$(($verbose + 1)) + shift + ;; + --) + shift + break + ;; + *) + echo "Internal error!" + exit 1 + ;; + esac +done + +if [ -z "$branch" ] ;then + echo "No branch specified" + exit 1 +fi + +echo "Updating documentation for branch \`$branch'" +if [ -n "$label" ] ;then + echo "Label: $label" +fi + +dst="$user@shell.lustre.sun.com:/home/www/apidoc/$branch$label" + +opt="-rltvzp --delete" +sshopt="ssh -l $user -oPort=922" +if [ -d apidoc.api/html ] ;then + chmod -R a+rx apidoc.api/html + rsync $opt --rsh="$sshopt" apidoc.api/html "$dst/api" +fi +if [ -d apidoc.ref/html ] ;then + chmod -R a+rx apidoc.ref/html + rsync $opt --rsh="$sshopt" apidoc.ref/html "$dst/ref" +fi + diff --git a/build/autoMakefile.am.toplevel b/build/autoMakefile.am.toplevel index dfb962407495032cb4bfdac29788302cfe0d09c0..3547b400b011653c49a04667a3b7d79d70331ab2 100644 --- a/build/autoMakefile.am.toplevel +++ b/build/autoMakefile.am.toplevel @@ -60,6 +60,7 @@ endif # MODULES dist-hook: rm -f $(distdir)/ldiskfs/*.spec + rm -f $(distdir)/lustre-iokit/*.spec find $(distdir) -name .deps -o \ -name CVS -o \ -name .svn -o \ diff --git a/build/autoconf/lustre-build-linux.m4 b/build/autoconf/lustre-build-linux.m4 index 193c6c7b9aa033bbc45e2816e21c368c63f03254..9ce393e8200750c1fe1137c810a62ed55eb23035 100644 --- a/build/autoconf/lustre-build-linux.m4 +++ b/build/autoconf/lustre-build-linux.m4 @@ -230,6 +230,20 @@ LB_LINUX_TRY_COMPILE([],[],[ LB_LINUX_RELEASE ]) # end of LB_LINUX_PATH +# LB_LINUX_SYMVERFILE +# SLES 9 uses a different name for this file - unsure about vanilla kernels +# around this 
version, but it matters for servers only. +AC_DEFUN([LB_LINUX_SYMVERFILE], + [AC_MSG_CHECKING([name of module symbol version file]) + if grep -q Modules.symvers $LINUX/scripts/Makefile.modpost ; then + SYMVERFILE=Modules.symvers + else + SYMVERFILE=Module.symvers + fi + AC_MSG_RESULT($SYMVERFILE) + AC_SUBST(SYMVERFILE) +]) + # # # LB_LINUX_MODPOST @@ -445,6 +459,12 @@ fi # AC_DEFUN([LB_PROG_LINUX], [LB_LINUX_PATH +LB_LINUX_ARCH +LB_LINUX_SYMVERFILE + +if test "x$LINUX_ARCH" = "xpowerpc64"; then + CFLAGS="$CFLAGS -m64" +fi LB_LINUX_CONFIG([MODULES],[],[ AC_MSG_ERROR([module support is required to build Lustre kernel modules.]) @@ -494,7 +514,7 @@ AC_DEFUN([LB_LINUX_CONDITIONALS], # or check AC_DEFUN([LB_CHECK_SYMBOL_EXPORT], [AC_MSG_CHECKING([if Linux was built with symbol $1 is exported]) -grep -q -E '[[[:space:]]]$1[[[:space:]]]' $LINUX/Module.symvers 2>/dev/null +grep -q -E '[[[:space:]]]$1[[[:space:]]]' $LINUX/$SYMVERFILE 2>/dev/null rc=$? if test $rc -ne 0; then export=0 diff --git a/build/autoconf/lustre-build.m4 b/build/autoconf/lustre-build.m4 index cf9ad646aa5946ab37e21851cba864ac1ab8ead3..5ccadc68716cc973e2c8563df327eae511420f3d 100644 --- a/build/autoconf/lustre-build.m4 +++ b/build/autoconf/lustre-build.m4 @@ -159,6 +159,46 @@ esac AC_CONFIG_SUBDIRS(libsysio) ]) +# +# LB_PATH_LUSTREIOKIT +# +# Handle internal/external lustre-iokit +# +AC_DEFUN([LB_PATH_LUSTREIOKIT], +[AC_ARG_WITH([lustre-iokit], + AC_HELP_STRING([--with-lustre-iokit=path], + [set path to lustre-iokit source (default is included lustre-iokit)]), + [],[ + with_lustre_iokit='yes' + ]) +AC_MSG_CHECKING([location of lustre-iokit]) +enable_lustre_iokit="$with_lustre_iokit" +case x$with_lustre_iokit in + xyes) + AC_MSG_RESULT([internal]) + LB_CHECK_FILE([$srcdir/lustre-iokit/ior-survey/ior-survey],[],[ + AC_MSG_ERROR([A complete internal lustre-iokit was not found.]) + ]) + LUSTREIOKIT_SUBDIR="lustre-iokit" + LUSTREIOKIT="$PWD/lustre-iokit" + ;; + xno) + AC_MSG_RESULT([disabled]) + ;; + *) + 
AC_MSG_RESULT([$with_lustre_iokit]) + LB_CHECK_FILE([$with_lustre_iokit/ior-survey/ior-survey],[],[ + AC_MSG_ERROR([A complete (built) external lustre-iokit was not found.]) + ]) + LUSTREIOKIT="$with_lustre_iokit" + with_lustre_iokit="yes" + ;; +esac +AC_SUBST(LUSTREIOKIT_SUBDIR) +# We have to configure even if we don't build here for make dist to work +AC_CONFIG_SUBDIRS(lustre-iokit) +]) + # # LB_PATH_LDISKFS # @@ -204,6 +244,23 @@ AM_CONDITIONAL(LDISKFS_ENABLED, test x$with_ldiskfs != xno) AC_CONFIG_SUBDIRS(ldiskfs) ]) +# Define no libcfs by default. +AC_DEFUN([LB_LIBCFS_DIR], +[ +case x$libcfs_is_module in + xyes) + LIBCFS_INCLUDE_DIR="libcfs/include" + LIBCFS_SUBDIR="libcfs" + ;; + x*) + LIBCFS_INCLUDE_DIR="lnet/include" + LIBCFS_SUBDIR="" + ;; +esac +AC_SUBST(LIBCFS_SUBDIR) +AC_SUBST(LIBCFS_INCLUDE_DIR) +]) + # # LB_DEFINE_LDISKFS_OPTIONS # @@ -300,26 +357,17 @@ AM_CONDITIONAL(POSIX_OSD_ENABLED, test x$posix_osd = xyes) # # LB_PATH_DMU -# Support for --with-dmu # AC_DEFUN([LB_PATH_DMU], [AC_MSG_CHECKING([whether to enable DMU]) -AC_ARG_WITH([dmu], - AC_HELP_STRING([--with-dmu=path], - [set path to a DMU tree (default is included zfs-lustre)]), - [ - DMU_SRC=$with_dmu - ], - [ - DMU_SRC="$PWD/zfs-lustre" - ]) if test x$enable_uoss = xyes -a x$enable_posix_osd != xyes; then + DMU_SRC="$PWD/lustre/zfs-lustre" AC_DEFINE(DMU_OSD, 1, Enable DMU OSD) AC_MSG_RESULT([yes]) LB_CHECK_FILE([$DMU_SRC/src/.patched],[],[ AC_MSG_ERROR([A complete (patched) DMU tree was not found.]) ]) - AC_CONFIG_SUBDIRS(zfs-lustre) + AC_CONFIG_SUBDIRS(lustre/zfs-lustre) dmu_osd='yes' else AC_MSG_RESULT([no]) @@ -362,11 +410,13 @@ if test x$enable_modules = xyes ; then case $target_os in linux*) LB_PROG_LINUX + LIBCFS_PROG_LINUX LN_PROG_LINUX LC_PROG_LINUX ;; darwin*) LB_PROG_DARWIN + LIBCFS_PROG_DARWIN ;; *) # This is strange - Lustre supports a target we don't @@ -492,6 +542,7 @@ AC_SUBST(sysconfdir) docdir='${datadir}/doc/$(PACKAGE)' AC_SUBST(docdir) +LIBCFS_PATH_DEFAULTS LN_PATH_DEFAULTS 
LC_PATH_DEFAULTS @@ -542,33 +593,17 @@ if test $ac_cv_sizeof_unsigned_long_long != 8 ; then AC_MSG_ERROR([** we assume that sizeof(long long) == 8. Tell phil@clusterfs.com]) fi -# FIXME -AC_CHECK_DECL([__i386__], [], [ - -if test x$enable_bgl != xyes; then -AC_MSG_CHECKING([if $CC accepts -m64]) -CC_save="$CC" -CC="$CC -m64" -AC_TRY_COMPILE([],[],[ - AC_MSG_RESULT([yes]) -],[ - AC_MSG_RESULT([no]) - CC="$CC_save" -]) -fi - -]) - -CPPFLAGS="-I\$(top_builddir)/lnet/include -I\$(top_srcdir)/lnet/include -I\$(top_builddir)/lustre/include -I\$(top_srcdir)/lustre/include $CPPFLAGS" +CPPFLAGS="-I\$(top_builddir)/$LIBCFS_INCLUDE_DIR -I\$(top_srcdir)/$LIBCFS_INCLUDE_DIR -I\$(top_builddir)/lnet/include -I\$(top_srcdir)/lnet/include -I\$(top_builddir)/lustre/include -I\$(top_srcdir)/lustre/include $CPPFLAGS" LLCPPFLAGS="-D__arch_lib__ -D_LARGEFILE64_SOURCE=1" AC_SUBST(LLCPPFLAGS) -LLCFLAGS="-g -Wall -fPIC" +# Add _GNU_SOURCE for strnlen on linux +LLCFLAGS="-g -Wall -fPIC -D_GNU_SOURCE" AC_SUBST(LLCFLAGS) # everyone builds against lnet and lustre -EXTRA_KCFLAGS="$EXTRA_KCFLAGS -g -I$PWD/lnet/include -I$PWD/lustre/include" +EXTRA_KCFLAGS="$EXTRA_KCFLAGS -g -I$PWD/$LIBCFS_INCLUDE_DIR -I$PWD/lnet/include -I$PWD/lustre/include" AC_SUBST(EXTRA_KCFLAGS) ]) @@ -602,6 +637,7 @@ AC_SUBST(SYSIO) LB_LINUX_CONDITIONALS LB_DARWIN_CONDITIONALS +LIBCFS_CONDITIONALS LN_CONDITIONALS LC_CONDITIONALS ]) @@ -630,6 +666,8 @@ AC_PACKAGE_TARNAME[.spec] AC_DEFUN([LB_CONFIGURE], [LB_CANONICAL_SYSTEM +LB_LIBCFS_DIR + LB_INCLUDE_RULES LB_CONFIG_CRAY_XT3 @@ -657,8 +695,10 @@ LB_CONFIG_MODULES LB_PATH_LIBSYSIO LB_PATH_SNMP LB_PATH_LDISKFS +LB_PATH_LUSTREIOKIT LC_CONFIG_LIBLUSTRE +LIBCFS_CONFIGURE LN_CONFIGURE LC_CONFIGURE @@ -667,9 +707,11 @@ if test "$SNMP_DIST_SUBDIR" ; then LS_CONFIGURE fi + LB_CONDITIONALS LB_CONFIG_HEADERS +LIBCFS_CONFIG_FILES LB_CONFIG_FILES LN_CONFIG_FILES LC_CONFIG_FILES diff --git a/build/autogen.sh b/build/autogen.sh index 
1892ed130ce5eea454d538ff1730840ced0747e8..5d77897f135fb43580f9eeb21046f6d9d8b61ad9 100644 --- a/build/autogen.sh +++ b/build/autogen.sh @@ -83,9 +83,9 @@ if [ -d kernel_patches ] ; then REQUIRED_DIRS="build" CONFIGURE_DIRS="" else - REQUIRED_DIRS="build lnet lustre" + REQUIRED_DIRS="build libcfs lnet lustre" OPTIONAL_DIRS="snmp portals" - CONFIGURE_DIRS="libsysio ldiskfs" + CONFIGURE_DIRS="libsysio lustre-iokit ldiskfs" fi for dir in $REQUIRED_DIRS ; do diff --git a/build/buildcvs b/build/buildcvs index 5053089534fab77553a861a929a02c12ed1fa892..60b22bbb8ff5a7ee497b7271b9ad89ed44bfe3f6 100644 --- a/build/buildcvs +++ b/build/buildcvs @@ -1,10 +1,13 @@ # This file is sourced by lustre/lustrecvs portalstag="" -lnettag="HEAD" +lnettag="b1_x" libsysiotag="HEAD" snmptag="HEAD" ldiskfstag="HEAD" +ldiskfstag_head="b1_8_iam" +lustreiokittag="HEAD" +libcfstag="b1_x" dmutag="" hg_base_url="http://www.wizy.org/mercurial" @@ -24,7 +27,7 @@ case "$lustretag" in # this is the branch table # keep this list sorted alphabetically! - # Note these are "specials" -- branches using lnet HEAD don't need an + # Note these are "specials" -- branches using lnet b1_x don't need an # entry here. b1_2) @@ -54,12 +57,6 @@ case "$lustretag" in portalstag="b_hd_newconfig" ;; - b1_8) - # b1_8 is an alias for HEAD. 
- lustretag="HEAD" - ldiskfstag="b1_8_iam" - ;; - b_cmd*) portalstag="$lustretag" ;; @@ -76,12 +73,12 @@ case "$lustretag" in b_mpilnd) # lnet mpilnd development branch - lnettag="b_mpilnd" - lustretag="HEAD" - ;; + lnettag="b_mpilnd" + lustretag="HEAD" + ;; b_new_cmd) - portalstag="b_new_portals" + portalstag="b_new_portals" # lnettag="b_lnet_tmp" ;; @@ -150,15 +147,15 @@ case "$lustretag" in b_self_test) # lnet self test development branch - lnettag="b_self_test" - lustretag="HEAD" - ;; + lnettag="b_self_test" + lustretag="HEAD" + ;; b_usocklnd) # lnet usocklnd development branch - lnettag="b_usocklnd" - lustretag="b1_6_usocklnd" - ;; + lnettag="b_usocklnd" + lustretag="b1_6_usocklnd" + ;; b_uo2iblnd) # lnet u-o2iblnd development branch @@ -168,45 +165,54 @@ case "$lustretag" in b_ula) # lnet "User Level Access" development branch - lnettag="b_ula" - lustretag="HEAD" - ldiskfstag="b1_8_iam" - ;; + lnettag="b_ula" + lustretag="HEAD" + ldiskfstag="$ldiskfstag_head" + ;; # CMD3 - HEAD|b_post_cmd3|b_HEAD_AT|b_new_cmd_sles10|b1_6_head_sync|b1_8_dir_ra|b_mixed_layout_req|b_mount_perm|b1_8_gns|b1_8_quota|b1_8_interop_server|b_som) + b_post_cmd3|b_new_cmd_sles10|b1_6_head_sync|b_mixed_layout_req|b_mount_perm|b1_8_gns|b1_8_quota|b1_8_interop_server|b_som) # Update b1_8 above when changing this. 
- ldiskfstag="b1_8_iam" + ldiskfstag="$ldiskfstag_head" ;; #umds cleanup b_hd_umds_cln2) ldiskfstag="b1_8_iam_dynlock" - ;; + ;; # uOSS b_hd_dmu) lnettag="b_uoss" - ldiskfstag="b1_8_iam" + ldiskfstag="$ldiskfstag_head" + dmutag="zfs-lustre" + ;; + + # uMDS + b_dmu_umds) + lnettag="b_uoss_umds" + ldiskfstag="$ldiskfstag_head" dmutag="zfs-lustre" ;; # uOSS o2iblnd b_uoss_o2iblnd) lnettag="b_uoss_o2iblnd" - lustretag="b_hd_dmu" - ldiskfstag="b1_8_iam" + lustretag="b_hd_dmu" + ldiskfstag="$ldiskfstag_head" dmutag="zfs-lustre" ;; # client io stack cleanup b_client_io_layering) - ldiskfstag="b1_8_iam" + lnettag="HEAD" + ldiskfstag="$ldiskfstag_head" + libcfstag="HEAD" ;; # windows client porting (lustre: b_client_io_layering, lnet: HEAD) b_winnt_port) - ldiskfstag="b1_8_iam" + ldiskfstag="$ldiskfstag_head" lnettag="$lustretag" ;; @@ -234,6 +240,26 @@ case "$lustretag" in snmptag="$lustretag" ;; + b1_8_gate) + lnettag="b1_x_lnet_gate" + ldiskfstag="b_ldiskfs_gate" + ;; + + # Branches that have been updated to include + # the libcfs split should be added here + # b_head_interop_disk: Interoperability server side changes + HEAD|b_head_interop_disk) + lnettag="HEAD" + ldiskfstag="$ldiskfstag_head" + libcfstag="HEAD" + ;; + + b_head_libcfs) + lnettag="b_head_libcfs" + ldiskfstag="$ldiskfstag_head" + libcfstag="b_head_libcfs" + ;; + # all later v* tags v[1-9]*) lnettag="$lustretag" @@ -241,6 +267,11 @@ case "$lustretag" in snmptag="$lustretag" ldiskfstag="$lustretag" ;; + + b_HEAD_*|b_head_*|b_hd_*) + lnettag="b1_x" + ldiskfstag="$ldiskfstag_head" + ;; esac cvs_cmd libsysio libsysio "$libsysiotag" @@ -249,6 +280,8 @@ cvs_cmd lnet lnet "$lnettag" cvs_cmd snmp lustre-snmp "$snmptag" cvs_cmd lustre lustre-core "$lustretag" cvs_cmd ldiskfs ldiskfs "$ldiskfstag" -hg_cmd zfs-lustre "$hg_base_url" "$dmutag" +cvs_cmd lustre-iokit lustre-iokit "$lustreiokittag" +hg_cmd lustre/zfs-lustre "$hg_base_url" "$dmutag" +cvs_cmd libcfs libcfs "$libcfstag" [ -a ldiskfs/build ] || ln -sf ../build 
ldiskfs/build diff --git a/build/cvsdiffclient b/build/cvsdiffclient index 66cd6b6e1fb732dcc489920950aa4a76039925f0..ca09609476dcbc7d4b3c1d05c9cc584c6aae9f32 100644 --- a/build/cvsdiffclient +++ b/build/cvsdiffclient @@ -21,6 +21,12 @@ fi FILES=$($CVS_MODIFIED_FILES_PL $1) TMP=`mktemp /tmp/cvslog-XXXXXXXX` if [ -f $TMP ]; then + cat - >> $TMP <<- EOH + CVS: did you test your fix properly (acc-sm.sh, or as needed)? + CVS: did you update the ChangeLog for a bug fix? + CVS: did you verify/update affected user documentation? + CVS: Remove "CVS:" from lines below to include in commit message + EOH [ -f .mergeinfo ] && . .mergeinfo [ -z "$PARENT" -a -f lustre/.mergeinfo ] && . lustre/.mergeinfo if [ "$PARENT" ]; then @@ -33,15 +39,15 @@ if [ -f $TMP ]; then [ "$TAG" ] && BRANCH="`sed 's/^T//' $TAG`" || BRANCH="HEAD" echo "CVS: Branch $BRANCH" >> $TMP fi -cat - >> $TMP <<- EOF - CVS: Remove "CVS:" from start of lines that should be in commit message - CVS: did you update the ChangeLog for a bug fix? - CVS: did you update the hours spent in Bugzilla? - CVS: did you verify/update the HLD/DLD in CVS? - CVS: b=<bug> - CVS: i=<inspected_by> - CVS: i=<inspected_by> -EOF + # We can't just put these into the commit template without the leading + # "CVS: ", otherwise exiting the comment editor would still leave a valid + # comment in the file and the commit will still be done. We need to + # make a file without valid comments to allow the commit to be aborted. 
+ cat - >> $TMP <<- EOB + CVS: b=<bug> + CVS: i=<inspected_by> + CVS: i=<inspected_by> + EOB cat $1 >> $TMP cp $TMP $1 diff --git a/build/doxyfile.api b/build/doxyfile.api new file mode 100644 index 0000000000000000000000000000000000000000..3739a642607633ba9d859edf9d6c156504f43a6d --- /dev/null +++ b/build/doxyfile.api @@ -0,0 +1,243 @@ +# Doxyfile 1.5.3 + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- +DOXYFILE_ENCODING = UTF-8 +PROJECT_NAME = Lustre +PROJECT_NUMBER = 1.10 +OUTPUT_DIRECTORY = apidoc.api +CREATE_SUBDIRS = NO +OUTPUT_LANGUAGE = English +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ABBREVIATE_BRIEF = "The $name class" \ + "The $name widget" \ + "The $name file" \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = YES +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = YES +QT_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +DETAILS_AT_TOP = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 8 +ALIASES = +OPTIMIZE_OUTPUT_FOR_C = YES +OPTIMIZE_OUTPUT_JAVA = NO +BUILTIN_STL_SUPPORT = NO +CPP_CLI_SUPPORT = NO +DISTRIBUTE_GROUP_DOC = NO +SUBGROUPING = YES +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- +EXTRACT_ALL = NO +EXTRACT_PRIVATE = NO +EXTRACT_STATIC = NO +EXTRACT_LOCAL_CLASSES = NO +EXTRACT_LOCAL_METHODS = NO +EXTRACT_ANON_NSPACES = NO +HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_CLASSES = NO +HIDE_FRIEND_COMPOUNDS = NO +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +SHOW_INCLUDE_FILES = YES +INLINE_INFO = YES +SORT_MEMBER_DOCS = YES +SORT_BRIEF_DOCS = NO +SORT_BY_SCOPE_NAME = NO 
+GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = YES +SHOW_DIRECTORIES = NO +FILE_VERSION_FILTER = +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- +QUIET = NO +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = NO +WARN_FORMAT = "$file:$line: $text" +WARN_LOGFILE = +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- +INPUT = lustre/ \ + lnet +INPUT_ENCODING = UTF-8 +FILE_PATTERNS = *.h \ + *.c +RECURSIVE = YES +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXCLUDE_SYMBOLS = EXPORT_SYMBOL* +EXAMPLE_PATH = +EXAMPLE_PATTERNS = * +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- +SOURCE_BROWSER = YES +INLINE_SOURCES = YES +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = YES +REFERENCES_RELATION = YES +REFERENCES_LINK_SOURCE = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 5 +IGNORE_PREFIX = +#--------------------------------------------------------------------------- +# configuration options related to the HTML output 
+#--------------------------------------------------------------------------- +GENERATE_HTML = YES +HTML_OUTPUT = html +HTML_FILE_EXTENSION = .html +HTML_HEADER = build/doxygen-header.html +HTML_FOOTER = build/doxygen-footer.html +HTML_STYLESHEET = build/doxygen-stylesheet.css +HTML_ALIGN_MEMBERS = YES +GENERATE_HTMLHELP = NO +HTML_DYNAMIC_SECTIONS = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +BINARY_TOC = NO +TOC_EXPAND = NO +DISABLE_INDEX = NO +ENUM_VALUES_PER_LINE = 4 +GENERATE_TREEVIEW = YES +TREEVIEW_WIDTH = 250 +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- +GENERATE_LATEX = NO +LATEX_OUTPUT = latex +LATEX_CMD_NAME = latex +MAKEINDEX_CMD_NAME = makeindex +COMPACT_LATEX = NO +PAPER_TYPE = a4wide +EXTRA_PACKAGES = +LATEX_HEADER = +PDF_HYPERLINKS = YES +USE_PDFLATEX = YES +LATEX_BATCHMODE = NO +LATEX_HIDE_INDICES = NO +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- +GENERATE_RTF = NO +RTF_OUTPUT = rtf +COMPACT_RTF = NO +RTF_HYPERLINKS = YES +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- +GENERATE_MAN = NO +MAN_OUTPUT = man +MAN_EXTENSION = .3 +MAN_LINKS = NO +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- +GENERATE_XML = NO +XML_OUTPUT = xml +XML_SCHEMA = +XML_DTD = +XML_PROGRAMLISTING = YES +#--------------------------------------------------------------------------- +# 
configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- +GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- +GENERATE_PERLMOD = NO +PERLMOD_LATEX = NO +PERLMOD_PRETTY = YES +PERLMOD_MAKEVAR_PREFIX = +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = NO +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- +TAGFILES = +GENERATE_TAGFILE = +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +PERL_PATH = /usr/bin/perl +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- +CLASS_DIAGRAMS = YES +MSCGEN_PATH = +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = YES +CLASS_GRAPH = YES +COLLABORATION_GRAPH = YES +GROUP_GRAPHS = YES +UML_LOOK = YES +TEMPLATE_RELATIONS = NO +INCLUDE_GRAPH = YES +INCLUDED_BY_GRAPH = YES +CALL_GRAPH = YES +CALLER_GRAPH = YES +GRAPHICAL_HIERARCHY = YES +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +DOT_PATH = +DOTFILE_DIRS = +DOT_GRAPH_MAX_NODES = 50 +MAX_DOT_GRAPH_DEPTH = 1000 +DOT_TRANSPARENT = NO +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES +#--------------------------------------------------------------------------- +# 
Configuration::additions related to the search engine +#--------------------------------------------------------------------------- +SEARCHENGINE = YES diff --git a/build/doxyfile.ref b/build/doxyfile.ref new file mode 100644 index 0000000000000000000000000000000000000000..26c194c5b10e5ab09c11f0680bebdaf4a47152a7 --- /dev/null +++ b/build/doxyfile.ref @@ -0,0 +1,243 @@ +# Doxyfile 1.5.3 + +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- +DOXYFILE_ENCODING = UTF-8 +PROJECT_NAME = Lustre +PROJECT_NUMBER = 1.10 +OUTPUT_DIRECTORY = apidoc.ref +CREATE_SUBDIRS = NO +OUTPUT_LANGUAGE = English +BRIEF_MEMBER_DESC = YES +REPEAT_BRIEF = YES +ABBREVIATE_BRIEF = "The $name class " \ + "The $name widget " \ + "The $name file " \ + is \ + provides \ + specifies \ + contains \ + represents \ + a \ + an \ + the +ALWAYS_DETAILED_SEC = NO +INLINE_INHERITED_MEMB = NO +FULL_PATH_NAMES = YES +STRIP_FROM_PATH = +STRIP_FROM_INC_PATH = +SHORT_NAMES = NO +JAVADOC_AUTOBRIEF = YES +QT_AUTOBRIEF = NO +MULTILINE_CPP_IS_BRIEF = NO +DETAILS_AT_TOP = NO +INHERIT_DOCS = YES +SEPARATE_MEMBER_PAGES = NO +TAB_SIZE = 8 +ALIASES = +OPTIMIZE_OUTPUT_FOR_C = YES +OPTIMIZE_OUTPUT_JAVA = NO +BUILTIN_STL_SUPPORT = NO +CPP_CLI_SUPPORT = NO +DISTRIBUTE_GROUP_DOC = NO +SUBGROUPING = YES +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- +EXTRACT_ALL = YES +EXTRACT_PRIVATE = YES +EXTRACT_STATIC = YES +EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_METHODS = YES +EXTRACT_ANON_NSPACES = NO +HIDE_UNDOC_MEMBERS = NO +HIDE_UNDOC_CLASSES = NO +HIDE_FRIEND_COMPOUNDS = NO +HIDE_IN_BODY_DOCS = NO +INTERNAL_DOCS = NO +CASE_SENSE_NAMES = YES +HIDE_SCOPE_NAMES = NO +SHOW_INCLUDE_FILES = YES +INLINE_INFO = YES +SORT_MEMBER_DOCS = YES 
+SORT_BRIEF_DOCS = NO +SORT_BY_SCOPE_NAME = NO +GENERATE_TODOLIST = YES +GENERATE_TESTLIST = YES +GENERATE_BUGLIST = YES +GENERATE_DEPRECATEDLIST= YES +ENABLED_SECTIONS = +MAX_INITIALIZER_LINES = 30 +SHOW_USED_FILES = YES +SHOW_DIRECTORIES = NO +FILE_VERSION_FILTER = +#--------------------------------------------------------------------------- +# configuration options related to warning and progress messages +#--------------------------------------------------------------------------- +QUIET = NO +WARNINGS = YES +WARN_IF_UNDOCUMENTED = YES +WARN_IF_DOC_ERROR = YES +WARN_NO_PARAMDOC = NO +WARN_FORMAT = "$file:$line: $text " +WARN_LOGFILE = +#--------------------------------------------------------------------------- +# configuration options related to the input files +#--------------------------------------------------------------------------- +INPUT = lustre/ \ + lnet +INPUT_ENCODING = UTF-8 +FILE_PATTERNS = *.h \ + *.c +RECURSIVE = YES +EXCLUDE = +EXCLUDE_SYMLINKS = NO +EXCLUDE_PATTERNS = +EXCLUDE_SYMBOLS = EXPORT_SYMBOL* +EXAMPLE_PATH = +EXAMPLE_PATTERNS = * +EXAMPLE_RECURSIVE = NO +IMAGE_PATH = +INPUT_FILTER = +FILTER_PATTERNS = +FILTER_SOURCE_FILES = NO +#--------------------------------------------------------------------------- +# configuration options related to source browsing +#--------------------------------------------------------------------------- +SOURCE_BROWSER = YES +INLINE_SOURCES = NO +STRIP_CODE_COMMENTS = YES +REFERENCED_BY_RELATION = YES +REFERENCES_RELATION = YES +REFERENCES_LINK_SOURCE = YES +USE_HTAGS = NO +VERBATIM_HEADERS = YES +#--------------------------------------------------------------------------- +# configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- +ALPHABETICAL_INDEX = YES +COLS_IN_ALPHA_INDEX = 5 +IGNORE_PREFIX = +#--------------------------------------------------------------------------- +# configuration options related to the HTML output 
+#--------------------------------------------------------------------------- +GENERATE_HTML = YES +HTML_OUTPUT = html +HTML_FILE_EXTENSION = .html +HTML_HEADER = build/doxygen-header.html +HTML_FOOTER = build/doxygen-footer.html +HTML_STYLESHEET = build/doxygen-stylesheet.css +HTML_ALIGN_MEMBERS = YES +GENERATE_HTMLHELP = NO +HTML_DYNAMIC_SECTIONS = NO +CHM_FILE = +HHC_LOCATION = +GENERATE_CHI = NO +BINARY_TOC = NO +TOC_EXPAND = NO +DISABLE_INDEX = NO +ENUM_VALUES_PER_LINE = 4 +GENERATE_TREEVIEW = YES +TREEVIEW_WIDTH = 250 +#--------------------------------------------------------------------------- +# configuration options related to the LaTeX output +#--------------------------------------------------------------------------- +GENERATE_LATEX = NO +LATEX_OUTPUT = latex +LATEX_CMD_NAME = latex +MAKEINDEX_CMD_NAME = makeindex +COMPACT_LATEX = NO +PAPER_TYPE = a4wide +EXTRA_PACKAGES = +LATEX_HEADER = +PDF_HYPERLINKS = YES +USE_PDFLATEX = YES +LATEX_BATCHMODE = NO +LATEX_HIDE_INDICES = NO +#--------------------------------------------------------------------------- +# configuration options related to the RTF output +#--------------------------------------------------------------------------- +GENERATE_RTF = NO +RTF_OUTPUT = rtf +COMPACT_RTF = NO +RTF_HYPERLINKS = YES +RTF_STYLESHEET_FILE = +RTF_EXTENSIONS_FILE = +#--------------------------------------------------------------------------- +# configuration options related to the man page output +#--------------------------------------------------------------------------- +GENERATE_MAN = NO +MAN_OUTPUT = man +MAN_EXTENSION = .3 +MAN_LINKS = NO +#--------------------------------------------------------------------------- +# configuration options related to the XML output +#--------------------------------------------------------------------------- +GENERATE_XML = NO +XML_OUTPUT = xml +XML_SCHEMA = +XML_DTD = +XML_PROGRAMLISTING = YES +#--------------------------------------------------------------------------- +# 
configuration options for the AutoGen Definitions output +#--------------------------------------------------------------------------- +GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- +GENERATE_PERLMOD = NO +PERLMOD_LATEX = NO +PERLMOD_PRETTY = YES +PERLMOD_MAKEVAR_PREFIX = +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- +ENABLE_PREPROCESSING = YES +MACRO_EXPANSION = NO +EXPAND_ONLY_PREDEF = NO +SEARCH_INCLUDES = YES +INCLUDE_PATH = +INCLUDE_FILE_PATTERNS = +PREDEFINED = +EXPAND_AS_DEFINED = +SKIP_FUNCTION_MACROS = YES +#--------------------------------------------------------------------------- +# Configuration::additions related to external references +#--------------------------------------------------------------------------- +TAGFILES = +GENERATE_TAGFILE = +ALLEXTERNALS = NO +EXTERNAL_GROUPS = YES +PERL_PATH = /usr/bin/perl +#--------------------------------------------------------------------------- +# Configuration options related to the dot tool +#--------------------------------------------------------------------------- +CLASS_DIAGRAMS = NO +MSCGEN_PATH = +HIDE_UNDOC_RELATIONS = YES +HAVE_DOT = YES +CLASS_GRAPH = YES +COLLABORATION_GRAPH = NO +GROUP_GRAPHS = NO +UML_LOOK = YES +TEMPLATE_RELATIONS = NO +INCLUDE_GRAPH = NO +INCLUDED_BY_GRAPH = NO +CALL_GRAPH = NO +CALLER_GRAPH = NO +GRAPHICAL_HIERARCHY = NO +DIRECTORY_GRAPH = YES +DOT_IMAGE_FORMAT = png +DOT_PATH = +DOTFILE_DIRS = +DOT_GRAPH_MAX_NODES = 50 +MAX_DOT_GRAPH_DEPTH = 1000 +DOT_TRANSPARENT = NO +DOT_MULTI_TARGETS = NO +GENERATE_LEGEND = YES +DOT_CLEANUP = YES +#--------------------------------------------------------------------------- +# 
Configuration::additions related to the search engine +#--------------------------------------------------------------------------- +SEARCHENGINE = YES diff --git a/build/doxygen-footer.html b/build/doxygen-footer.html new file mode 100644 index 0000000000000000000000000000000000000000..eee6b160d6f7e9391f56704d04b41ccc8510e727 --- /dev/null +++ b/build/doxygen-footer.html @@ -0,0 +1,5 @@ +<hr size="1"><address style="text-align: right;"><small> +Generated on $datetime for $projectname by <a href="http://www.doxygen.org/index.html">doxygen</a> $doxygenversion</small></address><br> +<small><a href="http://www.sun.com/contact">Contact</a> | <a href="http://www.sun.com/aboutsun/index.html">About Sun</a> | <a href="http://www.sun.com/aboutsun/media/index.html">News</a> | <a href="http://www.sun.com/corp_emp/">Employment</a> | <a href="http://www.sun.com/privacy/">Privacy</a> | <a href="http://www.sun.com/share/text/termsofuse.html">Terms of Use</a> | <a href="http://www.sun.com/suntrademarks/">Trademarks</a> | (C) 2008 Sun Microsystems, Inc.</small> +</body> +</html> diff --git a/build/doxygen-header.html b/build/doxygen-header.html new file mode 100644 index 0000000000000000000000000000000000000000..b0cd01394fe5834095c34218cac1165247ce151f --- /dev/null +++ b/build/doxygen-header.html @@ -0,0 +1,7 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"> +<html><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8"> +<title>$title</title> +<link href="$relpath$doxygen.css" rel="stylesheet" type="text/css"> +<link href="$relpath$tabs.css" rel="stylesheet" type="text/css"> +</head><body fgcolor="#202020"> +<img src="http://wiki.lustre.org/apidoc/header.jpg"> diff --git a/build/doxygen-stylesheet.css b/build/doxygen-stylesheet.css new file mode 100644 index 0000000000000000000000000000000000000000..f4d07f6f585d207a6f380b3db6900a97357e075f --- /dev/null +++ b/build/doxygen-stylesheet.css @@ -0,0 +1,360 @@ 
+BODY,H1,H2,H3,H4,H5,H6,P,CENTER,TD,TH,UL,DL,DIV { + font-family: Geneva, Arial, Helvetica, sans-serif; +} +BODY,TD { + font-size: 90%; +} +H1 { + text-align: center; + font-size: 160%; +} +H2 { + font-size: 120%; +} +H3 { + font-size: 100%; +} +CAPTION { font-weight: bold } +DIV.qindex { + width: 100%; + background-color: #e8eef2; + border: 1px solid #84b0c7; + text-align: center; + margin: 2px; + padding: 2px; + line-height: 140%; +} +DIV.nav { + width: 100%; + background-color: #e8eef2; + border: 1px solid #84b0c7; + text-align: center; + margin: 2px; + padding: 2px; + line-height: 140%; +} +DIV.navtab { + background-color: #e8eef2; + border: 1px solid #84b0c7; + text-align: center; + margin: 2px; + margin-right: 15px; + padding: 2px; +} +TD.navtab { + font-size: 70%; +} +A.qindex { + text-decoration: none; + font-weight: bold; + color: #1A419D; +} +A.qindex:visited { + text-decoration: none; + font-weight: bold; + color: #1A419D +} +A.qindex:hover { + text-decoration: none; + background-color: #ddddff; +} +A.qindexHL { + text-decoration: none; + font-weight: bold; + background-color: #6666cc; + color: #ffffff; + border: 1px double #9295C2; +} +A.qindexHL:hover { + text-decoration: none; + background-color: #6666cc; + color: #ffffff; +} +A.qindexHL:visited { text-decoration: none; background-color: #6666cc; color: #ffffff } +A.el { text-decoration: none; font-weight: bold } +A.elRef { font-weight: bold } +A.code:link { text-decoration: none; font-weight: normal; color: #0000FF} +A.code:visited { text-decoration: none; font-weight: normal; color: #0000FF} +A.codeRef:link { font-weight: normal; color: #0000FF} +A.codeRef:visited { font-weight: normal; color: #0000FF} +A:hover { text-decoration: none; background-color: #f2f2ff } +DL.el { margin-left: -1cm } +.fragment { + font-family: monospace, fixed; + font-size: 95%; +} +PRE.fragment { + border: 1px solid #CCCCCC; + background-color: #f5f5f5; + margin-top: 4px; + margin-bottom: 4px; + margin-left: 2px; + 
margin-right: 8px; + padding-left: 6px; + padding-right: 6px; + padding-top: 4px; + padding-bottom: 4px; +} +DIV.ah { background-color: black; font-weight: bold; color: #ffffff; margin-bottom: 3px; margin-top: 3px } + +DIV.groupHeader { + margin-left: 16px; + margin-top: 12px; + margin-bottom: 6px; + font-weight: bold; +} +DIV.groupText { margin-left: 16px; font-style: italic; font-size: 90% } +BODY { + background: white; + color: black; + margin-right: 20px; + margin-left: 20px; +} +TD.indexkey { + background-color: #e8eef2; + font-weight: bold; + padding-right : 10px; + padding-top : 2px; + padding-left : 10px; + padding-bottom : 2px; + margin-left : 0px; + margin-right : 0px; + margin-top : 2px; + margin-bottom : 2px; + border: 1px solid #CCCCCC; +} +TD.indexvalue { + background-color: #e8eef2; + font-style: italic; + padding-right : 10px; + padding-top : 2px; + padding-left : 10px; + padding-bottom : 2px; + margin-left : 0px; + margin-right : 0px; + margin-top : 2px; + margin-bottom : 2px; + border: 1px solid #CCCCCC; +} +TR.memlist { + background-color: #f0f0f0; +} +P.formulaDsp { text-align: center; } +IMG.formulaDsp { } +IMG.formulaInl { vertical-align: middle; } +SPAN.keyword { color: #008000 } +SPAN.keywordtype { color: #604020 } +SPAN.keywordflow { color: #e08000 } +SPAN.comment { color: #800000 } +SPAN.preprocessor { color: #806020 } +SPAN.stringliteral { color: #002080 } +SPAN.charliteral { color: #008080 } +.mdescLeft { + padding: 0px 8px 4px 8px; + font-size: 80%; + font-style: italic; + background-color: #FAFAFA; + border-top: 1px none #E0E0E0; + border-right: 1px none #E0E0E0; + border-bottom: 1px none #E0E0E0; + border-left: 1px none #E0E0E0; + margin: 0px; +} +.mdescRight { + padding: 0px 8px 4px 8px; + font-size: 80%; + font-style: italic; + background-color: #FAFAFA; + border-top: 1px none #E0E0E0; + border-right: 1px none #E0E0E0; + border-bottom: 1px none #E0E0E0; + border-left: 1px none #E0E0E0; + margin: 0px; +} +.memItemLeft { + padding: 
1px 0px 0px 8px; + margin: 4px; + border-top-width: 1px; + border-right-width: 1px; + border-bottom-width: 1px; + border-left-width: 1px; + border-top-color: #E0E0E0; + border-right-color: #E0E0E0; + border-bottom-color: #E0E0E0; + border-left-color: #E0E0E0; + border-top-style: solid; + border-right-style: none; + border-bottom-style: none; + border-left-style: none; + background-color: #FAFAFA; + font-size: 80%; +} +.memItemRight { + padding: 1px 8px 0px 8px; + margin: 4px; + border-top-width: 1px; + border-right-width: 1px; + border-bottom-width: 1px; + border-left-width: 1px; + border-top-color: #E0E0E0; + border-right-color: #E0E0E0; + border-bottom-color: #E0E0E0; + border-left-color: #E0E0E0; + border-top-style: solid; + border-right-style: none; + border-bottom-style: none; + border-left-style: none; + background-color: #FAFAFA; + font-size: 80%; +} +.memTemplItemLeft { + padding: 1px 0px 0px 8px; + margin: 4px; + border-top-width: 1px; + border-right-width: 1px; + border-bottom-width: 1px; + border-left-width: 1px; + border-top-color: #E0E0E0; + border-right-color: #E0E0E0; + border-bottom-color: #E0E0E0; + border-left-color: #E0E0E0; + border-top-style: none; + border-right-style: none; + border-bottom-style: none; + border-left-style: none; + background-color: #FAFAFA; + font-size: 80%; +} +.memTemplItemRight { + padding: 1px 8px 0px 8px; + margin: 4px; + border-top-width: 1px; + border-right-width: 1px; + border-bottom-width: 1px; + border-left-width: 1px; + border-top-color: #E0E0E0; + border-right-color: #E0E0E0; + border-bottom-color: #E0E0E0; + border-left-color: #E0E0E0; + border-top-style: none; + border-right-style: none; + border-bottom-style: none; + border-left-style: none; + background-color: #FAFAFA; + font-size: 80%; +} +.memTemplParams { + padding: 1px 0px 0px 8px; + margin: 4px; + border-top-width: 1px; + border-right-width: 1px; + border-bottom-width: 1px; + border-left-width: 1px; + border-top-color: #E0E0E0; + border-right-color: 
#E0E0E0; + border-bottom-color: #E0E0E0; + border-left-color: #E0E0E0; + border-top-style: solid; + border-right-style: none; + border-bottom-style: none; + border-left-style: none; + color: #606060; + background-color: #FAFAFA; + font-size: 80%; +} +.search { color: #003399; + font-weight: bold; +} +FORM.search { + margin-bottom: 0px; + margin-top: 0px; +} +INPUT.search { font-size: 75%; + color: #000080; + font-weight: normal; + background-color: #e8eef2; +} +TD.tiny { font-size: 75%; +} +a { + color: #1A41A8; +} +a:visited { + color: #2A3798; +} +.dirtab { padding: 4px; + border-collapse: collapse; + border: 1px solid #84b0c7; +} +TH.dirtab { background: #e8eef2; + font-weight: bold; +} +HR { height: 1px; + border: none; + border-top: 1px solid black; +} + +/* Style for detailed member documentation */ +.memtemplate { + font-size: 80%; + color: #606060; + font-weight: normal; +} +.memnav { + background-color: #e8eef2; + border: 1px solid #84b0c7; + text-align: center; + margin: 2px; + margin-right: 15px; + padding: 2px; +} +.memitem { + padding: 4px; + background-color: #eef3f5; + border-width: 1px; + border-style: solid; + border-color: #dedeee; + -moz-border-radius: 8px 8px 8px 8px; +} +.memname { + white-space: nowrap; + font-weight: bold; +} +.memdoc{ + padding-left: 10px; +} +.memproto { + background-color: #d5e1e8; + width: 100%; + border-width: 1px; + border-style: solid; + border-color: #84b0c7; + font-weight: bold; + -moz-border-radius: 8px 8px 8px 8px; +} +.paramkey { + text-align: right; +} +.paramtype { + white-space: nowrap; +} +.paramname { + color: #602020; + font-style: italic; + white-space: nowrap; +} +/* End Styling for detailed member documentation */ + +/* for the tree view */ +.ftvtree { + font-family: sans-serif; + margin:0.5em; +} +.directory { font-size: 9pt; font-weight: bold; } +.directory h3 { margin: 0px; margin-top: 1em; font-size: 11pt; } +.directory > h3 { margin-top: 0; } +.directory p { margin: 0px; white-space: nowrap; } 
+.directory div { display: none; margin: 0px; } +.directory img { vertical-align: -30%; } + +.footerlinks a{border-right:1px solid #A3B8CB;padding-right:5px;padding-left:2px;text-decoration:none !important} \ No newline at end of file diff --git a/build/lbuild b/build/lbuild index 5c35f107a3f2ac9d9b7269bedf43e50ded9fd68d..354a0226e193a1663763083c585af91ef818014e 100755 --- a/build/lbuild +++ b/build/lbuild @@ -382,6 +382,10 @@ download_and_build_tarball() { -O "$KERNELDIR/$srpm" ; then fatal 1 "Could not download target $kernel_file's kernel SRPM $srpm from downloads.lustre.org." fi + [ -s "$KERNELDIR/$srpm" ] || { + rm -rf $KERNELDIR/$srpm + fatal 1 "Could not download target $kernel_file's kernel SRPM $srpm from downloads.lustre.org." + } build_tarball $target $srpm } @@ -420,10 +424,15 @@ load_target() build_tarball $CANONICAL_TARGET $KERNEL_SRPM else if (( $DOWNLOAD )) ; then - echo "Downloading http://downloads.lustre.org/public/kernels/$CANONICAL_TARGET/old/$KERNEL..." - if ! wget -nv "http://downloads.lustre.org/public/kernels/$CANONICAL_TARGET/old/$KERNEL" -O "$KERNELDIR/$KERNEL" ; then + echo "Downloading http://downloads.lustre.org/public/kernels/$DISTRO/old/$KERNEL..." + if ! wget -nv "http://downloads.lustre.org/public/kernels/$DISTRO/old/$KERNEL" -O "$KERNELDIR/$KERNEL" ; then # see if we can do it with an SRPM from the download site download_and_build_tarball $CANONICAL_TARGET $KERNEL_FILE + else + [ -s "$KERNELDIR/$KERNEL" ] || { + rm -rf "$KERNELDIR/$KERNEL" + fatal 1 "Target $TARGET's kernel $KERNEL not found in directory $KERNELDIR." + } fi else fatal 1 "Target $TARGET's kernel file $KERNEL not found in kernel directory $KERNELDIR." 
@@ -455,7 +464,7 @@ load_target() fi if [ -f $TOPDIR/lustre/lustre/kernel_patches/kernel_configs/kernel-$lnxmaj-$TARGET-$TARGET_ARCH.config ]; then - CONFIG_FILE="$TOPDIR/lustre/lustre/kernel_patches/kernel_configs/kernel-$lnxmaj-$TARGET-$TARGET_ARCH.config" + CONFIG_FILE="$TOPDIR/lustre/lustre/kernel_patches/kernel_configs/kernel-$lnxmaj-$TARGET-$TARGET_ARCH.config" fi local smptype for smptype in $SMPTYPES; do @@ -466,7 +475,7 @@ load_target() fi done - local lnxrelnew=$( echo ${lnxrel} | sed s/-/_/g ) + local lnxrelnew=${lnxrel//-/_} [ -f "$CONFIG_FILE" ] || \ fatal 1 "Config file for target $TARGET missing from $TOPDIR/lustre/lustre/kernel_patches/kernel_configs/." @@ -916,13 +925,8 @@ gen_lustre_version() [ "$KERNCONFSMPTYPE" = "" ] || smptype=$KERNCONFSMPTYPE [ "$RPMSMPTYPE" = "" ] || smptype=$RPMSMPTYPE - LUSTRE_EXTRA_VERSION="${lnxmaj}-${EXTRA_VERSION}" - if [ "$PATCHLESS" = "true" -a "$DISTRO" = "sles10" ]; then - LUSTRE_EXTRA_VERSION="${LUSTRE_EXTRA_VERSION}-${smptype}" - else - LUSTRE_EXTRA_VERSION="${LUSTRE_EXTRA_VERSION}${smptype}" - fi - LUSTRE_EXTRA_VERSION=$( echo $LUSTRE_EXTRA_VERSION | sed -e "s^-^_^g" ) + LUSTRE_EXTRA_VERSION="${lnxmaj}${EXTRA_VERSION_DELIMITER}${EXTRA_VERSION}${TARGET_DELIMITER}${smptype}" + LUSTRE_EXTRA_VERSION=${LUSTRE_EXTRA_VERSION//-/_} } #store RPMs and/or BUILD dir for future reuse @@ -936,36 +940,40 @@ store_for_reuse() return 255 fi - local lnxrelnew=$( echo ${lnxrel} | sed s/-/_/g ) - local EXTRA_VERSIONnew=$( echo ${EXTRA_VERSION} | sed s/-/_/g ) - local KERNELRPMnew=$(basename "$KERNELRPM") + local lnxrelnew=${lnxrel//-/_} + local EXTRA_VERSIONnew=${EXTRA_VERSION//-/_} + local KERNELRPMnew=$(basename "$KERNELRPM") if [ ! "$rpmonly" = "rpmonly" ]; then local builddir= if [ ! 
"$KERNELCOMPILEDIR" = "" ]; then builddir="$KERNELCOMPILEDIR" else builddir="BUILD/lustre-kernel-${lnxmaj}/lustre/linux-${lnxmaj}" - [ "$KERNELCOMPILEDIR" = "" ] || builddir="$KERNELCOMPILEDIR" + [ "$KERNELCOMPILEDIR" = "" ] || builddir="$KERNELCOMPILEDIR" [ -d "$builddir" ] || builddir="BUILD/lustre-kernel-${lnxmaj}/lustre/linux-${lnxmaj}.${lnxrel}" [ -d "$builddir" ] || builddir="BUILD/lustre-kernel-${lnxmaj}/lustre/linux-${lnxmaj}-${lnxrel}" - if [ ! -d "$builddir" ]; then + if [ ! -d "$builddir" ]; then pushd "BUILD/lustre-kernel-${lnxmaj}/lustre/" || return 255 local basebuilddir=$(ls -d linux-${lnxmaj}* | head -1) [ "$basebuilddir" = "" ] || builddir="BUILD/lustre-kernel-${lnxmaj}/lustre/${basebuilddir}" popd - fi + fi fi [ -d "$builddir" ] || return 255 - local dstdir="${REUSEBUILD}/${TIMESTAMP}/linux-${KERNCONFSMPTYPE}-${lnxmaj}-${EXTRA_VERSIONnew}.${TARGET_ARCH}" - ( $PATCHLESS ) && dstdir="${REUSEBUILD}/${TIMESTAMP}/linux-$KERNELRPMnew" && \ - dstdir="${dstdir%.rpm}" - [ -d "$dstdir" ] && rm -rf "$dstdir" + local dstdir="${REUSEBUILD}/${TIMESTAMP}/linux-${KERNCONFSMPTYPE}-${lnxmaj}-${EXTRA_VERSIONnew}.${TARGET_ARCH}" + ( $PATCHLESS ) && dstdir="${REUSEBUILD}/${TIMESTAMP}/linux-$KERNELRPMnew" && \ + dstdir="${dstdir%.rpm}" + [ -d "$dstdir" ] && rm -rf "$dstdir" mv "${builddir}" "$dstdir" || return 255 + if [ -n "$OFED_VERSION" ]; then + # move the OFED kernel-ib-devel tree as well + mv "${builddir%/*}/kernel-ib-devel/usr/src/ofa_kernel" "${dstdir%/*}" || return 255 + fi fi #store kernel rpm local kernelrpmname="kernel-lustre-${KERNCONFSMPTYPE}-${lnxmaj}-${EXTRA_VERSIONnew}.${TARGET_ARCH}.rpm" [ -f "RPMS/${TARGET_ARCH}/${kernelrpmname}" ] || kernelrpmname="kernel-${KERNCONFSMPTYPE}-${lnxmaj}-${EXTRA_VERSNnew}.${TARGET_ARCH}.rpm" - ( $PATCHLESS ) && [ -f "$KERNELRPM" ] && kernelrpmname="$KERNELRPMnew" + ( $PATCHLESS ) && [ -f "$KERNELRPM" ] && kernelrpmname="$KERNELRPMnew" if [ "$rpmonly" = "rpmonly" ] && [ -f "${REUSEBUILD}/${TIMESTAMP}/${kernelrpmname}" ]; 
then echo "RPM already exist in store directory tree" else @@ -980,17 +988,36 @@ store_for_reuse() else [ -f "RPMS/${TARGET_ARCH}/${kernelrpmname}" ] && cp -f "RPMS/${TARGET_ARCH}/${kernelrpmname}" "${REUSEBUILD}/${TIMESTAMP}/" fi + if [ -n "$OFED_VERSION" ]; then + # store kernel-ib RPMs + local rpmname + for rpmname in "kernel-ib" "kernel-ib-devel"; do + rpmname="${rpmname}-${OFED_VERSION}" + if $PATCHLESS; then + rpmname="${rpmname}-${LINUXRELEASE//-/_}" + else + rpmname="${rpmname}-${lnxmaj}${EXTRA_VERSION_DELIMITER//-/_}${EXTRA_VERSIONnew}${TARGET_DELIMITER//-/_}${KERNCONFSMPTYPE}" + fi + rpmname="${rpmname}.${TARGET_ARCH}.rpm" + if [ "$rpmonly" = "rpmonly" ] && [ -f "${REUSEBUILD}/${TIMESTAMP}/${rpmname}" ]; then + echo "RPM already exist in store directory tree" + else + [ -f "RPMS/${TARGET_ARCH}/${rpmname}" ] && cp -f "RPMS/${TARGET_ARCH}/${rpmname}" "${REUSEBUILD}/${TIMESTAMP}/" + fi + done + fi } set_rpm_smp_type() { + local infact_arch=${TARGET_ARCH} RPMSMPTYPE=default + [ "$infact_arch" == "i586" ] && infact_arch="i686" for smp_type in $SMP_ARCHS; do - [ $TARGET_ARCH == $smp_type ] && RPMSMPTYPE=smp && break + [ $infact_arch == $smp_type ] && RPMSMPTYPE=smp && break done - for smp_type in $BIGSMP_ARCHS; do - [ $TARGET_ARCH == $smp_type ] && RPMSMPTYPE=bigsmp && break + [ $infact_arch == $smp_type ] && RPMSMPTYPE=bigsmp && break done } @@ -1047,7 +1074,20 @@ unpack_linux_rpm() LINUXOBJ="$(pwd)/$src/$objects" fi done - [ -z "$LINUX" ] && RC=255 + if [ -z "$LINUX" ]; then + RC=255 + else + # dig out the release version + local LINUXRELEASEHEADER=version.h + if test -s ${LINUXOBJ:-$LINUX}/include/linux/utsrelease.h ; then + LINUXRELEASEHEADER=utsrelease.h + fi + LINUXRELEASE=$(sed -ne 's/#define UTS_RELEASE "\(.*\)"$/\1/p' ${LINUXOBJ:-$LINUX}/include/linux/$LINUXRELEASEHEADER) + if [ -z "$LINUXRELEASE" ]; then + echo "Failed to find linux release in ${LINUXOBJ:-$LINUX}/include/linux/$LINUXRELEASEHEADER" + RC=255 + fi + fi else RC=255 fi @@ -1108,6 +1148,7 
@@ find_linux_source_rpm() reuse_kernel_rpm() { local pathtorpm=$1 + local pathtokernelibrpm=$2 [ "$pathtorpm" = "" ] && return 255 [ -f "$pathtorpm" ] || return 255 [ -d $TOPDIR/reused ] || mkdir $TOPDIR/reused @@ -1116,6 +1157,12 @@ reuse_kernel_rpm() rpm2cpio < $pathtorpm | cpio -idc [ ${PIPESTATUS[0]} -eq 0 ] || return 255 + if [ -n "$pathtokernelibrpm" ] && [ -f "$pathtokernelibrpm" ]; then + rpm2cpio < $pathtokernelibrpm | cpio -idc + [ ${PIPESTATUS[0]} -eq 0 -o ${PIPESTATUS[1]} -eq 0 ] || return 255 + CONFIGURE_FLAGS="--with-o2ib=$(pwd)/usr/src/ofa_kernel ${CONFIGURE_FLAGS}" + fi + local smptype= if pushd usr/src/linux-*-obj/${TARGET_ARCH}; then local smptypes="$SMPTYPES" @@ -1213,6 +1260,41 @@ build_linux() return } +build_kernel_ib() +{ + # build kernel-ib{,-devel} + # some I/B drivers are architecture dependent and kernel-ib's configure + # does not figure it out for us ~sigh~ + local configure_options="" + case "$TARGET_ARCH" in + x86_64 | ia64) + configure_options="--with-ipath_inf-mod" + ;; + ppc64) + configure_options="--with-ipath_inf-mod --with-ehca-mod" + ;; + esac + $RPMBUILD --rebuild --define 'build_kernel_ib 1' --define 'build_kernel_ib_devel 1' \ + --define "_topdir ${TOPDIR}" --target ${TARGET_ARCH} \ + --define "KVERSION ${LINUXRELEASE}" \ + --define "KSRC ${LINUXOBJ:-${LINUX}}" \ + --define "LIB_MOD_DIR /lib/modules/${LINUXRELEASE}/updates" \ + --define "configure_options --without-quilt --with-core-mod --with-user_mad-mod --with-user_access-mod --with-addr_trans-mod --with-srp-target-mod --with-core-mod --with-mthca-mod --with-mlx4-mod --with-cxgb3-mod --with-nes-mod --with-ipoib-mod --with-sdp-mod --with-srp-mod --without-srp-target-mod --with-rds-mod --with-iser-mod --with-qlgc_vnic-mod --with-madeye-mod $configure_options" ${TOPDIR}/OFED/SRPMS/ofa_kernel-${OFED_VERSION}-ofed${OFED_VERSION}.src.rpm + + if [ ${PIPESTATUS[0]} != 0 ]; then + fatal 1 "Error building kernel-ib" + fi + + pushd "$TOPDIR" >/dev/null + rm -rf kernel-ib-devel + 
mkdir kernel-ib-devel + cd kernel-ib-devel + local rpm=$(ls $TOPDIR/RPMS/*/kernel-ib-devel-${OFED_VERSION}-${LINUXRELEASE//-/_}.*.rpm) + rpm2cpio -itv < $rpm | cpio -id + CONFIGURE_FLAGS="--with-o2ib=$(pwd)/usr/src/ofa_kernel ${CONFIGURE_FLAGS}" + popd >/dev/null +} + #build patchless lustre patchless_build_sequence() { @@ -1237,6 +1319,16 @@ patchless_build_sequence() esac unpack_linux_rpm $type $delimiter && rpmfound=true + + [ -d SRPMS ] || mkdir SRPMS + [ -d RPMS ] || mkdir RPMS + [ -d BUILD ] || mkdir BUILD + [ -d SOURCES ] || mkdir SOURCES + + # first build kernel-ib + if [ -n "$OFED_VERSION" ]; then + $rpmfound && build_kernel_ib + fi ( $rpmfound ) && build_lustre && buildsuccess=true && find_linux_source_rpm if $buildsuccess; then @@ -1246,7 +1338,7 @@ patchless_build_sequence() cp "$KERNELSOURCERPM" RPMS/${TARGET_ARCH}/ KERNELCOMPILEDIR="$LINUX" if $storeforreuse; then - store_for_reuse || echo "Cannot store for feature reuse" + store_for_reuse || echo "Cannot store for future reuse" fi return @@ -1332,8 +1424,11 @@ build_sequence_rpm_reuse() reusedkernelsourcerpm=$(ls ${curdir}/${REUSEDKERNELMASKnew}.rpm | head -1 ) [ -f "$reusedkernelsourcerpm" ] || continue + # don't need to check for kernel-ib RPM reuse here because sles9 is not supported + # by OFED >= 1.3.0 and this function appears to only be used for sles9 + [ -d $TOPDIR/reused ] && rm -rf $TOPDIR/reused - reuse_kernel_rpm "$reusedkernelsourcerpm" && build_linux nofullmake copyrpmkernel && build_lustre && buildsuccess=true + reuse_kernel_rpm "$reusedkernelsourcerpm" "" && build_linux nofullmake copyrpmkernel && build_lustre && buildsuccess=true ( $buildsuccess ) || continue if ( ! $NORPM ) && ( ! $PATCHLESS ) ; then [ -f "$reusedkernelrpm" ] && \ @@ -1363,7 +1458,9 @@ build_sequence_reuse() for curdir in $(echo $dirsforreuse); do local reusedkernelrpm= local reusedkernelsourcerpm= + local reusedkernelibrpm= [ -d "$curdir" ] || continue + [ -n "$OFED_VERSION" -a ! 
-d "${curdir%/*}/ofa_kernel" ] && continue local reusedkernelprefix="kernel-lustre-" ( $PATCHLESS ) && reusedkernelprefix= [ -f ${curdir}/../${reusedkernelprefix}${REUSEDKERNELMASK}.rpm ] && \ @@ -1371,12 +1468,25 @@ build_sequence_reuse() reusedkernelprefix="kernel-lustre-source-" [ -f ${curdir}/../${reusedkernelprefix}${REUSEDKERNELMASKnew}.rpm ] && \ reusedkernelsourcerpm=$(ls ${curdir}/../${reusedkernelprefix}${REUSEDKERNELMASKnew}.rpm | head -1 ) + if [ -n "$OFED_VERSION" ]; then + gen_lustre_version + reusedkernelprefix="kernel-ib-" + [ -f ${curdir}/../${reusedkernelprefix}${OFED_VERSION}-${LUSTRE_EXTRA_VERSION}.${TARGET_ARCH}.rpm ] && \ + reusedkernelibrpm=$(ls ${curdir}/../${reusedkernelprefix}${OFED_VERSION}-${LUSTRE_EXTRA_VERSION}.${TARGET_ARCH}.rpm | head -1 ) + reusedkernelibdevelrpm=$(ls ${curdir}/../${reusedkernelprefix}devel-${OFED_VERSION}-${LUSTRE_EXTRA_VERSION}.${TARGET_ARCH}.rpm | head -1 ) + fi if ! ( $NORPM ) && ! [ -f "$reusedkernelrpm" ]; then #kernel rpm not found. Build all continue fi if ! ( $NORPM ) && ! [ -f "$reusedkernelsourcerpm" ]; then #kernel source rpm not found. Build all continue fi + if [ -n "$OFED_VERSION" ]; then + if ! ( $NORPM ) && [ ! -f "$reusedkernelibrpm" -o ! -f "$reusedkernelibdevelrpm" ]; then #kernel-ib{,-devel} rpm not found. Build all + continue + fi + CONFIGURE_FLAGS="--with-o2ib=${curdir%/*}/ofa_kernel ${CONFIGURE_FLAGS}" + fi LINUX="$curdir" build_lustre || continue touch "$curdir/../" @@ -1387,6 +1497,9 @@ build_sequence_reuse() touch RPMS/${TARGET_ARCH}/kernel_was_reused [ -f "$reusedkernelsourcerpm" ] && \ cp -f "$reusedkernelsourcerpm" RPMS/${TARGET_ARCH}/ > /dev/null 2>&1 + [ -f "$reusedkernelibrpm" ] && \ + cp -f "$reusedkernelibrpm" RPMS/${TARGET_ARCH}/ > /dev/null 2>&1 + cp -f "$reusedkernelibdevelrpm" RPMS/${TARGET_ARCH}/ > /dev/null 2>&1 fi return done @@ -1567,12 +1680,12 @@ elif [ -z "$LINUX" ] ; then if !
$build_success; then build_sequence && build_success=true if $build_success; then - store_for_reuse || echo "Cannot store for feature reuse" + store_for_reuse || echo "Cannot store for future reuse" fi fi fi else - build_lustre + build_lustre && build_success=true fi ( $build_success ) || fatal 1 "Cannot build lustre" diff --git a/build/lustre-kernel-2.4.spec.in b/build/lustre-kernel-2.4.spec.in index 772e13b9e2c0a15f35c765197b0d703e35ebe09f..41efbb3520370886a7d98518e0981108df306394 100644 --- a/build/lustre-kernel-2.4.spec.in +++ b/build/lustre-kernel-2.4.spec.in @@ -715,7 +715,7 @@ if [ "%{buildbase}" -ne 0 ] ; then if [ "%{buildup}" -ne 0 ] ; then BuildObj up fi - perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = %{kextraverdelim}%{kextraver}custom/" $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/Makefile + perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = %{kextraverdelim}%{kextraver}%{flavordelim}custom/" $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/Makefile # Remove $RPM_BUILD_ROOT prefix from symlinks. 
for link in $(find $objdir -type l); do target=$(readlink $link) @@ -723,7 +723,7 @@ if [ "%{buildbase}" -ne 0 ] ; then ln -s ${target/$RPM_BUILD_ROOT/} $link done else # 2.4 rh-style - perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = %{kextraverdelim}%{kextraver}custom/" $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/Makefile + perl -p -i -e "s/^EXTRAVERSION.*/EXTRAVERSION = %{kextraverdelim}%{kextraver}%{flavordelim}custom/" $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/Makefile # get the one from the build we just completed as it might have picked # up new options #cp ../lustre/kernel_patches/kernel_configs/kernel-%{kversion}-@LUSTRE_TARGET@-%{_target_cpu}%{dashtargetboard}.config $RPM_BUILD_ROOT/usr/src/linux-%{KVERREL}/.config diff --git a/ldiskfs/.cvsignore b/ldiskfs/.cvsignore index 5b47d43c0f4e42c410399b34bc13802c33e3d9d3..1f947acac8e412ce2cfeba136c9b99f35d69511f 100644 --- a/ldiskfs/.cvsignore +++ b/ldiskfs/.cvsignore @@ -17,4 +17,6 @@ lustre-ldiskfs-*.tar.gz lustre-ldiskfs.spec missing mkinstalldirs +Module.symvers +Modules.symvers stamp-h1 diff --git a/ldiskfs/ChangeLog b/ldiskfs/ChangeLog index 7622c5f010a6e29528bc17b3fb7b1a24a411ded9..67892e5e3a47831665afb27d47388e85db18a13a 100644 --- a/ldiskfs/ChangeLog +++ b/ldiskfs/ChangeLog @@ -1,5 +1,28 @@ +tbd Sun Microsystems, Inc. + * version 3.0.6 + +Severity : normal +Bugzilla : 15320 +Description: OSS crashes frequently, e2fsck does not fix +Details : Add an extra check to ldiskfs extents code for the condition, + eh_entries = 0 & eh_depth != 0 + +Severity : normal +Bugzilla : 15459 +Description: migrate ldiskfs - ldiskfs2 (extents on directories) +Details : disable preallocation for non-regular files. + +Severity : normal +Frequency : blocks per group < blocksize*8 and uninit_groups is enabled +Bugzilla : 15932 +Description: ldiskfs error: XXX blocks in bitmap, YYY in gd +Details : If blocks per group is less than blocksize*8, set rest of the + bitmap to 1. 
+ +------------------------------------------------------------------------------- + 04-26-2008 Sun Microsystems, Inc. - * version 3.0.5 + * version 3.0.5 Severity : normal Bugzilla : 14493 @@ -20,6 +43,18 @@ Description: soft lockups on 1.6.2 MDS (is_subdir) Details : don't add dentries with ".." to dcache and ignore such dentries in iopen_lookup(). +Severity : critical +Frequency : very rare, if additional xattrs are used on kernels >= 2.6.12 +Bugzilla : 15777 +Description: files may lose file attributes in some cases +Details : If there are multiple extended attributes stored on the inode, + in particular ACLs, SELinux, or user attributes (if user_xattr + is specified for the client mount options) then there is a risk + of attribute loss. If an additional attribute is stored + initially in the inode and then increases in size enough to be + moved to the external attribute block (e.g. ACL growing in size) + the attribute may be lost. + -------------------------------------------------------------------------------- 2008-01-11 Sun Microsystems, Inc. diff --git a/ldiskfs/configure.ac b/ldiskfs/configure.ac index fee6f5689c6d95c1471affa6341ff57e9f38c102..79651c4f76fb93e5da759d04bb1eab2d67799ca8 100644 --- a/ldiskfs/configure.ac +++ b/ldiskfs/configure.ac @@ -1,6 +1,6 @@ # Process this file with autoconf to produce a configure script. -AC_INIT([Lustre ldiskfs], 3.0.4, [https://bugzilla.lustre.org/]) +AC_INIT([Lustre ldiskfs], 3.0.6, [https://bugzilla.lustre.org/]) AC_CONFIG_SRCDIR([lustre-ldiskfs.spec.in]) # Don't look for install-sh, etc. in ..
diff --git a/ldiskfs/kernel_patches/patches/ext3-block-bitmap-validation-2.6-rhel5.patch b/ldiskfs/kernel_patches/patches/ext3-block-bitmap-validation-2.6-rhel5.patch new file mode 100644 index 0000000000000000000000000000000000000000..39cbcc84a4074ea8f06b6d1f3d61c2b09c819ace --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-block-bitmap-validation-2.6-rhel5.patch @@ -0,0 +1,169 @@ + fs/ext3/balloc.c | 99 ++++++++++++++++++++++++++++++++++++++++++++---------- + 1 files changed, 81 insertions(+), 18 deletions(-) +diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c +index ff3428e..a9140ea 100644 +Index: linux-stage/fs/ext3/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/balloc.c ++++ linux-stage/fs/ext3/balloc.c +@@ -143,9 +143,96 @@ unsigned ext3_init_block_bitmap(struct s + return free_blocks - sbi->s_itb_per_group - 2; + } + +-/* +- * Read the bitmap for a given block_group, reading into the specified +- * slot in the superblock's bitmap cache. ++/** ++* bh_uptodate_or_lock: Test whether the buffer is uptodate ++* @bh: struct buffer_head ++* ++* Return true if the buffer is up-to-date and false, ++* with the buffer locked, if not. 
++*/ ++int bh_uptodate_or_lock(struct buffer_head *bh) ++{ ++ if (!buffer_uptodate(bh)) { ++ lock_buffer(bh); ++ if (!buffer_uptodate(bh)) ++ return 0; ++ unlock_buffer(bh); ++ } ++ return 1; ++} ++ ++/** ++* bh_submit_read: Submit a locked buffer for reading ++* @bh: struct buffer_head ++* ++* Returns a negative error ++*/ ++int bh_submit_read(struct buffer_head *bh) ++{ ++ if (!buffer_locked(bh)) ++ lock_buffer(bh); ++ if (buffer_uptodate(bh)) ++ return 0; ++ get_bh(bh); ++ bh->b_end_io = end_buffer_read_sync; ++ submit_bh(READ, bh); ++ wait_on_buffer(bh); ++ if (buffer_uptodate(bh)) ++ return 0; ++ return -EIO; ++} ++ ++static int ext3_valid_block_bitmap(struct super_block *sb, ++ struct ext3_group_desc *desc, ++ unsigned int block_group, ++ struct buffer_head *bh) ++{ ++ ext3_grpblk_t offset; ++ ext3_grpblk_t next_zero_bit; ++ ext3_fsblk_t bitmap_blk; ++ ext3_fsblk_t group_first_block; ++ ++ group_first_block = ext3_group_first_block_no(sb, block_group); ++ ++ /* check whether block bitmap block number is set */ ++ bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); ++ offset = bitmap_blk - group_first_block; ++ if (!ext3_test_bit(offset, bh->b_data)) ++ /* bad block bitmap */ ++ goto err_out; ++ ++ /* check whether the inode bitmap block number is set */ ++ bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap); ++ offset = bitmap_blk - group_first_block; ++ if (!ext3_test_bit(offset, bh->b_data)) ++ /* bad block bitmap */ ++ goto err_out; ++ ++ /* check whether the inode table block number is set */ ++ bitmap_blk = le32_to_cpu(desc->bg_inode_table); ++ offset = bitmap_blk - group_first_block; ++ next_zero_bit = ext3_find_next_zero_bit(bh->b_data, ++ offset + EXT3_SB(sb)->s_itb_per_group, ++ offset); ++ if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group) ++ /* good bitmap for inode tables */ ++ return 1; ++ ++err_out: ++ ext3_error(sb, __FUNCTION__, ++ "Invalid block bitmap - " ++ "block_group = %d, block = %lu", ++ (int)block_group, bitmap_blk); ++ return 0; 
++} ++ ++/** ++ * read_block_bitmap() ++ * @sb: super block ++ * @block_group: given block group ++ * ++ * Read the bitmap for a given block_group,and validate the ++ * bits for block/inode/inode tables are set in the bitmaps. + * + * Return buffer_head on success or NULL in case of failure. + */ +@@ -154,29 +241,42 @@ read_block_bitmap(struct super_block *sb + { + struct ext3_group_desc * desc; + struct buffer_head * bh = NULL; ++ ext3_fsblk_t bitmap_blk; + + desc = ext3_get_group_desc (sb, block_group, NULL); + if (!desc) +- goto error_out; ++ return NULL; ++ bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); ++ bh = sb_getblk(sb, bitmap_blk); ++ if (unlikely(!bh)) { ++ ext3_error(sb, __FUNCTION__, ++ "Can not read block bitmap - " ++ "block group = %d, block_bitmap = %lu", ++ (int)block_group, bitmap_blk); ++ return NULL; ++ } ++ if (bh_uptodate_or_lock(bh)) ++ return bh; ++ + if (desc->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) { +- bh = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); +- if (!buffer_uptodate(bh)) { +- lock_buffer(bh); +- if (!buffer_uptodate(bh)) { +- ext3_init_block_bitmap(sb, bh,block_group,desc); +- set_buffer_uptodate(bh); +- } +- unlock_buffer(bh); +- } +- } else { +- bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap)); ++ ext3_init_block_bitmap(sb, bh, block_group, desc); ++ set_buffer_uptodate(bh); ++ unlock_buffer(bh); ++ return bh; ++ } ++ if (bh_submit_read(bh) < 0) { ++ brelse(bh); ++ ext3_error(sb, __FUNCTION__, ++ "Cannot read block bitmap - " ++ "block group = %d block_bitmap = %lu", ++ (int)block_group, bitmap_blk); ++ return NULL; ++ } ++ if (!ext3_valid_block_bitmap(sb, desc, block_group, bh)) { ++ brelse(bh); ++ return NULL; + } +- if (!bh) +- ext3_error (sb, "read_block_bitmap", +- "Cannot read block bitmap - " +- "block_group = %d, block_bitmap = %u", +- block_group, le32_to_cpu(desc->bg_block_bitmap)); +-error_out: ++ + return bh; + } + /* diff --git a/ldiskfs/kernel_patches/patches/ext3-block-bitmap-validation-2.6-sles10.patch
b/ldiskfs/kernel_patches/patches/ext3-block-bitmap-validation-2.6-sles10.patch new file mode 100644 index 0000000000000000000000000000000000000000..417591d1c3e664ab1d62d7afc31574646133d871 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-block-bitmap-validation-2.6-sles10.patch @@ -0,0 +1,188 @@ + fs/ext3/balloc.c | 99 ++++++++++++++++++++++++++++++++++++++++++++---------- + 1 files changed, 81 insertions(+), 18 deletions(-) +diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c +index ff3428e..a9140ea 100644 +Index: linux-stage/fs/ext3/balloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/balloc.c ++++ linux-stage/fs/ext3/balloc.c +@@ -144,9 +144,96 @@ unsigned ext3_init_block_bitmap(struct s + return free_blocks - sbi->s_itb_per_group - 2; + } + +-/* +- * Read the bitmap for a given block_group, reading into the specified +- * slot in the superblock's bitmap cache. ++/** ++* bh_uptodate_or_lock: Test whether the buffer is uptodate ++* @bh: struct buffer_head ++* ++* Return true if the buffer is up-to-date and false, ++* with the buffer locked, if not. 
++*/
++int bh_uptodate_or_lock(struct buffer_head *bh)
++{
++	if (!buffer_uptodate(bh)) {
++		lock_buffer(bh);
++		if (!buffer_uptodate(bh))
++			return 0;
++		unlock_buffer(bh);
++	}
++	return 1;
++}
++
++/**
++* bh_submit_read: Submit a locked buffer for reading
++* @bh: struct buffer_head
++*
++* Returns 0 if the buffer is uptodate, -EIO if the read failed
++*/
++int bh_submit_read(struct buffer_head *bh)
++{
++	if (!buffer_locked(bh))
++		lock_buffer(bh);
++	if (!buffer_uptodate(bh)) {
++		get_bh(bh);
++		bh->b_end_io = end_buffer_read_sync;
++		submit_bh(READ, bh);
++		wait_on_buffer(bh);
++		return buffer_uptodate(bh) ? 0 : -EIO;
++	}
++	unlock_buffer(bh);	/* already uptodate: don't leak the lock */
++	return 0;
++}
++
++static int ext3_valid_block_bitmap(struct super_block *sb,
++					struct ext3_group_desc *desc,
++					unsigned int block_group,
++					struct buffer_head *bh)
++{
++	unsigned long long offset;
++	unsigned long long next_zero_bit;
++	unsigned long long bitmap_blk;
++	unsigned long long group_first_block;
++
++	group_first_block = ext3_group_first_block_no(sb, block_group);
++
++	/* check whether block bitmap block number is set */
++	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
++	offset = bitmap_blk - group_first_block;
++	if (!ext3_test_bit(offset, bh->b_data))
++		/* bad block bitmap */
++		goto err_out;
++
++	/* check whether the inode bitmap block number is set */
++	bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
++	offset = bitmap_blk - group_first_block;
++	if (!ext3_test_bit(offset, bh->b_data))
++		/* bad block bitmap */
++		goto err_out;
++
++	/* check whether the inode table block number is set */
++	bitmap_blk = le32_to_cpu(desc->bg_inode_table);
++	offset = bitmap_blk - group_first_block;
++	next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
++				offset + EXT3_SB(sb)->s_itb_per_group,
++				offset);
++	if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
++		/* good bitmap for inode tables */
++		return 1;
++
++err_out:
++	ext3_error(sb, __FUNCTION__,
++			"Invalid block bitmap - "
++			"block_group = %d, block = %llu",
++			block_group, bitmap_blk);
++	return 0;
++}
++
++/**
++ * read_block_bitmap()
++ * @sb: super block
++ * @block_group: given block group
++ *
++ * Read the bitmap for a given block_group, and validate the
++ * bits for block/inode/inode tables are set in the bitmaps.
+  *
+  * Return buffer_head on success or NULL in case of failure.
+  */
+@@ -155,29 +242,42 @@ read_block_bitmap(struct super_block *sb
+ {
+ 	struct ext3_group_desc * desc;
+ 	struct buffer_head * bh = NULL;
++	unsigned long long bitmap_blk;
+ 
+ 	desc = ext3_get_group_desc (sb, block_group, NULL);
+ 	if (!desc)
+-		goto error_out;
++		return NULL;
++	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
++	bh = sb_getblk(sb, bitmap_blk);
++	if (unlikely(!bh)) {
++		ext3_error(sb, __FUNCTION__,
++			    "Can not read block bitmap - "
++			    "block group = %d, block_bitmap = %llu",
++			    (int)block_group, bitmap_blk);
++		return NULL;
++	}
++	if (bh_uptodate_or_lock(bh))
++		return bh;
++
+ 	if (desc->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) {
+-		bh = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap));
+-		if (!buffer_uptodate(bh)) {
+-			lock_buffer(bh);
+-			if (!buffer_uptodate(bh)) {
+-				ext3_init_block_bitmap(sb, bh,block_group,desc);
+-				set_buffer_uptodate(bh);
+-			}
+-			unlock_buffer(bh);
+-		}
+-	} else {
+-		bh = sb_bread(sb, le32_to_cpu(desc->bg_block_bitmap));
++		ext3_init_block_bitmap(sb, bh, block_group, desc);
++		set_buffer_uptodate(bh);
++		unlock_buffer(bh);
++		return bh;
++	}
++	if (bh_submit_read(bh) < 0) {
++		brelse(bh);
++		ext3_error(sb, __FUNCTION__,
++			    "Cannot read block bitmap - "
++			    "block group = %d block_bitmap = %llu",
++			    (int)block_group, bitmap_blk);
++		return NULL;
+ 	}
+-	if (!bh)
+-		ext3_error (sb, "read_block_bitmap",
+-			    "Cannot read block bitmap - "
+-			    "block_group = %d, block_bitmap = %u",
+-			    block_group, le32_to_cpu(desc->bg_block_bitmap));
+-error_out:
++	if (!ext3_valid_block_bitmap(sb, desc, block_group, bh)) {
++		brelse(bh);
++		return NULL;
++	}
++
+ 	return bh;
+ }
+ /*
+Index: linux-stage/include/linux/ext3_fs.h
+===================================================================
+--- linux-stage.orig/include/linux/ext3_fs.h
++++ linux-stage/include/linux/ext3_fs.h
+@@ -872,6 +872,14 @@ struct dir_private_info {
+ 	__u32 next_hash;
+ };
+ 
++/* first block of group @group_no: s_first_data_block + group_no groups */
++static inline long long
++ext3_group_first_block_no(struct super_block *sb, unsigned long group_no)
++{
++	return le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block) +
++		(long long)EXT3_BLOCKS_PER_GROUP(sb) * group_no;
++}
++
+ /*
+  * Special error return code only used by dx_probe() and its callers.
+  */
diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-expand-lose-block.patch b/ldiskfs/kernel_patches/patches/ext3-ea-expand-lose-block.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d40b40d4f9864bb7c0bb8db828ee3369eb945882
--- /dev/null
+++ b/ldiskfs/kernel_patches/patches/ext3-ea-expand-lose-block.patch
@@ -0,0 +1,27 @@
+Date: Mon, 12 May 2008 11:24:40 +0800
+From: Tiger Yang <tiger.yang@oracle.com>
+Subject: [PATCH] ext3/4: fix uninitialized bs in ext3/4_xattr_set_handle()
+To: linux-ext4@vger.kernel.org
+Cc: linux-fsdevel@vger.kernel.org, linux-kernel@vger.kernel.org
+
+This fix the uninitialized bs when we try to replace a xattr entry in ibody
+with the new value which require more than free space.
+ +Signed-off-by: Tiger Yang <tiger.yang@oracle.com> + + +diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c +--- a/fs/ext3/xattr.c ++++ b/fs/ext3/xattr.c +@@ -1000,6 +1000,11 @@ ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, + i.value = NULL; + error = ext3_xattr_block_set(handle, inode, &i, &bs); + } else if (error == -ENOSPC) { ++ if (EXT3_I(inode)->i_file_acl && !bs.s.base) { ++ error = ext3_xattr_block_find(inode, &i, &bs); ++ if (error) ++ goto cleanup; ++ } + error = ext3_xattr_block_set(handle, inode, &i, &bs); + if (error) + goto cleanup; diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch index 89cc1b5d4bf0b3d5a5cd6364ac33c45ea6362825..bf3ba6c406769518ca222d169c57e64389220451 100644 --- a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-rhel4.patch @@ -1,8 +1,8 @@ -Index: linux-stage/fs/ext3/ialloc.c +Index: linux-2.6.9-67.0.20/fs/ext3/ialloc.c =================================================================== ---- linux-stage.orig/fs/ext3/ialloc.c 2005-10-04 16:53:24.000000000 -0600 -+++ linux-stage/fs/ext3/ialloc.c 2005-10-04 17:07:25.000000000 -0600 -@@ -629,6 +629,9 @@ +--- linux-2.6.9-67.0.20.orig/fs/ext3/ialloc.c ++++ linux-2.6.9-67.0.20/fs/ext3/ialloc.c +@@ -632,6 +632,9 @@ got: spin_unlock(&sbi->s_next_gen_lock); ei->i_state = EXT3_STATE_NEW; @@ -12,11 +12,11 @@ Index: linux-stage/fs/ext3/ialloc.c ret = inode; if(DQUOT_ALLOC_INODE(inode)) { -Index: linux-stage/fs/ext3/inode.c +Index: linux-2.6.9-67.0.20/fs/ext3/inode.c =================================================================== ---- linux-stage.orig/fs/ext3/inode.c 2005-10-04 17:00:22.000000000 -0600 -+++ linux-stage/fs/ext3/inode.c 2005-10-04 17:07:25.000000000 -0600 -@@ -2274,7 +2274,7 @@ +--- linux-2.6.9-67.0.20.orig/fs/ext3/inode.c ++++ linux-2.6.9-67.0.20/fs/ext3/inode.c +@@ -2275,7 +2275,7 @@ 
static unsigned long ext3_get_inode_bloc * trying to determine the inode's location on-disk and no read need be * performed. */ @@ -25,7 +25,7 @@ Index: linux-stage/fs/ext3/inode.c struct ext3_iloc *iloc, int in_mem) { unsigned long block; -@@ -2484,6 +2484,11 @@ void ext3_read_inode(struct inode * inod +@@ -2485,6 +2485,11 @@ void ext3_read_inode(struct inode * inod ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); @@ -37,7 +37,7 @@ Index: linux-stage/fs/ext3/inode.c if (S_ISREG(inode->i_mode)) { inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; -@@ -2619,6 +2624,9 @@ static int ext3_do_update_inode(handle_t +@@ -2620,6 +2625,9 @@ static int ext3_do_update_inode(handle_t } else for (block = 0; block < EXT3_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; @@ -47,7 +47,7 @@ Index: linux-stage/fs/ext3/inode.c BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) -@@ -2849,7 +2857,8 @@ ext3_reserve_inode_write(handle_t *handl +@@ -2850,7 +2858,8 @@ ext3_reserve_inode_write(handle_t *handl { int err = 0; if (handle) { @@ -57,11 +57,11 @@ Index: linux-stage/fs/ext3/inode.c if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, iloc->bh); -Index: linux-stage/fs/ext3/xattr.c +Index: linux-2.6.9-67.0.20/fs/ext3/xattr.c =================================================================== ---- linux-stage.orig/fs/ext3/xattr.c 2005-10-04 16:50:11.000000000 -0600 -+++ linux-stage/fs/ext3/xattr.c 2005-10-04 17:19:43.000000000 -0600 -@@ -149,17 +149,12 @@ +--- linux-2.6.9-67.0.20.orig/fs/ext3/xattr.c ++++ linux-2.6.9-67.0.20/fs/ext3/xattr.c +@@ -149,17 +149,12 @@ ext3_listxattr(struct dentry *dentry, ch } /* @@ -82,7 +82,7 @@ Index: linux-stage/fs/ext3/xattr.c void *buffer, size_t buffer_size) { struct buffer_head *bh = NULL; -@@ -173,7 +168,6 @@ +@@ -173,7 +168,6 @@ ext3_xattr_get(struct inode *inode, int 
if (name == NULL) return -EINVAL; @@ -90,7 +90,7 @@ Index: linux-stage/fs/ext3/xattr.c error = -ENODATA; if (!EXT3_I(inode)->i_file_acl) goto cleanup; -@@ -246,15 +240,87 @@ +@@ -246,15 +240,87 @@ found: cleanup: brelse(bh); @@ -181,7 +181,7 @@ Index: linux-stage/fs/ext3/xattr.c * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * -@@ -262,7 +328,31 @@ +@@ -262,7 +328,31 @@ cleanup: * used / required on success. */ int @@ -214,7 +214,7 @@ Index: linux-stage/fs/ext3/xattr.c { struct buffer_head *bh = NULL; struct ext3_xattr_entry *entry; -@@ -273,7 +363,6 @@ +@@ -273,7 +363,6 @@ ext3_xattr_list(struct inode *inode, cha ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -222,7 +222,7 @@ Index: linux-stage/fs/ext3/xattr.c error = 0; if (!EXT3_I(inode)->i_file_acl) goto cleanup; -@@ -330,11 +419,149 @@ +@@ -330,11 +419,149 @@ bad_block: ext3_error(inode->i_sb, "ext3 cleanup: brelse(bh); @@ -373,7 +373,7 @@ Index: linux-stage/fs/ext3/xattr.c /* * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is * not set, set it. -@@ -356,6 +583,279 @@ +@@ -356,6 +583,279 @@ static void ext3_xattr_update_super_bloc } /* @@ -653,7 +653,7 @@ Index: linux-stage/fs/ext3/xattr.c * ext3_xattr_set_handle() * * Create, replace or remove an extended attribute for this inode. 
Buffer -@@ -369,6 +869,104 @@ +@@ -369,6 +869,110 @@ static void ext3_xattr_update_super_bloc */ int ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, @@ -710,6 +710,9 @@ Index: linux-stage/fs/ext3/xattr.c + } else if (!found && (flags & XATTR_REPLACE)) { + err = -ENODATA; + goto finish; ++ } else if (!found && !value) { ++ err = 0; ++ goto finish; + } + + /* check if we have enough space to store attribute */ @@ -733,14 +736,17 @@ Index: linux-stage/fs/ext3/xattr.c + } + } + -+ /* try to store EA in inode body */ -+ err = ext3_xattr_ibody_set(handle, inode, name_index, name, -+ value, value_len, flags); -+ if (err) { -+ /* can't store EA in inode body */ -+ /* try to store in block */ -+ err = ext3_xattr_block_set(handle, inode, name_index, -+ name, value, value_len, flags); ++ /* Add entry if value is provided */ ++ if (value != NULL) { ++ /* try to store EA in inode body */ ++ err = ext3_xattr_ibody_set(handle, inode, name_index, name, ++ value, value_len, flags); ++ if (err) { ++ /* can't store EA in inode body */ ++ /* try to store in block */ ++ err = ext3_xattr_block_set(handle, inode, name_index, ++ name, value, value_len, flags); ++ } + } + +finish: @@ -758,7 +764,7 @@ Index: linux-stage/fs/ext3/xattr.c const char *name, const void *value, size_t value_len, int flags) { -@@ -391,22 +989,7 @@ +@@ -391,22 +995,7 @@ ext3_xattr_set_handle(handle_t *handle, * towards the end of the block). * end -- Points right after the block pointed to by header. */ @@ -781,7 +787,7 @@ Index: linux-stage/fs/ext3/xattr.c if (EXT3_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. 
*/ bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); -@@ -638,7 +1221,6 @@ +@@ -638,7 +1227,6 @@ cleanup: brelse(bh); if (!(bh && header == HDR(bh))) kfree(header); @@ -789,11 +795,11 @@ Index: linux-stage/fs/ext3/xattr.c return error; } -Index: linux-stage/fs/ext3/xattr.h +Index: linux-2.6.9-67.0.20/fs/ext3/xattr.h =================================================================== ---- linux-stage.orig/fs/ext3/xattr.h 2005-10-04 16:50:11.000000000 -0600 -+++ linux-stage/fs/ext3/xattr.h 2005-10-04 17:07:25.000000000 -0600 -@@ -67,7 +67,8 @@ +--- linux-2.6.9-67.0.20.orig/fs/ext3/xattr.h ++++ linux-2.6.9-67.0.20/fs/ext3/xattr.h +@@ -67,7 +67,8 @@ extern ssize_t ext3_listxattr(struct den extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext3_xattr_list(struct inode *, char *, size_t); extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); @@ -803,11 +809,11 @@ Index: linux-stage/fs/ext3/xattr.h extern void ext3_xattr_delete_inode(handle_t *, struct inode *); extern void ext3_xattr_put_super(struct super_block *); -Index: linux-stage/include/linux/ext3_fs.h +Index: linux-2.6.9-67.0.20/include/linux/ext3_fs.h =================================================================== ---- linux-stage.orig/include/linux/ext3_fs.h 2005-10-04 16:53:29.000000000 -0600 -+++ linux-stage/include/linux/ext3_fs.h 2005-10-04 17:07:25.000000000 -0600 -@@ -293,6 +293,8 @@ +--- linux-2.6.9-67.0.20.orig/include/linux/ext3_fs.h ++++ linux-2.6.9-67.0.20/include/linux/ext3_fs.h +@@ -293,6 +293,8 @@ struct ext3_inode { __u32 m_i_reserved2[2]; } masix2; } osd2; /* OS dependent 2 */ @@ -816,7 +822,7 @@ Index: linux-stage/include/linux/ext3_fs.h }; #define i_size_high i_dir_acl -@@ -757,6 +759,7 @@ +@@ -757,6 +759,7 @@ extern unsigned long ext3_count_free (st extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); 
extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); @@ -824,11 +830,11 @@ Index: linux-stage/include/linux/ext3_fs.h extern void ext3_read_inode (struct inode *); extern int ext3_write_inode (struct inode *, int); -Index: linux-stage/include/linux/ext3_fs_i.h +Index: linux-2.6.9-67.0.20/include/linux/ext3_fs_i.h =================================================================== ---- linux-stage.orig/include/linux/ext3_fs_i.h 2005-10-04 16:50:11.000000000 -0600 -+++ linux-stage/include/linux/ext3_fs_i.h 2005-10-04 17:07:25.000000000 -0600 -@@ -113,6 +113,9 @@ +--- linux-2.6.9-67.0.20.orig/include/linux/ext3_fs_i.h ++++ linux-2.6.9-67.0.20/include/linux/ext3_fs_i.h +@@ -113,6 +113,9 @@ struct ext3_inode_info { */ loff_t i_disksize; diff --git a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch index 72c25a4def3e5e01a3bb60a932a7665890627d9b..18efe81e0dae295c1d345c54e125ea94e9e80163 100644 --- a/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-ea-in-inode-2.6-suse.patch @@ -1,10 +1,10 @@ %patch -Index: linux-2.6.0/fs/ext3/ialloc.c +Index: linux-2.6.5-7.311/fs/ext3/ialloc.c =================================================================== ---- linux-2.6.0.orig/fs/ext3/ialloc.c 2004-01-14 18:54:11.000000000 +0300 -+++ linux-2.6.0/fs/ext3/ialloc.c 2004-01-14 18:54:12.000000000 +0300 -@@ -627,6 +627,9 @@ - inode->i_generation = EXT3_SB(sb)->s_next_generation++; +--- linux-2.6.5-7.311.orig/fs/ext3/ialloc.c ++++ linux-2.6.5-7.311/fs/ext3/ialloc.c +@@ -633,6 +633,9 @@ got: + spin_unlock(&sbi->s_next_gen_lock); ei->i_state = EXT3_STATE_NEW; + ei->i_extra_isize = @@ -13,11 +13,11 @@ Index: linux-2.6.0/fs/ext3/ialloc.c ret = inode; if(DQUOT_ALLOC_INODE(inode)) { -Index: linux-2.6.0/fs/ext3/inode.c +Index: linux-2.6.5-7.311/fs/ext3/inode.c =================================================================== ---- 
linux-2.6.0.orig/fs/ext3/inode.c 2004-01-14 18:54:12.000000000 +0300 -+++ linux-2.6.0/fs/ext3/inode.c 2004-01-14 19:09:46.000000000 +0300 -@@ -2339,7 +2339,7 @@ +--- linux-2.6.5-7.311.orig/fs/ext3/inode.c ++++ linux-2.6.5-7.311/fs/ext3/inode.c +@@ -2284,7 +2284,7 @@ static unsigned long ext3_get_inode_bloc * trying to determine the inode's location on-disk and no read need be * performed. */ @@ -26,7 +26,7 @@ Index: linux-2.6.0/fs/ext3/inode.c struct ext3_iloc *iloc, int in_mem) { unsigned long block; -@@ -2547,6 +2547,11 @@ +@@ -2495,6 +2495,11 @@ void ext3_read_inode(struct inode * inod ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); @@ -38,7 +38,7 @@ Index: linux-2.6.0/fs/ext3/inode.c if (S_ISREG(inode->i_mode)) { inode->i_op = &ext3_file_inode_operations; inode->i_fop = &ext3_file_operations; -@@ -2682,6 +2687,9 @@ +@@ -2630,6 +2635,9 @@ static int ext3_do_update_inode(handle_t } else for (block = 0; block < EXT3_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; @@ -48,7 +48,7 @@ Index: linux-2.6.0/fs/ext3/inode.c BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); rc = ext3_journal_dirty_metadata(handle, bh); if (!err) -@@ -2849,7 +2857,8 @@ ext3_reserve_inode_write(handle_t *handl +@@ -2839,7 +2847,8 @@ ext3_reserve_inode_write(handle_t *handl { int err = 0; if (handle) { @@ -58,11 +58,11 @@ Index: linux-2.6.0/fs/ext3/inode.c if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); err = ext3_journal_get_write_access(handle, iloc->bh); -Index: linux-2.6.0/fs/ext3/xattr.c +Index: linux-2.6.5-7.311/fs/ext3/xattr.c =================================================================== ---- linux-2.6.0.orig/fs/ext3/xattr.c 2003-12-30 08:33:13.000000000 +0300 -+++ linux-2.6.0/fs/ext3/xattr.c 2004-01-14 18:54:12.000000000 +0300 -@@ -246,17 +246,12 @@ +--- linux-2.6.5-7.311.orig/fs/ext3/xattr.c ++++ linux-2.6.5-7.311/fs/ext3/xattr.c +@@ -245,17 +245,12 @@ ext3_removexattr(struct dentry *dentry, } /* @@ -83,7 +83,7 @@ 
Index: linux-2.6.0/fs/ext3/xattr.c void *buffer, size_t buffer_size) { struct buffer_head *bh = NULL; -@@ -270,7 +265,6 @@ +@@ -269,7 +264,6 @@ ext3_xattr_get(struct inode *inode, int if (name == NULL) return -EINVAL; @@ -91,7 +91,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c error = -ENODATA; if (!EXT3_I(inode)->i_file_acl) goto cleanup; -@@ -343,15 +337,87 @@ +@@ -342,15 +336,87 @@ found: cleanup: brelse(bh); @@ -182,7 +182,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * -@@ -359,7 +425,31 @@ +@@ -358,7 +424,31 @@ cleanup: * used / required on success. */ int @@ -215,7 +215,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c { struct buffer_head *bh = NULL; struct ext3_xattr_entry *entry; -@@ -370,7 +460,6 @@ +@@ -369,7 +459,6 @@ ext3_xattr_list(struct inode *inode, cha ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); @@ -223,7 +223,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c error = 0; if (!EXT3_I(inode)->i_file_acl) goto cleanup; -@@ -431,11 +520,138 @@ +@@ -430,11 +519,138 @@ bad_block: ext3_error(inode->i_sb, "ext3 cleanup: brelse(bh); @@ -363,7 +363,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c /* * If the EXT3_FEATURE_COMPAT_EXT_ATTR feature of this file system is * not set, set it. -@@ -457,6 +673,279 @@ +@@ -456,6 +672,279 @@ static void ext3_xattr_update_super_bloc } /* @@ -643,7 +643,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c * ext3_xattr_set_handle() * * Create, replace or remove an extended attribute for this inode. 
Buffer -@@ -470,6 +959,104 @@ +@@ -469,6 +958,110 @@ static void ext3_xattr_update_super_bloc */ int ext3_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, @@ -700,6 +700,9 @@ Index: linux-2.6.0/fs/ext3/xattr.c + } else if (!found && (flags & XATTR_REPLACE)) { + err = -ENODATA; + goto finish; ++ } else if (!found && !value) { ++ err = 0; ++ goto finish; + } + + /* check if we have enough space to store attribute */ @@ -723,14 +726,17 @@ Index: linux-2.6.0/fs/ext3/xattr.c + } + } + -+ /* try to store EA in inode body */ -+ err = ext3_xattr_ibody_set(handle, inode, name_index, name, -+ value, value_len, flags); -+ if (err) { -+ /* can't store EA in inode body */ -+ /* try to store in block */ -+ err = ext3_xattr_block_set(handle, inode, name_index, -+ name, value, value_len, flags); ++ /* Add entry if value is provided */ ++ if (value != NULL) { ++ /* try to store EA in inode body */ ++ err = ext3_xattr_ibody_set(handle, inode, name_index, name, ++ value, value_len, flags); ++ if (err) { ++ /* can't store EA in inode body */ ++ /* try to store in block */ ++ err = ext3_xattr_block_set(handle, inode, name_index, ++ name, value, value_len, flags); ++ } + } + +finish: @@ -748,7 +754,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c const char *name, const void *value, size_t value_len, int flags) { -@@ -492,22 +1078,7 @@ +@@ -491,22 +1084,7 @@ ext3_xattr_set_handle(handle_t *handle, * towards the end of the block). * end -- Points right after the block pointed to by header. */ @@ -771,7 +777,7 @@ Index: linux-2.6.0/fs/ext3/xattr.c if (EXT3_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. 
*/ bh = sb_bread(sb, EXT3_I(inode)->i_file_acl); -@@ -733,7 +1304,6 @@ +@@ -730,7 +1308,6 @@ cleanup: brelse(bh); if (!(bh && header == HDR(bh))) kfree(header); @@ -779,11 +785,11 @@ Index: linux-2.6.0/fs/ext3/xattr.c return error; } -Index: linux-2.6.0/fs/ext3/xattr.h +Index: linux-2.6.5-7.311/fs/ext3/xattr.h =================================================================== ---- linux-2.6.0.orig/fs/ext3/xattr.h 2003-06-24 18:04:43.000000000 +0400 -+++ linux-2.6.0/fs/ext3/xattr.h 2004-01-14 18:54:12.000000000 +0300 -@@ -77,7 +77,8 @@ +--- linux-2.6.5-7.311.orig/fs/ext3/xattr.h ++++ linux-2.6.5-7.311/fs/ext3/xattr.h +@@ -77,7 +77,8 @@ extern int ext3_removexattr(struct dentr extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext3_xattr_list(struct inode *, char *, size_t); extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); @@ -793,11 +799,11 @@ Index: linux-2.6.0/fs/ext3/xattr.h extern void ext3_xattr_delete_inode(handle_t *, struct inode *); extern void ext3_xattr_put_super(struct super_block *); -Index: linux-2.6.0/include/linux/ext3_fs.h +Index: linux-2.6.5-7.311/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.0.orig/include/linux/ext3_fs.h 2004-01-14 18:54:11.000000000 +0300 -+++ linux-2.6.0/include/linux/ext3_fs.h 2004-01-14 18:54:12.000000000 +0300 -@@ -265,6 +265,8 @@ +--- linux-2.6.5-7.311.orig/include/linux/ext3_fs.h ++++ linux-2.6.5-7.311/include/linux/ext3_fs.h +@@ -267,6 +267,8 @@ struct ext3_inode { __u32 m_i_reserved2[2]; } masix2; } osd2; /* OS dependent 2 */ @@ -806,7 +812,7 @@ Index: linux-2.6.0/include/linux/ext3_fs.h }; #define i_size_high i_dir_acl -@@ -721,6 +723,7 @@ +@@ -729,6 +731,7 @@ extern unsigned long ext3_count_free (st extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); extern struct 
buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); @@ -814,11 +820,11 @@ Index: linux-2.6.0/include/linux/ext3_fs.h extern void ext3_read_inode (struct inode *); extern void ext3_write_inode (struct inode *, int); -Index: linux-2.6.0/include/linux/ext3_fs_i.h +Index: linux-2.6.5-7.311/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.6.0.orig/include/linux/ext3_fs_i.h 2003-12-30 08:32:44.000000000 +0300 -+++ linux-2.6.0/include/linux/ext3_fs_i.h 2004-01-14 18:54:12.000000000 +0300 -@@ -96,6 +96,9 @@ +--- linux-2.6.5-7.311.orig/include/linux/ext3_fs_i.h ++++ linux-2.6.5-7.311/include/linux/ext3_fs_i.h +@@ -113,6 +113,9 @@ struct ext3_inode_info { */ loff_t i_disksize; @@ -828,13 +834,3 @@ Index: linux-2.6.0/include/linux/ext3_fs_i.h /* * truncate_sem is for serialising ext3_truncate() against * ext3_getblock(). In the 2.4 ext2 design, great chunks of inode's - -%diffstat - fs/ext3/ialloc.c | 5 - fs/ext3/inode.c | 10 - fs/ext3/xattr.c | 634 +++++++++++++++++++++++++++++++++++++++++++--- - fs/ext3/xattr.h | 3 - include/linux/ext3_fs.h | 2 - include/linux/ext3_fs_i.h | 3 - 6 files changed, 623 insertions(+), 34 deletions(-) - diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch index 1aac380f3786224d2e30c3e53d249e2b479f175a..1592a7e02e9a68aec79c3a5d883a144dd0a8e597 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.18-vanilla.patch @@ -16,7 +16,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.18.8/fs/ext3/extents.c 2007-07-17 11:08:59.000000000 +0200 -@@ -0,0 +1,2272 @@ +@@ -0,0 +1,2276 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas 
<alex@clusterfs.com> @@ -265,6 +265,10 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + error_msg = "invalid eh_entries"; + goto corrupted; + } ++ if (unlikely((eh->eh_entries == 0) && (eh->eh_depth != 0))) { ++ error_msg = "invalid index, eh_entries=0 && eh_depth != 0"; ++ goto corrupted; ++ } + return 0; + +corrupted: @@ -1161,7 +1165,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + * returns first allocated block from next leaf or EXT_MAX_BLOCK + */ +static unsigned ext3_ext_next_leaf_block(struct inode *inode, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ + int depth; + @@ -1244,8 +1248,8 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + struct ext3_extent *ex2) +{ + /* FIXME: 48bit support */ -+ if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) -+ != le32_to_cpu(ex2->ee_block)) ++ if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) != ++ le32_to_cpu(ex2->ee_block)) + return 0; + +#ifdef AGRESSIVE_TEST @@ -1253,8 +1257,8 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + return 0; +#endif + -+ if (le32_to_cpu(ex1->ee_start) + le16_to_cpu(ex1->ee_len) -+ == le32_to_cpu(ex2->ee_start)) ++ if (le32_to_cpu(ex1->ee_start) + le16_to_cpu(ex1->ee_len) == ++ le32_to_cpu(ex2->ee_start)) + return 1; + return 0; +} @@ -2537,8 +2541,8 @@ Index: linux-2.6.18.8/include/linux/ext3_extents.h +#ifdef EXT_DEBUG +#define ext_debug(inode,fmt,a...) \ +do { \ -+ if (test_opt(inode->i_sb, EXTDEBUG)) \ -+ printk(fmt, ##a); \ ++ if (test_opt(inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ +} while (0); +#else +#define ext_debug(inode,fmt,a...) 
@@ -2650,8 +2654,8 @@ Index: linux-2.6.18.8/include/linux/ext3_extents.h + ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext3_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ -+ (le16_to_cpu((__path__)->p_hdr->eh_entries) \ -+ < le16_to_cpu((__path__)->p_hdr->eh_max)) ++ (le16_to_cpu((__path__)->p_hdr->eh_entries) < \ ++ le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.22-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.22-vanilla.patch index 956fc07f6d36c13dbc0b3b52955a4ee7f88cc113..f4db7ca788cc1acdf59308866f1a66ffa9880d70 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-2.6.22-vanilla.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-2.6.22-vanilla.patch @@ -16,7 +16,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.18.8/fs/ext3/extents.c 2007-07-17 11:08:59.000000000 +0200 -@@ -0,0 +1,2272 @@ +@@ -0,0 +1,2276 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> @@ -265,6 +265,10 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + error_msg = "invalid eh_entries"; + goto corrupted; + } ++ if (unlikely((eh->eh_entries == 0) && (eh->eh_depth != 0))) { ++ error_msg = "invalid index, eh_entries=0 && eh_depth != 0"; ++ goto corrupted; ++ } + return 0; + +corrupted: @@ -1161,7 +1165,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + * returns first allocated block from next leaf or EXT_MAX_BLOCK + */ +static unsigned ext3_ext_next_leaf_block(struct inode *inode, -+ struct ext3_ext_path *path) ++ struct ext3_ext_path *path) +{ + int depth; + @@ -1244,8 +1248,8 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + struct ext3_extent *ex2) 
+{ + /* FIXME: 48bit support */ -+ if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) -+ != le32_to_cpu(ex2->ee_block)) ++ if (le32_to_cpu(ex1->ee_block) + le16_to_cpu(ex1->ee_len) != ++ le32_to_cpu(ex2->ee_block)) + return 0; + +#ifdef AGRESSIVE_TEST @@ -1253,8 +1257,8 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + return 0; +#endif + -+ if (le32_to_cpu(ex1->ee_start) + le16_to_cpu(ex1->ee_len) -+ == le32_to_cpu(ex2->ee_start)) ++ if (le32_to_cpu(ex1->ee_start) + le16_to_cpu(ex1->ee_len) == ++ le32_to_cpu(ex2->ee_start)) + return 1; + return 0; +} @@ -2538,8 +2542,8 @@ Index: linux-2.6.18.8/include/linux/ext3_extents.h +#ifdef EXT_DEBUG +#define ext_debug(inode,fmt,a...) \ +do { \ -+ if (test_opt(inode->i_sb, EXTDEBUG)) \ -+ printk(fmt, ##a); \ ++ if (test_opt(inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ +} while (0); +#else +#define ext_debug(inode,fmt,a...) @@ -2651,8 +2655,8 @@ Index: linux-2.6.18.8/include/linux/ext3_extents.h + ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext3_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ -+ (le16_to_cpu((__path__)->p_hdr->eh_entries) \ -+ < le16_to_cpu((__path__)->p_hdr->eh_max)) ++ (le16_to_cpu((__path__)->p_hdr->eh_entries) < \ ++ le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ diff --git a/ldiskfs/kernel_patches/patches/ext3-extents-sanity-checks.patch b/ldiskfs/kernel_patches/patches/ext3-extents-sanity-checks.patch index eab19b4e5ea94f9a2a69d53a56004fb3d72c61ed..5c954eb113bceb1c7edb45f1dc20b223fc0983ac 100644 --- a/ldiskfs/kernel_patches/patches/ext3-extents-sanity-checks.patch +++ b/ldiskfs/kernel_patches/patches/ext3-extents-sanity-checks.patch @@ -1,8 +1,8 @@ -Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c +Index: linux-2.6.16.54-0.2.5/fs/ext3/extents.c =================================================================== ---- 
linux-2.6.9-42.0.10.EL_lustre.1.4.10.orig/fs/ext3/extents.c 2007-07-17 22:14:08.000000000 +0200 -+++ linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c 2007-07-17 22:40:57.000000000 +0200 -@@ -44,26 +44,56 @@ +--- linux-2.6.16.54-0.2.5.orig/fs/ext3/extents.c ++++ linux-2.6.16.54-0.2.5/fs/ext3/extents.c +@@ -44,26 +44,60 @@ #include <asm/uaccess.h> @@ -51,6 +51,10 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c + } + if (unlikely(eh->eh_entries > eh->eh_max)) { + error_msg = "invalid eh_entries"; ++ goto corrupted; ++ } ++ if (unlikely((eh->eh_entries == 0) && (eh->eh_depth != 0))) { ++ error_msg = "invalid index, eh_entries=0 && eh_depth != 0"; + goto corrupted; } return 0; @@ -73,7 +77,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) { int err; -@@ -227,6 +257,26 @@ static inline int ext3_ext_space_root_id +@@ -226,6 +260,26 @@ static inline int ext3_ext_space_root_id return size; } @@ -100,7 +104,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c static void ext3_ext_show_path(struct ext3_extents_tree *tree, struct ext3_ext_path *path) { -@@ -297,10 +347,6 @@ ext3_ext_binsearch_idx(struct ext3_exten +@@ -296,10 +350,6 @@ ext3_ext_binsearch_idx(struct ext3_exten struct ext3_extent_idx *ix; int l = 0, k, r; @@ -111,7 +115,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c ext_debug(tree, "binsearch for %d(idx): ", block); path->p_idx = ix = EXT_FIRST_INDEX(eh); -@@ -360,9 +406,6 @@ ext3_ext_binsearch(struct ext3_extents_t +@@ -359,9 +409,6 @@ ext3_ext_binsearch(struct ext3_extents_t struct ext3_extent *ex; int l = 0, k, r; @@ -121,7 +125,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c if (eh->eh_entries == 0) { /* * this leaf is empty yet: -@@ -437,6 +480,7 @@ ext3_ext_find_extent(struct ext3_extents +@@ -436,6 +483,7 @@ ext3_ext_find_extent(struct ext3_extents struct ext3_extent_header *eh; struct 
buffer_head *bh; int depth, i, ppos = 0; @@ -129,7 +133,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c EXT_ASSERT(tree); EXT_ASSERT(tree->inode); -@@ -444,17 +488,15 @@ ext3_ext_find_extent(struct ext3_extents +@@ -443,17 +491,15 @@ ext3_ext_find_extent(struct ext3_extents eh = EXT_ROOT_HDR(tree); EXT_ASSERT(eh); @@ -150,7 +154,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c /* account possible depth increase */ if (!path) { path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), -@@ -485,7 +527,8 @@ ext3_ext_find_extent(struct ext3_extents +@@ -484,7 +530,8 @@ ext3_ext_find_extent(struct ext3_extents path[ppos].p_hdr = eh; i--; @@ -160,7 +164,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c goto err; } -@@ -494,9 +537,6 @@ ext3_ext_find_extent(struct ext3_extents +@@ -493,9 +540,6 @@ ext3_ext_find_extent(struct ext3_extents path[ppos].p_ext = NULL; path[ppos].p_idx = NULL; @@ -170,7 +174,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c /* find extent */ ext3_ext_binsearch(tree, path + ppos, block); -@@ -993,7 +1033,7 @@ ext3_ext_search_right(struct ext3_extent +@@ -992,7 +1036,7 @@ ext3_ext_search_right(struct ext3_extent struct ext3_extent_idx *ix; struct ext3_extent *ex; unsigned long block; @@ -179,7 +183,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c BUG_ON(path == NULL); depth = path->p_depth; -@@ -1051,7 +1091,9 @@ ext3_ext_search_right(struct ext3_extent +@@ -1050,7 +1094,9 @@ ext3_ext_search_right(struct ext3_extent if (bh == NULL) return -EIO; eh = EXT_BLOCK_HDR(bh); @@ -190,7 +194,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c brelse(bh); return -EIO; } -@@ -1064,7 +1106,8 @@ ext3_ext_search_right(struct ext3_extent +@@ -1063,7 +1109,8 @@ ext3_ext_search_right(struct ext3_extent if (bh == NULL) return -EIO; eh = EXT_BLOCK_HDR(bh); @@ -200,7 +204,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c brelse(bh); return -EIO; } 
-@@ -1694,6 +1737,8 @@ ext3_ext_rm_leaf(handle_t *handle, struc +@@ -1693,6 +1740,8 @@ ext3_ext_rm_leaf(handle_t *handle, struc ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); @@ -209,7 +213,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c eh = path[depth].p_hdr; EXT_ASSERT(eh); EXT_ASSERT(eh->eh_entries <= eh->eh_max); -@@ -1856,7 +1901,7 @@ int ext3_ext_remove_space(struct ext3_ex +@@ -1855,7 +1904,7 @@ int ext3_ext_remove_space(struct ext3_ex int depth = EXT_DEPTH(tree); struct ext3_ext_path *path; handle_t *handle; @@ -218,7 +222,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); -@@ -1879,7 +1924,13 @@ int ext3_ext_remove_space(struct ext3_ex +@@ -1878,7 +1927,13 @@ int ext3_ext_remove_space(struct ext3_ex } memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); path[i].p_hdr = EXT_ROOT_HDR(tree); @@ -233,7 +237,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c while (i >= 0 && err == 0) { if (i == depth) { /* this is leaf block */ -@@ -1889,16 +1940,13 @@ int ext3_ext_remove_space(struct ext3_ex +@@ -1888,16 +1943,13 @@ int ext3_ext_remove_space(struct ext3_ex i--; continue; } @@ -251,7 +255,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c if (!path[i].p_idx) { /* this level hasn't touched yet */ path[i].p_idx = -@@ -1925,6 +1973,14 @@ int ext3_ext_remove_space(struct ext3_ex +@@ -1924,6 +1976,14 @@ int ext3_ext_remove_space(struct ext3_ex err = -EIO; break; } @@ -266,7 +270,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c /* put actual number of indexes to know is this * number got changed at the next iteration */ path[i].p_block = path[i].p_hdr->eh_entries; -@@ -1945,7 +2001,7 @@ int ext3_ext_remove_space(struct ext3_ex +@@ -1944,7 +2004,7 @@ int ext3_ext_remove_space(struct ext3_ex } /* TODO: flexible tree reduction 
should be here */ @@ -275,7 +279,7 @@ Index: linux-2.6.9-42.0.10.EL_lustre.1.4.10/fs/ext3/extents.c /* * truncate to zero freed all the tree * so, we need to correct eh_depth -@@ -1959,6 +2015,7 @@ int ext3_ext_remove_space(struct ext3_ex +@@ -1958,6 +2018,7 @@ int ext3_ext_remove_space(struct ext3_ex } ext3_ext_tree_changed(tree); diff --git a/ldiskfs/kernel_patches/patches/ext3-inode-version-2.6-sles10.patch b/ldiskfs/kernel_patches/patches/ext3-inode-version-2.6-sles10.patch index d29c136750431c06dff3731f951b78d776fea2b4..63e6ad277bcd2e45f13a312c4d57cfeb94476d2b 100644 --- a/ldiskfs/kernel_patches/patches/ext3-inode-version-2.6-sles10.patch +++ b/ldiskfs/kernel_patches/patches/ext3-inode-version-2.6-sles10.patch @@ -454,3 +454,15 @@ Index: linux-2.6.16-sles10/fs/ext3/xattr.h extern int init_ext3_xattr(void); extern void exit_ext3_xattr(void); +Index: linux-2.6.16-sles10/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.16-sles10.orig/fs/ext3/ialloc.c ++++ linux-2.6.16-sles10/fs/ext3/ialloc.c +@@ -750,6 +750,7 @@ got: + ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; ++ ei->i_fs_version = 0; + + ext3_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) diff --git a/ldiskfs/kernel_patches/patches/ext3-inode-version-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-inode-version-2.6.18-vanilla.patch index 249a1e06fcac9864cd96ab54cd01bb717a44f590..c7f3c44658d299e10502dde8e9977ac6a4b98dad 100644 --- a/ldiskfs/kernel_patches/patches/ext3-inode-version-2.6.18-vanilla.patch +++ b/ldiskfs/kernel_patches/patches/ext3-inode-version-2.6.18-vanilla.patch @@ -452,3 +452,15 @@ Index: linux-2.6.18/fs/ext3/xattr.h extern int init_ext3_xattr(void); extern void exit_ext3_xattr(void); +Index: linux-2.6.18/fs/ext3/ialloc.c +=================================================================== +--- linux-2.6.18.orig/fs/ext3/ialloc.c ++++ linux-2.6.18/fs/ext3/ialloc.c +@@ -751,6 +751,7 @@ got: + 
ei->i_dtime = 0; + ei->i_block_alloc_info = NULL; + ei->i_block_group = group; ++ ei->i_fs_version = 0; + + ext3_set_inode_flags(inode); + if (IS_DIRSYNC(inode)) diff --git a/ldiskfs/kernel_patches/patches/ext3-max-dir-size-2.6.5-suse.patch b/ldiskfs/kernel_patches/patches/ext3-max-dir-size-2.6.5-suse.patch new file mode 100644 index 0000000000000000000000000000000000000000..39f5b276e5b7d0ea998a6d3968960ae2727d6e38 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-max-dir-size-2.6.5-suse.patch @@ -0,0 +1,144 @@ +Index: linux-stage/fs/ext3/ialloc.c +=================================================================== +--- linux-stage.orig/fs/ext3/ialloc.c ++++ linux-stage/fs/ext3/ialloc.c +@@ -520,12 +520,15 @@ struct inode *ext3_new_inode(handle_t *h + return ERR_PTR(-EPERM); + + sb = dir->i_sb; ++ sbi = EXT3_SB(sb); ++ if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size) ++ return ERR_PTR(-EFBIG); ++ + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT3_I(inode); + +- sbi = EXT3_SB(sb); + es = sbi->s_es; + if (goal) { + group = (goal - 1) / EXT3_INODES_PER_GROUP(sb); +Index: linux-stage/fs/ext3/super.c +=================================================================== +--- linux-stage.orig/fs/ext3/super.c ++++ linux-stage/fs/ext3/super.c +@@ -37,6 +37,12 @@ + #include "acl.h" + #include "group.h" + ++/* ++ * max directory size tunable ++ */ ++#define EXT3_DEFAULT_MAX_DIR_SIZE 0 ++#define EXT3_MAX_DIR_SIZE_NAME "max_dir_size" ++ + static int ext3_load_journal(struct super_block *, struct ext3_super_block *, + unsigned long journal_devnum); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, +@@ -431,6 +437,7 @@ void ext3_put_super (struct super_block + invalidate_bdev(sbi->journal_bdev, 0); + ext3_blkdev_remove(sbi); + } ++ remove_proc_entry(EXT3_MAX_DIR_SIZE_NAME, sbi->s_dev_proc); + remove_proc_entry(sb->s_id, proc_root_ext3); + sbi->s_dev_proc = NULL; + sb->s_fs_info = NULL; +@@ 
-1251,6 +1258,45 @@ static unsigned long descriptor_loc(stru + return (first_data_block + has_super + (bg * sbi->s_blocks_per_group)); + } + ++static int ext3_max_dir_size_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ struct ext3_sb_info *sbi = data; ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%lu\n", sbi->s_max_dir_size); ++ *start = page; ++ return len; ++} ++ ++static int ext3_max_dir_size_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ struct ext3_sb_info *sbi = data; ++ char str[32] = { 0 }; /* zero-filled: copied text stays NUL-terminated */ ++ long value; /* signed, so the negative check below can fire */ ++ char *end; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MAX_DIR_SIZE_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ value = simple_strtol(str, &end, 0); ++ if (value < 0) ++ return -ERANGE; ++ ++ sbi->s_max_dir_size = value; ++ return count; ++} + + static int ext3_fill_super (struct super_block *sb, void *data, int silent) + { +@@ -1270,6 +1316,7 @@ static int ext3_fill_super (struct super + int db_count; + int i; + int needs_recovery; ++ struct proc_dir_entry *proc; + + sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) +@@ -1287,6 +1334,23 @@ static int ext3_fill_super (struct super + return -ENOMEM; + } + ++ sbi->s_max_dir_size = EXT3_DEFAULT_MAX_DIR_SIZE; ++ proc = create_proc_entry(EXT3_MAX_DIR_SIZE_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, sbi->s_dev_proc); ++ if (proc == NULL) { ++ printk(KERN_ERR "EXT3-fs: unable to create %s\n", ++ EXT3_MAX_DIR_SIZE_NAME); ++ remove_proc_entry(EXT3_MAX_DIR_SIZE_NAME, sbi->s_dev_proc); ++ remove_proc_entry(sb->s_id, proc_root_ext3); ++ sbi->s_dev_proc = NULL; ++ sb->s_fs_info = NULL; ++ kfree(sbi); ++ return -ENOMEM; ++ } ++ proc->data = sbi; ++ proc->read_proc = ext3_max_dir_size_read; ++ proc->write_proc = ext3_max_dir_size_write; ++ + blocksize =
sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "EXT3-fs: unable to set blocksize\n"); +@@ -1701,6 +1765,7 @@ failed_mount: + ext3_blkdev_remove(sbi); + brelse(bh); + out_fail: ++ remove_proc_entry(EXT3_MAX_DIR_SIZE_NAME, sbi->s_dev_proc); + remove_proc_entry(sb->s_id, proc_root_ext3); + sbi->s_dev_proc = NULL; + sb->s_fs_info = NULL; +Index: linux-stage/include/linux/ext3_fs_sb.h +=================================================================== +--- linux-stage.orig/include/linux/ext3_fs_sb.h ++++ linux-stage/include/linux/ext3_fs_sb.h +@@ -111,6 +111,7 @@ struct ext3_sb_info { + unsigned long s_mb_max_groups_to_scan; + unsigned long s_mb_stats; + unsigned long s_mb_order2_reqs; ++ unsigned long s_max_dir_size; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; diff --git a/ldiskfs/kernel_patches/patches/ext3-max-dir-size.patch b/ldiskfs/kernel_patches/patches/ext3-max-dir-size.patch new file mode 100644 index 0000000000000000000000000000000000000000..104b5b2723784f6131035ae8b0edd9bde3023331 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-max-dir-size.patch @@ -0,0 +1,141 @@ +diff -pur linux-stage.orig/fs/ext3/ialloc.c linux-stage/fs/ext3/ialloc.c +--- linux-stage.orig/fs/ext3/ialloc.c 2008-06-01 16:18:53.000000000 +0800 ++++ linux-stage/fs/ext3/ialloc.c 2008-06-03 02:21:02.000000000 +0800 +@@ -519,12 +519,15 @@ struct inode *ext3_new_inode(handle_t *h + return ERR_PTR(-EPERM); + + sb = dir->i_sb; ++ sbi = EXT3_SB(sb); ++ if (sbi->s_max_dir_size > 0 && i_size_read(dir) >= sbi->s_max_dir_size) ++ return ERR_PTR(-EFBIG); ++ + inode = new_inode(sb); + if (!inode) + return ERR_PTR(-ENOMEM); + ei = EXT3_I(inode); + +- sbi = EXT3_SB(sb); + es = sbi->s_es; + if (goal) { + group = (goal - 1) / EXT3_INODES_PER_GROUP(sb); +diff -pur linux-stage.orig/fs/ext3/super.c linux-stage/fs/ext3/super.c +--- linux-stage.orig/fs/ext3/super.c 2008-06-03 01:53:34.000000000 +0800 ++++ linux-stage/fs/ext3/super.c 2008-06-03 
19:39:19.000000000 +0800 +@@ -42,6 +42,12 @@ + #include "acl.h" + #include "group.h" + ++/* ++ * max directory size tunable ++ */ ++#define EXT3_DEFAULT_MAX_DIR_SIZE 0 ++#define EXT3_MAX_DIR_SIZE_NAME "max_dir_size" ++ + static int ext3_load_journal(struct super_block *, struct ext3_super_block *, + unsigned long journal_devnum); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, +@@ -446,6 +452,7 @@ void ext3_put_super (struct super_block + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + ++ remove_proc_entry(EXT3_MAX_DIR_SIZE_NAME, sbi->s_dev_proc); + remove_proc_entry(sb->s_id, proc_root_ext3); + sbi->s_dev_proc = NULL; + sb->s_fs_info = NULL; +@@ -1765,6 +1772,45 @@ failed: + return 1; + } + ++static int ext3_max_dir_size_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ struct ext3_sb_info *sbi = data; ++ int len; ++ ++ *eof = 1; ++ if (off != 0) ++ return 0; ++ ++ len = sprintf(page, "%lu\n", sbi->s_max_dir_size); ++ *start = page; ++ return len; ++} ++ ++static int ext3_max_dir_size_write(struct file *file, const char *buffer, ++ unsigned long count, void *data) ++{ ++ struct ext3_sb_info *sbi = data; ++ char str[32] = { 0 }; /* zero-filled: copied text stays NUL-terminated */ ++ long value; /* signed, so the negative check below can fire */ ++ char *end; ++ ++ if (count >= sizeof(str)) { ++ printk(KERN_ERR "EXT3-fs: %s string too long, max %u bytes\n", ++ EXT3_MAX_DIR_SIZE_NAME, (int)sizeof(str)); ++ return -EOVERFLOW; ++ } ++ ++ if (copy_from_user(str, buffer, count)) ++ return -EFAULT; ++ ++ value = simple_strtol(str, &end, 0); ++ if (value < 0) ++ return -ERANGE; ++ ++ sbi->s_max_dir_size = value; ++ return count; ++} + + static int ext3_fill_super (struct super_block *sb, void *data, int silent) + { +@@ -1785,6 +1831,7 @@ static int ext3_fill_super (struct super + int i; + int needs_recovery; + __le32 features; ++ struct proc_dir_entry *proc; + + sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) +@@ -1802,6 +1849,23 @@ static int ext3_fill_super (struct super + return -ENOMEM; + }
+ ++ sbi->s_max_dir_size = EXT3_DEFAULT_MAX_DIR_SIZE; ++ proc = create_proc_entry(EXT3_MAX_DIR_SIZE_NAME, ++ S_IFREG | S_IRUGO | S_IWUSR, sbi->s_dev_proc); ++ if (proc == NULL) { ++ printk(KERN_ERR "EXT3-fs: unable to create %s\n", ++ EXT3_MAX_DIR_SIZE_NAME); ++ remove_proc_entry(EXT3_MAX_DIR_SIZE_NAME, sbi->s_dev_proc); ++ remove_proc_entry(sb->s_id, proc_root_ext3); ++ sbi->s_dev_proc = NULL; ++ sb->s_fs_info = NULL; ++ kfree(sbi); ++ return -ENOMEM; ++ } ++ proc->data = sbi; ++ proc->read_proc = ext3_max_dir_size_read; ++ proc->write_proc = ext3_max_dir_size_write; ++ + blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); + if (!blocksize) { + printk(KERN_ERR "EXT3-fs: unable to set blocksize\n"); +@@ -2224,6 +2288,7 @@ failed_mount: + ext3_blkdev_remove(sbi); + brelse(bh); + out_fail: ++ remove_proc_entry(EXT3_MAX_DIR_SIZE_NAME, sbi->s_dev_proc); + remove_proc_entry(sb->s_id, proc_root_ext3); + sbi->s_dev_proc = NULL; + sb->s_fs_info = NULL; +diff -pur linux-stage.orig/include/linux/ext3_fs_sb.h linux-stage/include/linux/ext3_fs_sb.h +--- linux-stage.orig/include/linux/ext3_fs_sb.h 2008-06-01 16:18:54.000000000 +0800 ++++ linux-stage/include/linux/ext3_fs_sb.h 2008-06-03 02:21:02.000000000 +0800 +@@ -114,6 +114,7 @@ struct ext3_sb_info { + unsigned long s_mb_max_groups_to_scan; + unsigned long s_mb_stats; + unsigned long s_mb_order2_reqs; ++ unsigned long s_max_dir_size; + + /* history to debug policy */ + struct ext3_mb_history *s_mb_history; diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc3-2.6.18.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc3-2.6.18.patch index 805f2f3ad5adc4c8387d87304c6418938d231772..bc9313679a90dd2f8670adbc5a73fd76eba17588 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc3-2.6.18.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc3-2.6.18.patch @@ -1,7 +1,7 @@ Index: linux-2.6.18.8/include/linux/ext3_fs_i.h =================================================================== ---- 
linux-2.6.18.8.orig/include/linux/ext3_fs_i.h 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/include/linux/ext3_fs_i.h 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.18.8.orig/include/linux/ext3_fs_i.h ++++ linux-2.6.18.8/include/linux/ext3_fs_i.h @@ -154,6 +154,10 @@ struct ext3_inode_info { struct inode vfs_inode; @@ -15,8 +15,8 @@ Index: linux-2.6.18.8/include/linux/ext3_fs_i.h #endif /* _LINUX_EXT3_FS_I */ Index: linux-2.6.18.8/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.18.8.orig/include/linux/ext3_fs_sb.h 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/include/linux/ext3_fs_sb.h 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.18.8.orig/include/linux/ext3_fs_sb.h ++++ linux-2.6.18.8/include/linux/ext3_fs_sb.h @@ -21,8 +21,15 @@ #include <linux/wait.h> #include <linux/blockgroup_lock.h> @@ -35,8 +35,8 @@ Index: linux-2.6.18.8/include/linux/ext3_fs_sb.h * third extended-fs super-block data in memory Index: linux-2.6.18.8/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.18.8.orig/include/linux/ext3_fs.h 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/include/linux/ext3_fs.h 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.18.8.orig/include/linux/ext3_fs.h ++++ linux-2.6.18.8/include/linux/ext3_fs.h @@ -17,6 +17,7 @@ #define _LINUX_EXT3_FS_H @@ -101,8 +101,8 @@ Index: linux-2.6.18.8/include/linux/ext3_fs.h extern void ext3_mb_release_blocks(struct super_block *, int); extern void ext3_mb_release_blocks(struct super_block *, int); extern void ext3_mb_discard_inode_preallocations(struct inode *); - extern int __init init_ext3_proc(void); - extern void exit_ext3_proc(void); + extern int __init init_ext3_mb_proc(void); + extern void exit_ext3_mb_proc(void); -extern void ext3_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, int *); +extern void ext3_mb_free_blocks(handle_t *, struct 
inode *, unsigned long, + unsigned long, int, unsigned long *); @@ -138,8 +138,8 @@ Index: linux-2.6.18.8/include/linux/ext3_fs.h int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, Index: linux-2.6.18.8/fs/ext3/super.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/super.c 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/super.c 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.18.8.orig/fs/ext3/super.c ++++ linux-2.6.18.8/fs/ext3/super.c @@ -688,6 +688,7 @@ enum { Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_grpquota, @@ -188,9 +188,9 @@ Index: linux-2.6.18.8/fs/ext3/super.c return 0; Index: linux-2.6.18.8/fs/ext3/extents.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/extents.c 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/extents.c 2007-07-17 09:18:53.000000000 +0200 -@@ -795,7 +795,7 @@ cleanup: +--- linux-2.6.18.8.orig/fs/ext3/extents.c ++++ linux-2.6.18.8/fs/ext3/extents.c +@@ -801,7 +801,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -199,7 +199,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c } } kfree(ablocks); -@@ -1613,7 +1613,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1619,7 +1619,7 @@ int ext3_ext_rm_idx(handle_t *handle, st ext_debug(inode, "index is empty, remove it, free block %lu\n", leaf); bh = sb_find_get_block(inode->i_sb, leaf); ext3_forget(handle, 1, inode, bh, leaf); @@ -208,7 +208,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c return err; } -@@ -1672,7 +1672,7 @@ static int ext3_remove_blocks(handle_t * +@@ -1678,7 +1678,7 @@ static int ext3_remove_blocks(handle_t * unsigned long from, unsigned long to) { struct buffer_head *bh; @@ -217,7 +217,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c #ifdef EXTENTS_STATS { -@@ -1690,6 +1690,8 @@ static int ext3_remove_blocks(handle_t * +@@ -1696,6 +1696,8 @@ static int ext3_remove_blocks(handle_t * 
spin_unlock(&sbi->s_ext_stats_lock); } #endif @@ -226,7 +226,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c if (from >= le32_to_cpu(ex->ee_block) && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { /* tail removal */ -@@ -1701,7 +1703,7 @@ static int ext3_remove_blocks(handle_t * +@@ -1707,7 +1709,7 @@ static int ext3_remove_blocks(handle_t * bh = sb_find_get_block(inode->i_sb, start + i); ext3_forget(handle, 0, inode, bh, start + i); } @@ -235,7 +235,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", -@@ -2034,7 +2036,7 @@ int ext3_ext_get_blocks(handle_t *handle +@@ -2040,7 +2042,7 @@ int ext3_ext_get_blocks(handle_t *handle struct ext3_extent newex, *ex; int goal, newblock, err = 0, depth; unsigned long allocated = 0; @@ -244,7 +244,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c __clear_bit(BH_New, &bh_result->b_state); ext_debug(inode, "blocks %d/%lu requested for inode %u\n", (int) iblock, -@@ -2116,18 +2118,33 @@ int ext3_ext_get_blocks(handle_t *handle +@@ -2122,18 +2124,36 @@ int ext3_ext_get_blocks(handle_t *handle if (S_ISREG(inode->i_mode) && (!EXT3_I(inode)->i_block_alloc_info)) ext3_init_block_alloc_info(inode); @@ -279,12 +279,15 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + ar.goal = ext3_ext_find_goal(inode, path, iblock); + ar.logical = iblock; + ar.len = allocated; -+ ar.flags = EXT3_MB_HINT_DATA; ++ if (S_ISREG(inode->i_mode)) ++ ar.flags = EXT3_MB_HINT_DATA; ++ else ++ ar.flags = 0; + newblock = ext3_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; ext_debug(inode, "allocate new block: goal %d, found %d/%lu\n", -@@ -2137,12 +2154,16 @@ int ext3_ext_get_blocks(handle_t *handle +@@ -2143,12 +2163,16 @@ int ext3_ext_get_blocks(handle_t *handle newex.ee_block = cpu_to_le32(iblock); newex.ee_start = cpu_to_le32(newblock); newex.ee_start_hi = 0; @@ -304,7 +307,7 @@ 
Index: linux-2.6.18.8/fs/ext3/extents.c goto out2; } -@@ -2151,6 +2172,7 @@ int ext3_ext_get_blocks(handle_t *handle +@@ -2157,6 +2181,7 @@ int ext3_ext_get_blocks(handle_t *handle /* previous routine could use block we allocated */ newblock = le32_to_cpu(newex.ee_start); @@ -312,7 +315,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c __set_bit(BH_New, &bh_result->b_state); ext3_ext_put_in_cache(inode, iblock, allocated, newblock, -@@ -2202,6 +2224,9 @@ void ext3_ext_truncate(struct inode * in +@@ -2208,6 +2233,9 @@ void ext3_ext_truncate(struct inode * in mutex_lock(&EXT3_I(inode)->truncate_mutex); ext3_ext_invalidate_cache(inode); @@ -324,8 +327,8 @@ Index: linux-2.6.18.8/fs/ext3/extents.c * probably we need not scaning at all, Index: linux-2.6.18.8/fs/ext3/Makefile =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/Makefile 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/Makefile 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.18.8.orig/fs/ext3/Makefile ++++ linux-2.6.18.8/fs/ext3/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o @@ -337,8 +340,8 @@ Index: linux-2.6.18.8/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o Index: linux-2.6.18.8/fs/ext3/xattr.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/xattr.c 2007-02-24 00:52:30.000000000 +0100 -+++ linux-2.6.18.8/fs/ext3/xattr.c 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.18.8.orig/fs/ext3/xattr.c ++++ linux-2.6.18.8/fs/ext3/xattr.c @@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl ea_bdebug(bh, "refcount now=0; freeing"); if (ce) @@ -359,8 +362,8 @@ Index: linux-2.6.18.8/fs/ext3/xattr.c } Index: linux-2.6.18.8/fs/ext3/balloc.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/balloc.c 2007-02-24 00:52:30.000000000 +0100 -+++ linux-2.6.18.8/fs/ext3/balloc.c 2007-07-17 09:18:53.000000000 +0200 +--- 
linux-2.6.18.8.orig/fs/ext3/balloc.c ++++ linux-2.6.18.8/fs/ext3/balloc.c @@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. @@ -436,8 +439,8 @@ Index: linux-2.6.18.8/fs/ext3/balloc.c unsigned long count = 1; Index: linux-2.6.18.8/fs/ext3/inode.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/inode.c 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/inode.c 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.18.8.orig/fs/ext3/inode.c ++++ linux-2.6.18.8/fs/ext3/inode.c @@ -560,7 +560,7 @@ static int ext3_alloc_blocks(handle_t *h return ret; failed_out: @@ -492,8 +495,8 @@ Index: linux-2.6.18.8/fs/ext3/inode.c /* Index: linux-2.6.18.8/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/mballoc.c 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/mballoc.c 2007-07-17 09:23:56.000000000 +0200 +--- linux-2.6.18.8.orig/fs/ext3/mballoc.c ++++ linux-2.6.18.8/fs/ext3/mballoc.c @@ -350,8 +350,8 @@ struct ext3_prealloc_space { spinlock_t pa_lock; atomic_t pa_count; @@ -541,7 +544,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c + fex->fe_start + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); return block; -@@ -3174,7 +3174,7 @@ void ext3_mb_collect_stats(struct ext3_a +@@ -3202,7 +3202,7 @@ void ext3_mb_collect_stats(struct ext3_a void ext3_mb_use_inode_pa(struct ext3_allocation_context *ac, struct ext3_prealloc_space *pa) { @@ -550,7 +553,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); -@@ -4027,13 +4027,13 @@ int ext3_mb_discard_preallocations(struc +@@ -4053,13 +4053,13 @@ int ext3_mb_discard_preallocations(struc * it tries to use preallocation first, then falls back * to usual allocation */ @@ -566,7 +569,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c int 
freed, inquota; sb = ar->inode->i_sb; -@@ -4044,8 +4044,8 @@ unsigned long ext3_mb_new_blocks(handle_ +@@ -4070,8 +4070,8 @@ unsigned long ext3_mb_new_blocks(handle_ if (ext3_mballoc_warning++ == 0) printk(KERN_ERR "EXT3-fs: multiblock request with " "mballoc disabled!\n"); @@ -577,7 +580,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c return block; } -@@ -4109,11 +4109,11 @@ out: +@@ -4135,11 +4135,11 @@ out: } EXPORT_SYMBOL(ext3_mb_new_blocks); @@ -592,7 +595,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c if (!test_opt(inode->i_sb, MBALLOC)) { ret = ext3_new_block_old(handle, inode, goal, errp); -@@ -4228,8 +4228,8 @@ int ext3_mb_free_metadata(handle_t *hand +@@ -4254,8 +4254,8 @@ int ext3_mb_free_metadata(handle_t *hand * Main entry point into mballoc to free blocks */ void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc3-2.6.22.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc3-2.6.22.patch index ddc64d876bd304ddee6bc08a271a9b9a81e74419..68ca8ae13b50f345f00c37a9e3fc2e52784289e5 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc3-2.6.22.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc3-2.6.22.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.18.8/include/linux/ext3_fs_i.h +Index: linux-2.6.22.19/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.6.18.8.orig/include/linux/ext3_fs_i.h 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/include/linux/ext3_fs_i.h 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.22.19.orig/include/linux/ext3_fs_i.h ++++ linux-2.6.22.19/include/linux/ext3_fs_i.h @@ -154,6 +154,10 @@ struct ext3_inode_info { struct inode vfs_inode; @@ -13,10 +13,10 @@ Index: linux-2.6.18.8/include/linux/ext3_fs_i.h }; #endif /* _LINUX_EXT3_FS_I */ -Index: linux-2.6.18.8/include/linux/ext3_fs_sb.h +Index: linux-2.6.22.19/include/linux/ext3_fs_sb.h =================================================================== ---- 
linux-2.6.18.8.orig/include/linux/ext3_fs_sb.h 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/include/linux/ext3_fs_sb.h 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.22.19.orig/include/linux/ext3_fs_sb.h ++++ linux-2.6.22.19/include/linux/ext3_fs_sb.h @@ -21,8 +21,15 @@ #include <linux/wait.h> #include <linux/blockgroup_lock.h> @@ -33,10 +33,10 @@ Index: linux-2.6.18.8/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -Index: linux-2.6.18.8/include/linux/ext3_fs.h +Index: linux-2.6.22.19/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.18.8.orig/include/linux/ext3_fs.h 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/include/linux/ext3_fs.h 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.22.19.orig/include/linux/ext3_fs.h ++++ linux-2.6.22.19/include/linux/ext3_fs.h @@ -17,6 +17,7 @@ #define _LINUX_EXT3_FS_H @@ -45,7 +45,7 @@ Index: linux-2.6.18.8/include/linux/ext3_fs.h #include <linux/magic.h> /* -@@ -67,12 +68,12 @@ +@@ -68,12 +69,12 @@ struct ext3_allocation_request { struct inode *inode; /* target inode for block we're allocating */ @@ -64,7 +64,7 @@ Index: linux-2.6.18.8/include/linux/ext3_fs.h unsigned long len; /* how many blocks we want to allocate */ unsigned long flags; /* flags. 
see above EXT3_MB_HINT_* */ }; -@@ -400,6 +401,7 @@ struct ext3_inode { +@@ -414,6 +415,7 @@ struct ext3_inode { #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ #define EXT3_MOUNT_EXTENTS 0x2000000/* Extents support */ #define EXT3_MOUNT_EXTDEBUG 0x4000000/* Extents debug */ @@ -72,7 +72,7 @@ Index: linux-2.6.18.8/include/linux/ext3_fs.h /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -787,12 +789,12 @@ ext3_group_first_block_no(struct super_b +@@ -801,12 +803,12 @@ ext3_group_first_block_no(struct super_b /* balloc.c */ extern int ext3_bg_has_super(struct super_block *sb, int group); extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); @@ -88,7 +88,7 @@ Index: linux-2.6.18.8/include/linux/ext3_fs.h extern void ext3_free_blocks_sb (handle_t *handle, struct super_block *sb, ext3_fsblk_t block, unsigned long count, unsigned long *pdquot_freed_blocks); -@@ -836,15 +838,45 @@ extern long ext3_mb_stats; +@@ -850,15 +852,45 @@ extern long ext3_mb_stats; extern long ext3_mb_max_to_scan; extern int ext3_mb_init(struct super_block *, int); extern int ext3_mb_release(struct super_block *); @@ -101,8 +101,8 @@ Index: linux-2.6.18.8/include/linux/ext3_fs.h extern void ext3_mb_release_blocks(struct super_block *, int); extern void ext3_mb_release_blocks(struct super_block *, int); extern void ext3_mb_discard_inode_preallocations(struct inode *); - extern int __init init_ext3_proc(void); - extern void exit_ext3_proc(void); + extern int __init init_ext3_mb_proc(void); + extern void exit_ext3_mb_proc(void); -extern void ext3_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, int *); +extern void ext3_mb_free_blocks(handle_t *, struct inode *, unsigned long, + unsigned long, int, unsigned long *); @@ -136,11 +136,11 @@ Index: linux-2.6.18.8/include/linux/ext3_fs.h /* inode.c */ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, -Index: 
linux-2.6.18.8/fs/ext3/super.c +Index: linux-2.6.22.19/fs/ext3/super.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/super.c 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/super.c 2007-07-17 09:18:53.000000000 +0200 -@@ -688,6 +688,7 @@ enum { +--- linux-2.6.22.19.orig/fs/ext3/super.c ++++ linux-2.6.22.19/fs/ext3/super.c +@@ -685,6 +685,7 @@ enum { Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_grpquota, Opt_extents, Opt_noextents, Opt_extdebug, @@ -148,7 +148,7 @@ Index: linux-2.6.18.8/fs/ext3/super.c }; static match_table_t tokens = { -@@ -743,6 +744,9 @@ static match_table_t tokens = { +@@ -740,6 +741,9 @@ static match_table_t tokens = { {Opt_extents, "extents"}, {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, @@ -158,7 +158,7 @@ Index: linux-2.6.18.8/fs/ext3/super.c {Opt_err, NULL}, {Opt_resize, "resize"}, }; -@@ -1096,6 +1100,19 @@ clear_qf_name: +@@ -1093,6 +1097,19 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; @@ -178,7 +178,7 @@ Index: linux-2.6.18.8/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1826,6 +1843,7 @@ static int ext3_fill_super (struct super +@@ -1832,6 +1849,7 @@ static int ext3_fill_super (struct super "writeback"); ext3_ext_init(sb); @@ -186,11 +186,11 @@ Index: linux-2.6.18.8/fs/ext3/super.c lock_kernel(); return 0; -Index: linux-2.6.18.8/fs/ext3/extents.c +Index: linux-2.6.22.19/fs/ext3/extents.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/extents.c 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/extents.c 2007-07-17 09:18:53.000000000 +0200 -@@ -795,7 +795,7 @@ cleanup: +--- linux-2.6.22.19.orig/fs/ext3/extents.c ++++ linux-2.6.22.19/fs/ext3/extents.c +@@ -801,7 +801,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -199,7 +199,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c } } 
kfree(ablocks); -@@ -1613,7 +1613,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1619,7 +1619,7 @@ int ext3_ext_rm_idx(handle_t *handle, st ext_debug(inode, "index is empty, remove it, free block %lu\n", leaf); bh = sb_find_get_block(inode->i_sb, leaf); ext3_forget(handle, 1, inode, bh, leaf); @@ -208,7 +208,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c return err; } -@@ -1672,7 +1672,7 @@ static int ext3_remove_blocks(handle_t * +@@ -1678,7 +1678,7 @@ static int ext3_remove_blocks(handle_t * unsigned long from, unsigned long to) { struct buffer_head *bh; @@ -217,7 +217,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c #ifdef EXTENTS_STATS { -@@ -1690,6 +1690,8 @@ static int ext3_remove_blocks(handle_t * +@@ -1696,6 +1696,8 @@ static int ext3_remove_blocks(handle_t * spin_unlock(&sbi->s_ext_stats_lock); } #endif @@ -226,7 +226,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c if (from >= le32_to_cpu(ex->ee_block) && to == le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { /* tail removal */ -@@ -1701,7 +1703,7 @@ static int ext3_remove_blocks(handle_t * +@@ -1707,7 +1709,7 @@ static int ext3_remove_blocks(handle_t * bh = sb_find_get_block(inode->i_sb, start + i); ext3_forget(handle, 0, inode, bh, start + i); } @@ -235,7 +235,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c } else if (from == le32_to_cpu(ex->ee_block) && to <= le32_to_cpu(ex->ee_block) + le16_to_cpu(ex->ee_len) - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", -@@ -2034,7 +2036,7 @@ int ext3_ext_get_blocks(handle_t *handle +@@ -2040,7 +2042,7 @@ int ext3_ext_get_blocks(handle_t *handle struct ext3_extent newex, *ex; int goal, newblock, err = 0, depth; unsigned long allocated = 0; @@ -244,7 +244,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c __clear_bit(BH_New, &bh_result->b_state); ext_debug(inode, "blocks %d/%lu requested for inode %u\n", (int) iblock, -@@ -2116,18 +2118,33 @@ int ext3_ext_get_blocks(handle_t *handle +@@ -2122,18 +2124,36 @@ int ext3_ext_get_blocks(handle_t *handle if 
(S_ISREG(inode->i_mode) && (!EXT3_I(inode)->i_block_alloc_info)) ext3_init_block_alloc_info(inode); @@ -279,12 +279,15 @@ Index: linux-2.6.18.8/fs/ext3/extents.c + ar.goal = ext3_ext_find_goal(inode, path, iblock); + ar.logical = iblock; + ar.len = allocated; -+ ar.flags = EXT3_MB_HINT_DATA; ++ if (S_ISREG(inode->i_mode)) ++ ar.flags = EXT3_MB_HINT_DATA; ++ else ++ ar.flags = 0; + newblock = ext3_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; ext_debug(inode, "allocate new block: goal %d, found %d/%lu\n", -@@ -2137,12 +2154,16 @@ int ext3_ext_get_blocks(handle_t *handle +@@ -2143,12 +2163,16 @@ int ext3_ext_get_blocks(handle_t *handle newex.ee_block = cpu_to_le32(iblock); newex.ee_start = cpu_to_le32(newblock); newex.ee_start_hi = 0; @@ -304,7 +307,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c goto out2; } -@@ -2151,6 +2172,7 @@ int ext3_ext_get_blocks(handle_t *handle +@@ -2157,6 +2181,7 @@ int ext3_ext_get_blocks(handle_t *handle /* previous routine could use block we allocated */ newblock = le32_to_cpu(newex.ee_start); @@ -312,7 +315,7 @@ Index: linux-2.6.18.8/fs/ext3/extents.c __set_bit(BH_New, &bh_result->b_state); ext3_ext_put_in_cache(inode, iblock, allocated, newblock, -@@ -2202,6 +2224,9 @@ void ext3_ext_truncate(struct inode * in +@@ -2208,6 +2233,9 @@ void ext3_ext_truncate(struct inode * in mutex_lock(&EXT3_I(inode)->truncate_mutex); ext3_ext_invalidate_cache(inode); @@ -322,11 +325,11 @@ Index: linux-2.6.18.8/fs/ext3/extents.c /* * TODO: optimization is possible here * probably we need not scaning at all, -Index: linux-2.6.18.8/fs/ext3/Makefile +Index: linux-2.6.22.19/fs/ext3/Makefile =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/Makefile 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/Makefile 2007-07-17 09:18:53.000000000 +0200 -@@ -5,7 +5,7 @@ +--- linux-2.6.22.19.orig/fs/ext3/Makefile ++++ linux-2.6.22.19/fs/ext3/Makefile +@@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += 
ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o ext3_jbd.o \ @@ -335,11 +338,11 @@ Index: linux-2.6.18.8/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.18.8/fs/ext3/xattr.c +Index: linux-2.6.22.19/fs/ext3/xattr.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/xattr.c 2007-02-24 00:52:30.000000000 +0100 -+++ linux-2.6.18.8/fs/ext3/xattr.c 2007-07-17 09:18:53.000000000 +0200 -@@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl +--- linux-2.6.22.19.orig/fs/ext3/xattr.c ++++ linux-2.6.22.19/fs/ext3/xattr.c +@@ -488,7 +488,7 @@ ext3_xattr_release_block(handle_t *handl ea_bdebug(bh, "refcount now=0; freeing"); if (ce) mb_cache_entry_free(ce); @@ -348,7 +351,7 @@ Index: linux-2.6.18.8/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, bh->b_blocknr); } else { -@@ -805,7 +805,7 @@ inserted: +@@ -813,7 +813,7 @@ inserted: new_bh = sb_getblk(sb, block); if (!new_bh) { getblk_failed: @@ -357,11 +360,11 @@ Index: linux-2.6.18.8/fs/ext3/xattr.c error = -EIO; goto cleanup; } -Index: linux-2.6.18.8/fs/ext3/balloc.c +Index: linux-2.6.22.19/fs/ext3/balloc.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/balloc.c 2007-02-24 00:52:30.000000000 +0100 -+++ linux-2.6.18.8/fs/ext3/balloc.c 2007-07-17 09:18:53.000000000 +0200 -@@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ +--- linux-2.6.22.19.orig/fs/ext3/balloc.c ++++ linux-2.6.22.19/fs/ext3/balloc.c +@@ -90,7 +90,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. 
*/ @@ -370,7 +373,7 @@ Index: linux-2.6.18.8/fs/ext3/balloc.c read_block_bitmap(struct super_block *sb, unsigned int block_group) { struct ext3_group_desc * desc; -@@ -294,6 +294,8 @@ void ext3_discard_reservation(struct ino +@@ -391,6 +391,8 @@ void ext3_discard_reservation(struct ino struct ext3_reserve_window_node *rsv; spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock; @@ -379,7 +382,7 @@ Index: linux-2.6.18.8/fs/ext3/balloc.c if (!block_i) return; -@@ -490,19 +492,24 @@ +@@ -602,19 +604,24 @@ error_return: * @count: number of blocks to count */ void ext3_free_blocks(handle_t *handle, struct inode *inode, @@ -414,16 +417,16 @@ Index: linux-2.6.18.8/fs/ext3/balloc.c return; } -@@ -1199,7 +1205,7 @@ int ext3_should_retry_alloc(struct super - * bitmap, and then for any free bit if that fails. - * This function also updates quota and i_blocks field. +@@ -1403,7 +1410,7 @@ int ext3_should_retry_alloc(struct super + * any specific goal block. + * */ -ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, +ext3_fsblk_t ext3_new_blocks_old(handle_t *handle, struct inode *inode, ext3_fsblk_t goal, unsigned long *count, int *errp) { struct buffer_head *bitmap_bh = NULL; -@@ -1463,7 +1469,7 @@ out: +@@ -1666,7 +1673,7 @@ out: return 0; } @@ -432,10 +435,10 @@ Index: linux-2.6.18.8/fs/ext3/balloc.c ext3_fsblk_t goal, int *errp) { unsigned long count = 1; -Index: linux-2.6.18.8/fs/ext3/inode.c +Index: linux-2.6.22.19/fs/ext3/inode.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/inode.c 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/inode.c 2007-07-17 09:18:53.000000000 +0200 +--- linux-2.6.22.19.orig/fs/ext3/inode.c ++++ linux-2.6.22.19/fs/ext3/inode.c @@ -560,7 +560,7 @@ static int ext3_alloc_blocks(handle_t *h return ret; failed_out: @@ -470,7 +473,7 @@ Index: linux-2.6.18.8/fs/ext3/inode.c return err; } -@@ -1996,7 +1997,7 @@ static void ext3_clear_blocks(handle_t * +@@ 
-1988,7 +1989,7 @@ static void ext3_clear_blocks(handle_t * } } @@ -479,7 +482,7 @@ Index: linux-2.6.18.8/fs/ext3/inode.c } /** -@@ -2169,7 +2170,7 @@ static void ext3_free_branches(handle_t +@@ -2161,7 +2162,7 @@ static void ext3_free_branches(handle_t ext3_journal_test_restart(handle, inode); } @@ -488,11 +491,11 @@ Index: linux-2.6.18.8/fs/ext3/inode.c if (parent_bh) { /* -Index: linux-2.6.18.8/fs/ext3/mballoc.c +Index: linux-2.6.22.19/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.18.8.orig/fs/ext3/mballoc.c 2007-07-17 09:18:14.000000000 +0200 -+++ linux-2.6.18.8/fs/ext3/mballoc.c 2007-07-17 09:23:56.000000000 +0200 -@@ -307,7 +307,7 @@ +--- linux-2.6.22.19.orig/fs/ext3/mballoc.c ++++ linux-2.6.22.19/fs/ext3/mballoc.c +@@ -305,7 +305,7 @@ */ #define MB_DEFAULT_STRIPE 256 @@ -548,7 +551,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c + fex->fe_start + le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block); return block; -@@ -3174,7 +3174,7 @@ void ext3_mb_collect_stats(struct ext3_a +@@ -3202,7 +3202,7 @@ void ext3_mb_collect_stats(struct ext3_a void ext3_mb_use_inode_pa(struct ext3_allocation_context *ac, struct ext3_prealloc_space *pa) { @@ -557,7 +560,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); -@@ -4027,13 +4027,13 @@ int ext3_mb_discard_preallocations(struc +@@ -4053,13 +4053,13 @@ int ext3_mb_discard_preallocations(struc * it tries to use preallocation first, then falls back * to usual allocation */ @@ -573,7 +576,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c int freed, inquota; sb = ar->inode->i_sb; -@@ -4044,8 +4044,8 @@ unsigned long ext3_mb_new_blocks(handle_ +@@ -4070,8 +4070,8 @@ unsigned long ext3_mb_new_blocks(handle_ if (ext3_mballoc_warning++ == 0) printk(KERN_ERR "EXT3-fs: multiblock request with " "mballoc disabled!\n"); @@ -584,7 +587,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c return 
block; } -@@ -4109,11 +4109,11 @@ out: +@@ -4135,11 +4135,11 @@ out: } EXPORT_SYMBOL(ext3_mb_new_blocks); @@ -599,7 +602,7 @@ Index: linux-2.6.18.8/fs/ext3/mballoc.c if (!test_opt(inode->i_sb, MBALLOC)) { ret = ext3_new_block_old(handle, inode, goal, errp); -@@ -4228,8 +4228,8 @@ int ext3_mb_free_metadata(handle_t *hand +@@ -4254,8 +4254,8 @@ int ext3_mb_free_metadata(handle_t *hand * Main entry point into mballoc to free blocks */ void ext3_mb_free_blocks(handle_t *handle, struct inode *inode, diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch index f1d9bd61be3ce020206f7a99d057f9e6304db087..1fbce882ed66c2b2a0b957eefbc0919a21007d60 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc3-core.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.9-full/include/linux/ext3_fs.h +Index: linux-2.6.5-7.311/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2007-06-08 23:44:08.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2007-10-17 22:25:01.000000000 +0400 +--- linux-2.6.5-7.311.orig/include/linux/ext3_fs.h ++++ linux-2.6.5-7.311/include/linux/ext3_fs.h @@ -57,6 +57,30 @@ struct statfs; #define ext3_debug(f, a...) 
do {} while (0) #endif @@ -33,7 +33,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h /* * Special inodes numbers */ -@@ -387,6 +411,14 @@ struct ext3_inode { +@@ -361,6 +385,14 @@ struct ext3_inode { #define ext3_find_first_zero_bit ext2_find_first_zero_bit #define ext3_find_next_zero_bit ext2_find_next_zero_bit @@ -48,7 +48,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h /* * Maximal mount counts between two filesystem checks */ -@@ -763,6 +795,20 @@ extern unsigned long ext3_count_dirs (st +@@ -735,6 +767,20 @@ extern unsigned long ext3_count_dirs (st extern void ext3_check_inodes_bitmap (struct super_block *); extern unsigned long ext3_count_free (struct buffer_head *, unsigned); @@ -62,20 +62,31 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h +extern void ext3_mb_release_blocks(struct super_block *, int); +extern void ext3_mb_release_blocks(struct super_block *, int); +extern void ext3_mb_discard_inode_preallocations(struct inode *); -+extern int __init init_ext3_proc(void); -+extern void exit_ext3_proc(void); ++extern int __init init_ext3_mb_proc(void); ++extern void exit_ext3_mb_proc(void); +extern void ext3_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, int *); + /* inode.c */ extern int ext3_block_truncate_page(handle_t *, struct page *, -Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h +@@ -769,6 +815,10 @@ extern int ext3_htree_fill_tree(struct f + __u32 start_minor_hash, __u32 *next_hash); + + /* super.c */ ++extern struct proc_dir_entry *proc_root_ext3; ++extern int __init init_ext3_proc(void); ++extern void exit_ext3_proc(void); ++ + extern void ext3_error (struct super_block *, const char *, const char *, ...) 
+ __attribute__ ((format (printf, 3, 4))); + extern void __ext3_std_error (struct super_block *, const char *, int); +Index: linux-2.6.5-7.311/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2007-06-08 23:44:07.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2007-10-17 22:25:01.000000000 +0400 -@@ -81,6 +81,61 @@ struct ext3_sb_info { - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ +--- linux-2.6.5-7.311.orig/include/linux/ext3_fs_sb.h ++++ linux-2.6.5-7.311/include/linux/ext3_fs_sb.h +@@ -78,6 +78,61 @@ struct ext3_sb_info { + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ #endif + + /* for buddy allocator */ @@ -105,7 +116,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h + int s_mb_history_cur; + int s_mb_history_max; + int s_mb_history_num; -+ struct proc_dir_entry *s_mb_proc; ++ struct proc_dir_entry *s_dev_proc; + spinlock_t s_mb_history_lock; + int s_mb_history_filter; + @@ -135,11 +146,11 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h + [(group) & (EXT3_DESC_PER_BLOCK(sb) - 1)] + #endif /* _LINUX_EXT3_FS_SB */ -Index: linux-2.6.9-full/fs/ext3/super.c +Index: linux-2.6.5-7.311/fs/ext3/super.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2007-06-08 23:44:08.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/super.c 2007-10-17 22:26:27.000000000 +0400 -@@ -394,6 +394,7 @@ void ext3_put_super (struct super_block +--- linux-2.6.5-7.311.orig/fs/ext3/super.c ++++ linux-2.6.5-7.311/fs/ext3/super.c +@@ -389,6 +389,7 @@ void ext3_put_super (struct super_block struct ext3_super_block *es = sbi->s_es; int i; @@ -147,17 +158,82 @@ Index: linux-2.6.9-full/fs/ext3/super.c 
ext3_ext_release(sb); ext3_xattr_put_super(sb); journal_destroy(sbi->s_journal); -@@ -463,6 +464,8 @@ static struct inode *ext3_alloc_inode(st +@@ -428,6 +429,8 @@ void ext3_put_super (struct super_block + invalidate_bdev(sbi->journal_bdev, 0); + ext3_blkdev_remove(sbi); + } ++ remove_proc_entry(sb->s_id, proc_root_ext3); ++ sbi->s_dev_proc = NULL; + sb->s_fs_info = NULL; + kfree(sbi); + return; +@@ -453,6 +456,8 @@ static struct inode *ext3_alloc_inode(st ei->vfs_inode.i_version = 1; - + memset(&ei->i_cached_extent, 0, sizeof(ei->i_cached_extent)); + INIT_LIST_HEAD(&ei->i_prealloc_list); + spin_lock_init(&ei->i_prealloc_lock); return &ei->vfs_inode; } -@@ -2576,7 +2579,13 @@ static struct file_system_type ext3_fs_t +@@ -1151,6 +1156,13 @@ static int ext3_fill_super (struct super + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; ++ sbi->s_dev_proc = proc_mkdir(sb->s_id, proc_root_ext3); ++ if (sbi->s_dev_proc == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", sb->s_id); ++ sb->s_fs_info = NULL; ++ kfree(sbi); ++ return -ENOMEM; ++ } + + blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); + if (!blocksize) { +@@ -1526,6 +1538,8 @@ failed_mount: + ext3_blkdev_remove(sbi); + brelse(bh); + out_fail: ++ remove_proc_entry(sb->s_id, proc_root_ext3); ++ sbi->s_dev_proc = NULL; + sb->s_fs_info = NULL; + kfree(sbi); + return -EINVAL; +@@ -2158,9 +2172,46 @@ static struct file_system_type ext3_fs_t + .fs_flags = FS_REQUIRES_DEV, + }; ++#define EXT3_ROOT "ext3" ++struct proc_dir_entry *proc_root_ext3; ++ ++int __init init_ext3_proc(void) ++{ ++ int ret; ++ ++ if ((ret = init_ext3_mb_proc())) ++ goto out; ++ ++ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); ++ if (proc_root_ext3 == NULL) { ++ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); ++ ret = -ENOMEM; ++ goto out_mb_proc; ++ } ++ ++ return 0; ++ ++out_mb_proc: ++ exit_ext3_mb_proc(); ++out: ++ return ret; ++} ++ ++void exit_ext3_proc(void) 
++{ ++ exit_ext3_mb_proc(); ++ remove_proc_entry(EXT3_ROOT, proc_root_fs); ++} ++ static int __init init_ext3_fs(void) { - int err = init_ext3_xattr(); @@ -171,19 +247,19 @@ Index: linux-2.6.9-full/fs/ext3/super.c if (err) return err; err = init_inodecache(); -@@ -2598,6 +2607,7 @@ static void __exit exit_ext3_fs(void) +@@ -2189,6 +2240,7 @@ static void __exit exit_ext3_fs(void) unregister_filesystem(&ext3_fs_type); destroy_inodecache(); exit_ext3_xattr(); + exit_ext3_proc(); } - int ext3_prep_san_write(struct inode *inode, long *blocks, -Index: linux-2.6.9-full/fs/ext3/mballoc.c + int ext3_map_inode_page(struct inode *inode, struct page *page, +Index: linux-2.6.5-7.311/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2007-10-17 21:59:51.072534980 +0400 -+++ linux-2.6.9-full/fs/ext3/mballoc.c 2007-10-17 23:09:22.000000000 +0400 -@@ -0,0 +1,4398 @@ +--- /dev/null ++++ linux-2.6.5-7.311/fs/ext3/mballoc.c +@@ -0,0 +1,4385 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> @@ -499,7 +575,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +#define EXT3_BB_MAX_BLOCKS 30 + +struct ext3_free_metadata { -+ unsigned short group; ++ unsigned group; + unsigned short num; + unsigned short blocks[EXT3_BB_MAX_BLOCKS]; + struct list_head list; @@ -628,8 +704,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + void *bd_bitmap; + struct ext3_group_info *bd_info; + struct super_block *bd_sb; -+ __u16 bd_blkbits; -+ __u16 bd_group; ++ unsigned bd_group; ++ unsigned bd_blkbits; +}; +#define EXT3_MB_BITMAP(e3b) ((e3b)->bd_bitmap) +#define EXT3_MB_BUDDY(e3b) ((e3b)->bd_buddy) @@ -642,8 +718,6 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + -+static struct proc_dir_entry *proc_root_ext3; -+ +int ext3_create (struct inode *, struct dentry *, int, struct 
nameidata *); +struct buffer_head * read_block_bitmap(struct super_block *, unsigned int); +unsigned long ext3_new_blocks_old(handle_t *handle, struct inode *inode, @@ -1062,6 +1136,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + fragments++; + first = i; + i = ext2_find_next_le_bit(bitmap, max, i); ++ if (i > max) ++ i = max; + len = i - first; + free += len; + if (len > 1) @@ -2358,8 +2434,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); + -+ remove_proc_entry("mb_groups", sbi->s_mb_proc); -+ remove_proc_entry("mb_history", sbi->s_mb_proc); ++ remove_proc_entry("mb_groups", sbi->s_dev_proc); ++ remove_proc_entry("mb_history", sbi->s_dev_proc); + + if (sbi->s_mb_history) + kfree(sbi->s_mb_history); @@ -2370,14 +2446,14 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + struct ext3_sb_info *sbi = EXT3_SB(sb); + int i; + -+ if (sbi->s_mb_proc != NULL) { ++ if (sbi->s_dev_proc != NULL) { + struct proc_dir_entry *p; -+ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); ++ p = create_proc_entry("mb_history", S_IRUGO, sbi->s_dev_proc); + if (p) { + p->proc_fops = &ext3_mb_seq_history_fops; + p->data = sb; + } -+ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); ++ p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_dev_proc); + if (p) { + p->proc_fops = &ext3_mb_seq_groups_fops; + p->data = sb; @@ -2764,7 +2840,6 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + mb_debug("freed %u blocks in %u structures\n", count, count2); +} + -+#define EXT3_ROOT "ext3" +#define EXT3_MB_STATS_NAME "stats" +#define EXT3_MB_MAX_TO_SCAN_NAME "max_to_scan" +#define EXT3_MB_MIN_TO_SCAN_NAME "min_to_scan" @@ -2972,17 +3047,13 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + +int ext3_mb_init_per_dev_proc(struct super_block *sb) +{ -+ mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; + struct ext3_sb_info *sbi = EXT3_SB(sb); ++ mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; + struct proc_dir_entry *proc; -+ char devname[64], *name; -+ -+ 
snprintf(devname, sizeof(devname) - 1, "%s", -+ bdevname(sb->s_bdev, devname)); -+ sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext3); ++ char *name; + + name = EXT3_MB_STATS_NAME; -+ proc = create_proc_entry(name, mode, sbi->s_mb_proc); ++ proc = create_proc_entry(name, mode, sbi->s_dev_proc); + if (proc == NULL) + goto err_out; + proc->data = sbi; @@ -2990,7 +3061,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + proc->write_proc = ext3_mb_stats_write; + + name = EXT3_MB_MAX_TO_SCAN_NAME; -+ proc = create_proc_entry(name, mode, sbi->s_mb_proc); ++ proc = create_proc_entry(name, mode, sbi->s_dev_proc); + if (proc == NULL) + goto err_out; + proc->data = sbi; @@ -2998,7 +3069,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + proc->write_proc = ext3_mb_max_to_scan_write; + + name = EXT3_MB_MIN_TO_SCAN_NAME; -+ proc = create_proc_entry(name, mode, sbi->s_mb_proc); ++ proc = create_proc_entry(name, mode, sbi->s_dev_proc); + if (proc == NULL) + goto err_out; + proc->data = sbi; @@ -3006,7 +3077,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + proc->write_proc = ext3_mb_min_to_scan_write; + + name = EXT3_MB_ORDER2_REQ; -+ proc = create_proc_entry(name, mode, sbi->s_mb_proc); ++ proc = create_proc_entry(name, mode, sbi->s_dev_proc); + if (proc == NULL) + goto err_out; + proc->data = sbi; @@ -3014,7 +3085,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + proc->write_proc = ext3_mb_order2_req_write; + + name = EXT3_MB_STREAM_REQ; -+ proc = create_proc_entry(name, mode, sbi->s_mb_proc); ++ proc = create_proc_entry(name, mode, sbi->s_dev_proc); + if (proc == NULL) + goto err_out; + proc->data = sbi; @@ -3025,13 +3096,11 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + +err_out: + printk(KERN_ERR "EXT3-fs: Unable to create %s\n", name); -+ remove_proc_entry(EXT3_MB_STREAM_REQ, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_ORDER2_REQ, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); -+ 
remove_proc_entry(EXT3_MB_STATS_NAME, sbi->s_mb_proc); -+ remove_proc_entry(devname, proc_root_ext3); -+ sbi->s_mb_proc = NULL; ++ remove_proc_entry(EXT3_MB_STREAM_REQ, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_STATS_NAME, sbi->s_dev_proc); + + return -ENOMEM; +} @@ -3039,24 +3108,20 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c +int ext3_mb_destroy_per_dev_proc(struct super_block *sb) +{ + struct ext3_sb_info *sbi = EXT3_SB(sb); -+ char devname[64]; + -+ if (sbi->s_mb_proc == NULL) ++ if (sbi->s_dev_proc == NULL) + return -EINVAL; + -+ snprintf(devname, sizeof(devname) - 1, "%s", -+ bdevname(sb->s_bdev, devname)); -+ remove_proc_entry(EXT3_MB_STREAM_REQ, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_ORDER2_REQ, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); -+ remove_proc_entry(EXT3_MB_STATS_NAME, sbi->s_mb_proc); -+ remove_proc_entry(devname, proc_root_ext3); ++ remove_proc_entry(EXT3_MB_STREAM_REQ, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_ORDER2_REQ, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_MIN_TO_SCAN_NAME, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_MAX_TO_SCAN_NAME, sbi->s_dev_proc); ++ remove_proc_entry(EXT3_MB_STATS_NAME, sbi->s_dev_proc); + + return 0; +} + -+int __init init_ext3_proc(void) ++int __init init_ext3_mb_proc(void) +{ + ext3_pspace_cachep = + kmem_cache_create("ext3_prealloc_space", @@ -3065,18 +3130,13 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + if (ext3_pspace_cachep == NULL) + return -ENOMEM; + -+ proc_root_ext3 = proc_mkdir(EXT3_ROOT, proc_root_fs); -+ if (proc_root_ext3 == NULL) -+ printk(KERN_ERR "EXT3-fs: Unable to create %s\n", EXT3_ROOT); -+ + return 0; +} + -+void exit_ext3_proc(void) ++void exit_ext3_mb_proc(void) +{ + /* XXX: 
synchronize_rcu(); */ + kmem_cache_destroy(ext3_pspace_cachep); -+ remove_proc_entry(EXT3_ROOT, proc_root_fs); +} + + @@ -3564,7 +3624,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + + /* in this short window concurrent discard can set pa_deleted */ + spin_lock(&pa->pa_lock); -+ if (pa->pa_deleted == 0) { ++ if (pa->pa_deleted == 1) { + spin_unlock(&pa->pa_lock); + return; + } @@ -4582,3 +4642,6 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c + ext3_std_error(sb, err); + return; +} ++ ++EXPORT_SYMBOL(ext3_free_blocks); ++EXPORT_SYMBOL(ext3_mb_discard_inode_preallocations); diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc3-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc3-rhel4.patch index 910df7c0d0e71789ec9b1e2237a0ed41b018bf5c..a26a534195cd418019d1ea319ce674ef4166c4cd 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc3-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc3-rhel4.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.9-full/include/linux/ext3_fs_i.h +Index: linux-2.6.9/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_i.h 2007-03-28 01:29:38.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs_i.h 2007-03-28 15:45:41.000000000 +0400 +--- linux-2.6.9.orig/include/linux/ext3_fs_i.h ++++ linux-2.6.9/include/linux/ext3_fs_i.h @@ -130,6 +130,10 @@ struct ext3_inode_info { struct inode vfs_inode; @@ -13,10 +13,10 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_i.h }; #endif /* _LINUX_EXT3_FS_I */ -Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h +Index: linux-2.6.9/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs_sb.h 2007-03-28 15:42:16.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs_sb.h 2007-03-28 15:45:41.000000000 +0400 +--- linux-2.6.9.orig/include/linux/ext3_fs_sb.h ++++ linux-2.6.9/include/linux/ext3_fs_sb.h @@ -23,9 +23,16 
@@ #define EXT_INCLUDE #include <linux/blockgroup_lock.h> @@ -34,10 +34,10 @@ Index: linux-2.6.9-full/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -Index: linux-2.6.9-full/include/linux/ext3_fs.h +Index: linux-2.6.9/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2007-03-28 15:45:07.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2007-03-28 15:45:41.000000000 +0400 +--- linux-2.6.9.orig/include/linux/ext3_fs.h ++++ linux-2.6.9/include/linux/ext3_fs.h @@ -389,6 +389,7 @@ struct ext3_inode { #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ @@ -46,7 +46,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -749,8 +750,9 @@ struct dir_private_info { +@@ -757,8 +758,9 @@ struct dir_private_info { extern int ext3_bg_has_super(struct super_block *sb, int group); extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); @@ -57,11 +57,11 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h extern void ext3_free_blocks_sb (handle_t *, struct super_block *, unsigned long, unsigned long, int *); extern unsigned long ext3_count_free_blocks (struct super_block *); -Index: linux-2.6.9-full/fs/ext3/super.c +Index: linux-2.6.9/fs/ext3/super.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2007-03-28 15:42:16.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/super.c 2007-03-28 15:45:41.000000000 +0400 -@@ -600,6 +600,7 @@ enum { +--- linux-2.6.9.orig/fs/ext3/super.c ++++ linux-2.6.9/fs/ext3/super.c +@@ -642,6 +642,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, 
Opt_extents, Opt_noextents, Opt_extdebug, @@ -69,7 +69,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c }; static match_table_t tokens = { -@@ -653,6 +654,9 @@ static match_table_t tokens = { +@@ -695,6 +696,9 @@ static match_table_t tokens = { {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, @@ -79,7 +79,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c {Opt_err, NULL}, {Opt_resize, "resize"}, }; -@@ -965,6 +969,19 @@ clear_qf_name: +@@ -1007,6 +1011,19 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; @@ -99,7 +99,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1654,6 +1671,7 @@ static int ext3_fill_super (struct super +@@ -1696,6 +1713,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -107,11 +107,11 @@ Index: linux-2.6.9-full/fs/ext3/super.c return 0; -Index: linux-2.6.9-full/fs/ext3/extents.c +Index: linux-2.6.9/fs/ext3/extents.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/extents.c 2007-03-28 01:29:41.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/extents.c 2007-03-28 15:45:41.000000000 +0400 -@@ -779,7 +779,7 @@ cleanup: +--- linux-2.6.9.orig/fs/ext3/extents.c ++++ linux-2.6.9/fs/ext3/extents.c +@@ -820,7 +820,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -120,7 +120,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c } } kfree(ablocks); -@@ -1586,7 +1586,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1630,7 +1630,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -129,7 +129,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c return err; } -@@ -2071,10 +2071,12 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -2129,10 +2129,12 @@ ext3_remove_blocks(struct 
ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -143,7 +143,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -2086,7 +2088,7 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -2144,7 +2146,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -152,7 +152,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -@@ -2177,11 +2179,8 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2235,11 +2237,8 @@ int ext3_ext_get_block(handle_t *handle, struct ext3_extent *ex; int goal, newblock, err = 0, depth; struct ext3_extents_tree tree; @@ -166,7 +166,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c clear_buffer_new(bh_result); ext3_init_tree_desc(&tree, inode); -@@ -2253,18 +2252,33 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2311,18 +2310,36 @@ int ext3_ext_get_block(handle_t *handle, goto out2; } @@ -201,12 +201,15 @@ Index: linux-2.6.9-full/fs/ext3/extents.c + ar.goal = ext3_ext_find_goal(inode, path, iblock); + ar.logical = iblock; + ar.len = allocated; -+ ar.flags = EXT3_MB_HINT_DATA; ++ if (S_ISREG(inode->i_mode)) ++ ar.flags = EXT3_MB_HINT_DATA; ++ else ++ ar.flags = 0; + newblock = ext3_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; ext_debug(&tree, "allocate new block: goal %d, found %d\n", -@@ -2274,11 +2288,14 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2332,11 +2349,14 @@ int ext3_ext_get_block(handle_t *handle, newex.ee_block = iblock; newex.ee_start = newblock; newex.ee_start_hi = 0; @@ -223,7 +226,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c goto out2; } -@@ 
-2287,6 +2304,7 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2345,6 +2365,7 @@ int ext3_ext_get_block(handle_t *handle, /* previous routine could use block we allocated */ newblock = newex.ee_start; @@ -231,7 +234,7 @@ Index: linux-2.6.9-full/fs/ext3/extents.c set_buffer_new(bh_result); ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -@@ -2339,6 +2357,9 @@ void ext3_ext_truncate(struct inode * in +@@ -2397,6 +2418,9 @@ void ext3_ext_truncate(struct inode * in down(&EXT3_I(inode)->truncate_sem); ext3_ext_invalidate_cache(&tree); @@ -241,10 +244,10 @@ Index: linux-2.6.9-full/fs/ext3/extents.c /* * TODO: optimization is possible here * probably we need not scaning at all, -Index: linux-2.6.9-full/fs/ext3/Makefile +Index: linux-2.6.9/fs/ext3/Makefile =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/Makefile 2007-03-28 01:29:38.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/Makefile 2007-03-28 15:45:41.000000000 +0400 +--- linux-2.6.9.orig/fs/ext3/Makefile ++++ linux-2.6.9/fs/ext3/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ @@ -254,10 +257,10 @@ Index: linux-2.6.9-full/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.9-full/fs/ext3/xattr.c +Index: linux-2.6.9/fs/ext3/xattr.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/xattr.c 2006-05-18 23:57:04.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/xattr.c 2007-03-28 15:45:41.000000000 +0400 +--- linux-2.6.9.orig/fs/ext3/xattr.c ++++ linux-2.6.9/fs/ext3/xattr.c @@ -1281,7 +1281,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -285,10 +288,10 @@ Index: linux-2.6.9-full/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: 
linux-2.6.9-full/fs/ext3/balloc.c +Index: linux-2.6.9/fs/ext3/balloc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/balloc.c 2006-03-10 18:20:03.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/balloc.c 2007-03-28 15:45:41.000000000 +0400 +--- linux-2.6.9.orig/fs/ext3/balloc.c ++++ linux-2.6.9/fs/ext3/balloc.c @@ -79,7 +79,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. @@ -354,10 +357,10 @@ Index: linux-2.6.9-full/fs/ext3/balloc.c unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.9-full/fs/ext3/inode.c +Index: linux-2.6.9/fs/ext3/inode.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/inode.c 2007-03-28 01:29:39.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/inode.c 2007-03-28 15:45:41.000000000 +0400 +--- linux-2.6.9.orig/fs/ext3/inode.c ++++ linux-2.6.9/fs/ext3/inode.c @@ -572,7 +572,7 @@ static int ext3_alloc_branch(handle_t *h ext3_journal_forget(handle, branch[i].bh); } diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc3-sles10.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc3-sles10.patch index 373f0c6680ec510a3f3035549707549d1e640140..88be686148a9f1990e5994185fb57db16f54fc34 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc3-sles10.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc3-sles10.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs_i.h +Index: linux-2.6.16.46-0.14/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.6.16.27-0.9-full.orig/include/linux/ext3_fs_i.h 2007-03-28 05:12:50.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/include/linux/ext3_fs_i.h 2007-03-28 16:03:20.000000000 +0400 +--- linux-2.6.16.46-0.14.orig/include/linux/ext3_fs_i.h ++++ linux-2.6.16.46-0.14/include/linux/ext3_fs_i.h @@ -135,6 +135,10 @@ struct ext3_inode_info { 
struct inode vfs_inode; @@ -13,10 +13,10 @@ Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs_i.h }; #endif /* _LINUX_EXT3_FS_I */ -Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs_sb.h +Index: linux-2.6.16.46-0.14/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.16.27-0.9-full.orig/include/linux/ext3_fs_sb.h 2007-03-28 16:03:19.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/include/linux/ext3_fs_sb.h 2007-03-28 16:03:20.000000000 +0400 +--- linux-2.6.16.46-0.14.orig/include/linux/ext3_fs_sb.h ++++ linux-2.6.16.46-0.14/include/linux/ext3_fs_sb.h @@ -21,8 +21,15 @@ #include <linux/wait.h> #include <linux/blockgroup_lock.h> @@ -33,10 +33,10 @@ Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h +Index: linux-2.6.16.46-0.14/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.16.27-0.9-full.orig/include/linux/ext3_fs.h 2007-03-28 16:03:19.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h 2007-03-28 16:03:20.000000000 +0400 +--- linux-2.6.16.46-0.14.orig/include/linux/ext3_fs.h ++++ linux-2.6.16.46-0.14/include/linux/ext3_fs.h @@ -407,6 +407,7 @@ struct ext3_inode { #define EXT3_MOUNT_IOPEN_NOPRIV 0x800000/* Make iopen world-readable */ #define EXT3_MOUNT_EXTENTS 0x1000000/* Extents support */ @@ -45,7 +45,7 @@ Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -767,8 +768,9 @@ struct dir_private_info { +@@ -784,8 +785,9 @@ struct dir_private_info { extern int ext3_bg_has_super(struct super_block *sb, int group); extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); @@ -56,11 +56,11 @@ Index: 
linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h extern void ext3_free_blocks_sb (handle_t *, struct super_block *, unsigned long, unsigned long, int *); extern unsigned long ext3_count_free_blocks (struct super_block *); -Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c +Index: linux-2.6.16.46-0.14/fs/ext3/super.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/super.c 2007-03-28 16:03:19.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/fs/ext3/super.c 2007-03-28 16:03:20.000000000 +0400 -@@ -688,6 +688,7 @@ enum { +--- linux-2.6.16.46-0.14.orig/fs/ext3/super.c ++++ linux-2.6.16.46-0.14/fs/ext3/super.c +@@ -685,6 +685,7 @@ enum { Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_extents, Opt_noextents, Opt_extdebug, @@ -68,7 +68,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c Opt_grpquota }; -@@ -743,6 +744,9 @@ static match_table_t tokens = { +@@ -740,6 +741,9 @@ static match_table_t tokens = { {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, @@ -78,7 +78,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c {Opt_err, NULL}, {Opt_resize, "resize"}, }; -@@ -1092,6 +1096,19 @@ clear_qf_name: +@@ -1089,6 +1093,19 @@ clear_qf_name: case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; @@ -98,7 +98,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1819,6 +1836,7 @@ static int ext3_fill_super (struct super +@@ -1820,6 +1837,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -106,11 +106,11 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c lock_kernel(); return 0; -Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c +Index: linux-2.6.16.46-0.14/fs/ext3/extents.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/extents.c 2007-03-28 
05:13:39.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/fs/ext3/extents.c 2007-03-28 16:03:20.000000000 +0400 -@@ -779,7 +779,7 @@ cleanup: +--- linux-2.6.16.46-0.14.orig/fs/ext3/extents.c ++++ linux-2.6.16.46-0.14/fs/ext3/extents.c +@@ -819,7 +819,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -119,7 +119,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c } } kfree(ablocks); -@@ -1586,7 +1586,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1629,7 +1629,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -128,7 +128,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c return err; } -@@ -2071,10 +2071,12 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -2128,10 +2128,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -142,7 +142,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -2086,7 +2088,7 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -2143,7 +2145,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -151,7 +151,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -@@ -2177,11 +2179,8 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2234,11 +2236,8 @@ int ext3_ext_get_block(handle_t *handle, struct ext3_extent *ex; int goal, newblock, err = 0, depth; struct ext3_extents_tree tree; @@ -165,7 +165,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c 
clear_buffer_new(bh_result); ext3_init_tree_desc(&tree, inode); -@@ -2253,18 +2252,33 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2310,18 +2309,36 @@ int ext3_ext_get_block(handle_t *handle, goto out2; } @@ -200,12 +200,15 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c + ar.goal = ext3_ext_find_goal(inode, path, iblock); + ar.logical = iblock; + ar.len = allocated; -+ ar.flags = EXT3_MB_HINT_DATA; ++ if (S_ISREG(inode->i_mode)) ++ ar.flags = EXT3_MB_HINT_DATA; ++ else ++ ar.flags = 0; + newblock = ext3_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; ext_debug(&tree, "allocate new block: goal %d, found %d\n", -@@ -2274,11 +2288,14 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2331,11 +2348,14 @@ int ext3_ext_get_block(handle_t *handle, newex.ee_block = iblock; newex.ee_start = newblock; newex.ee_start_hi = 0; @@ -222,7 +225,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c goto out2; } -@@ -2287,6 +2304,7 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2344,6 +2364,7 @@ int ext3_ext_get_block(handle_t *handle, /* previous routine could use block we allocated */ newblock = newex.ee_start; @@ -230,7 +233,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c set_buffer_new(bh_result); ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -@@ -2339,6 +2357,9 @@ void ext3_ext_truncate(struct inode * in +@@ -2396,6 +2417,9 @@ void ext3_ext_truncate(struct inode * in down(&EXT3_I(inode)->truncate_sem); ext3_ext_invalidate_cache(&tree); @@ -240,10 +243,10 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/extents.c /* * TODO: optimization is possible here * probably we need not scaning at all, -Index: linux-2.6.16.27-0.9-full/fs/ext3/Makefile +Index: linux-2.6.16.46-0.14/fs/ext3/Makefile =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/Makefile 2007-03-28 05:12:50.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/fs/ext3/Makefile 2007-03-28 16:03:20.000000000 +0400 +--- 
linux-2.6.16.46-0.14.orig/fs/ext3/Makefile ++++ linux-2.6.16.46-0.14/fs/ext3/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ @@ -253,10 +256,10 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.16.27-0.9-full/fs/ext3/xattr.c +Index: linux-2.6.16.46-0.14/fs/ext3/xattr.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/xattr.c 2007-03-13 02:56:52.000000000 +0300 -+++ linux-2.6.16.27-0.9-full/fs/ext3/xattr.c 2007-03-28 16:03:20.000000000 +0400 +--- linux-2.6.16.46-0.14.orig/fs/ext3/xattr.c ++++ linux-2.6.16.46-0.14/fs/ext3/xattr.c @@ -484,7 +484,7 @@ ext3_xattr_release_block(handle_t *handl ea_bdebug(bh, "refcount now=0; freeing"); if (ce) @@ -275,10 +278,10 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/xattr.c error = -EIO; goto cleanup; } -Index: linux-2.6.16.27-0.9-full/fs/ext3/balloc.c +Index: linux-2.6.16.46-0.14/fs/ext3/balloc.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/balloc.c 2007-03-13 02:56:52.000000000 +0300 -+++ linux-2.6.16.27-0.9-full/fs/ext3/balloc.c 2007-03-28 16:03:20.000000000 +0400 +--- linux-2.6.16.46-0.14.orig/fs/ext3/balloc.c ++++ linux-2.6.16.46-0.14/fs/ext3/balloc.c @@ -80,7 +80,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. 
@@ -344,11 +347,11 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/balloc.c unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.16.27-0.9-full/fs/ext3/inode.c +Index: linux-2.6.16.46-0.14/fs/ext3/inode.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/inode.c 2007-03-28 05:13:38.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/fs/ext3/inode.c 2007-03-28 16:03:20.000000000 +0400 -@@ -568,7 +568,7 @@ static int ext3_alloc_branch(handle_t *h +--- linux-2.6.16.46-0.14.orig/fs/ext3/inode.c ++++ linux-2.6.16.46-0.14/fs/ext3/inode.c +@@ -569,7 +569,7 @@ static int ext3_alloc_branch(handle_t *h ext3_journal_forget(handle, branch[i].bh); } for (i = 0; i < keys; i++) @@ -357,7 +360,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/inode.c return err; } -@@ -1865,7 +1865,7 @@ ext3_clear_blocks(handle_t *handle, stru +@@ -1866,7 +1866,7 @@ ext3_clear_blocks(handle_t *handle, stru } } @@ -366,7 +369,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/inode.c } /** -@@ -2038,7 +2038,7 @@ static void ext3_free_branches(handle_t +@@ -2039,7 +2039,7 @@ static void ext3_free_branches(handle_t ext3_journal_test_restart(handle, inode); } diff --git a/ldiskfs/kernel_patches/patches/ext3-mballoc3-suse.patch b/ldiskfs/kernel_patches/patches/ext3-mballoc3-suse.patch index dd071486882804bdb6dcace69eb7cadba5dcc164..6968dfda555941d2837266c08c6fd90cdd8444fb 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mballoc3-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mballoc3-suse.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.5-7.283-full/include/linux/ext3_fs_i.h +Index: linux-2.6.5-7.286/include/linux/ext3_fs_i.h =================================================================== ---- linux-2.6.5-7.283-full.orig/include/linux/ext3_fs_i.h 2007-03-28 02:13:37.000000000 +0400 -+++ linux-2.6.5-7.283-full/include/linux/ext3_fs_i.h 2007-03-28 15:46:02.000000000 +0400 +--- linux-2.6.5-7.286.orig/include/linux/ext3_fs_i.h ++++ 
linux-2.6.5-7.286/include/linux/ext3_fs_i.h @@ -131,6 +131,10 @@ struct ext3_inode_info { struct inode vfs_inode; @@ -13,10 +13,10 @@ Index: linux-2.6.5-7.283-full/include/linux/ext3_fs_i.h }; #endif /* _LINUX_EXT3_FS_I */ -Index: linux-2.6.5-7.283-full/include/linux/ext3_fs_sb.h +Index: linux-2.6.5-7.286/include/linux/ext3_fs_sb.h =================================================================== ---- linux-2.6.5-7.283-full.orig/include/linux/ext3_fs_sb.h 2007-03-28 15:46:00.000000000 +0400 -+++ linux-2.6.5-7.283-full/include/linux/ext3_fs_sb.h 2007-03-28 15:46:02.000000000 +0400 +--- linux-2.6.5-7.286.orig/include/linux/ext3_fs_sb.h ++++ linux-2.6.5-7.286/include/linux/ext3_fs_sb.h @@ -23,9 +23,16 @@ #define EXT_INCLUDE #include <linux/blockgroup_lock.h> @@ -34,10 +34,10 @@ Index: linux-2.6.5-7.283-full/include/linux/ext3_fs_sb.h /* * third extended-fs super-block data in memory -Index: linux-2.6.5-7.283-full/include/linux/ext3_fs.h +Index: linux-2.6.5-7.286/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-7.283-full.orig/include/linux/ext3_fs.h 2007-03-28 15:46:00.000000000 +0400 -+++ linux-2.6.5-7.283-full/include/linux/ext3_fs.h 2007-03-28 15:46:02.000000000 +0400 +--- linux-2.6.5-7.286.orig/include/linux/ext3_fs.h ++++ linux-2.6.5-7.286/include/linux/ext3_fs.h @@ -363,6 +363,7 @@ struct ext3_inode { #define EXT3_MOUNT_IOPEN_NOPRIV 0x100000/* Make iopen world-readable */ #define EXT3_MOUNT_EXTENTS 0x200000/* Extents support */ @@ -46,7 +46,7 @@ Index: linux-2.6.5-7.283-full/include/linux/ext3_fs.h /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef clear_opt -@@ -723,8 +724,9 @@ struct dir_private_info { +@@ -731,8 +732,9 @@ struct dir_private_info { extern int ext3_bg_has_super(struct super_block *sb, int group); extern unsigned long ext3_bg_num_gdb(struct super_block *sb, int group); extern int ext3_new_block (handle_t *, struct inode *, unsigned long, int *); @@ 
-57,11 +57,11 @@ Index: linux-2.6.5-7.283-full/include/linux/ext3_fs.h extern unsigned long ext3_count_free_blocks (struct super_block *); extern void ext3_check_blocks_bitmap (struct super_block *); extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb, -Index: linux-2.6.5-7.283-full/fs/ext3/super.c +Index: linux-2.6.5-7.286/fs/ext3/super.c =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/super.c 2007-03-28 15:46:00.000000000 +0400 -+++ linux-2.6.5-7.283-full/fs/ext3/super.c 2007-03-28 15:46:02.000000000 +0400 -@@ -622,6 +622,7 @@ enum { +--- linux-2.6.5-7.286.orig/fs/ext3/super.c ++++ linux-2.6.5-7.286/fs/ext3/super.c +@@ -591,6 +591,7 @@ enum { Opt_err, Opt_iopen, Opt_noiopen, Opt_iopen_nopriv, Opt_extents, Opt_noextents, Opt_extdebug, @@ -69,7 +69,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/super.c }; static match_table_t tokens = { -@@ -669,6 +670,9 @@ static match_table_t tokens = { +@@ -638,6 +639,9 @@ static match_table_t tokens = { {Opt_noextents, "noextents"}, {Opt_extdebug, "extdebug"}, {Opt_barrier, "barrier=%u"}, @@ -79,7 +79,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/super.c {Opt_err, NULL} }; -@@ -893,6 +897,19 @@ static int parse_options (char * options +@@ -862,6 +866,19 @@ static int parse_options (char * options case Opt_extdebug: set_opt (sbi->s_mount_opt, EXTDEBUG); break; @@ -99,7 +99,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/super.c default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " -@@ -1548,6 +1565,7 @@ static int ext3_fill_super (struct super +@@ -1515,6 +1532,7 @@ static int ext3_fill_super (struct super ext3_count_dirs(sb)); ext3_ext_init(sb); @@ -107,11 +107,11 @@ Index: linux-2.6.5-7.283-full/fs/ext3/super.c return 0; -Index: linux-2.6.5-7.283-full/fs/ext3/extents.c +Index: linux-2.6.5-7.286/fs/ext3/extents.c =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/extents.c 
2007-03-28 03:18:19.000000000 +0400 -+++ linux-2.6.5-7.283-full/fs/ext3/extents.c 2007-03-28 15:46:02.000000000 +0400 -@@ -779,7 +779,7 @@ cleanup: +--- linux-2.6.5-7.286.orig/fs/ext3/extents.c ++++ linux-2.6.5-7.286/fs/ext3/extents.c +@@ -819,7 +819,7 @@ cleanup: for (i = 0; i < depth; i++) { if (!ablocks[i]) continue; @@ -120,7 +120,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/extents.c } } kfree(ablocks); -@@ -1586,7 +1586,7 @@ int ext3_ext_rm_idx(handle_t *handle, st +@@ -1629,7 +1629,7 @@ int ext3_ext_rm_idx(handle_t *handle, st path->p_idx->ei_leaf); bh = sb_find_get_block(tree->inode->i_sb, path->p_idx->ei_leaf); ext3_forget(handle, 1, tree->inode, bh, path->p_idx->ei_leaf); @@ -129,7 +129,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/extents.c return err; } -@@ -2071,10 +2071,12 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -2128,10 +2128,12 @@ ext3_remove_blocks(struct ext3_extents_t int needed = ext3_remove_blocks_credits(tree, ex, from, to); handle_t *handle = ext3_journal_start(tree->inode, needed); struct buffer_head *bh; @@ -143,7 +143,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/extents.c if (from >= ex->ee_block && to == ex->ee_block + ex->ee_len - 1) { /* tail removal */ unsigned long num, start; -@@ -2086,7 +2088,7 @@ ext3_remove_blocks(struct ext3_extents_t +@@ -2143,7 +2145,7 @@ ext3_remove_blocks(struct ext3_extents_t bh = sb_find_get_block(tree->inode->i_sb, start + i); ext3_forget(handle, 0, tree->inode, bh, start + i); } @@ -152,7 +152,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/extents.c } else if (from == ex->ee_block && to <= ex->ee_block + ex->ee_len - 1) { printk("strange request: removal %lu-%lu from %u:%u\n", from, to, ex->ee_block, ex->ee_len); -@@ -2177,11 +2179,8 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2234,11 +2236,8 @@ int ext3_ext_get_block(handle_t *handle, struct ext3_extent *ex; int goal, newblock, err = 0, depth; struct ext3_extents_tree tree; @@ -166,7 +166,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/extents.c 
__clear_bit(BH_New, &bh_result->b_state); ext3_init_tree_desc(&tree, inode); -@@ -2253,18 +2252,33 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2310,18 +2309,36 @@ int ext3_ext_get_block(handle_t *handle, goto out2; } @@ -201,12 +201,15 @@ Index: linux-2.6.5-7.283-full/fs/ext3/extents.c + ar.goal = ext3_ext_find_goal(inode, path, iblock); + ar.logical = iblock; + ar.len = allocated; -+ ar.flags = EXT3_MB_HINT_DATA; ++ if (S_ISREG(inode->i_mode)) ++ ar.flags = EXT3_MB_HINT_DATA; ++ else ++ ar.flags = 0; + newblock = ext3_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; ext_debug(&tree, "allocate new block: goal %d, found %d\n", -@@ -2274,11 +2288,14 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2331,11 +2348,14 @@ int ext3_ext_get_block(handle_t *handle, newex.ee_block = iblock; newex.ee_start = newblock; newex.ee_start_hi = 0; @@ -223,7 +226,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/extents.c goto out2; } -@@ -2287,6 +2304,7 @@ int ext3_ext_get_block(handle_t *handle, +@@ -2344,6 +2364,7 @@ int ext3_ext_get_block(handle_t *handle, /* previous routine could use block we allocated */ newblock = newex.ee_start; @@ -231,7 +234,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/extents.c __set_bit(BH_New, &bh_result->b_state); ext3_ext_put_in_cache(&tree, newex.ee_block, newex.ee_len, -@@ -2341,6 +2359,9 @@ void ext3_ext_truncate(struct inode * in +@@ -2398,6 +2419,9 @@ void ext3_ext_truncate(struct inode * in down(&EXT3_I(inode)->truncate_sem); ext3_ext_invalidate_cache(&tree); @@ -241,10 +244,10 @@ Index: linux-2.6.5-7.283-full/fs/ext3/extents.c /* * TODO: optimization is possible here * probably we need not scaning at all, -Index: linux-2.6.5-7.283-full/fs/ext3/Makefile +Index: linux-2.6.5-7.286/fs/ext3/Makefile =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/Makefile 2007-03-28 15:27:39.000000000 +0400 -+++ linux-2.6.5-7.283-full/fs/ext3/Makefile 2007-03-28 15:46:02.000000000 +0400 +--- 
linux-2.6.5-7.286.orig/fs/ext3/Makefile ++++ linux-2.6.5-7.286/fs/ext3/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_EXT3_FS) += ext3.o ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ @@ -254,10 +257,10 @@ Index: linux-2.6.5-7.283-full/fs/ext3/Makefile ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o -Index: linux-2.6.5-7.283-full/fs/ext3/xattr.c +Index: linux-2.6.5-7.286/fs/ext3/xattr.c =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/xattr.c 2007-03-28 02:13:37.000000000 +0400 -+++ linux-2.6.5-7.283-full/fs/ext3/xattr.c 2007-03-28 15:46:02.000000000 +0400 +--- linux-2.6.5-7.286.orig/fs/ext3/xattr.c ++++ linux-2.6.5-7.286/fs/ext3/xattr.c @@ -1371,7 +1371,7 @@ ext3_xattr_set_handle2(handle_t *handle, new_bh = sb_getblk(sb, block); if (!new_bh) { @@ -285,10 +288,10 @@ Index: linux-2.6.5-7.283-full/fs/ext3/xattr.c get_bh(bh); ext3_forget(handle, 1, inode, bh, EXT3_I(inode)->i_file_acl); } else { -Index: linux-2.6.5-7.283-full/fs/ext3/balloc.c +Index: linux-2.6.5-7.286/fs/ext3/balloc.c =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/balloc.c 2006-12-01 18:39:48.000000000 +0300 -+++ linux-2.6.5-7.283-full/fs/ext3/balloc.c 2007-03-28 15:46:02.000000000 +0400 +--- linux-2.6.5-7.286.orig/fs/ext3/balloc.c ++++ linux-2.6.5-7.286/fs/ext3/balloc.c @@ -78,7 +78,7 @@ struct ext3_group_desc * ext3_get_group_ * * Return buffer_head on success or NULL in case of failure. 
@@ -355,10 +358,10 @@ Index: linux-2.6.5-7.283-full/fs/ext3/balloc.c unsigned long goal, int *errp) { struct buffer_head *bitmap_bh = NULL; -Index: linux-2.6.5-7.283-full/fs/ext3/inode.c +Index: linux-2.6.5-7.286/fs/ext3/inode.c =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/inode.c 2007-03-28 02:50:19.000000000 +0400 -+++ linux-2.6.5-7.283-full/fs/ext3/inode.c 2007-03-28 15:46:02.000000000 +0400 +--- linux-2.6.5-7.286.orig/fs/ext3/inode.c ++++ linux-2.6.5-7.286/fs/ext3/inode.c @@ -574,7 +574,7 @@ static int ext3_alloc_branch(handle_t *h ext3_journal_forget(handle, branch[i].bh); } @@ -377,7 +380,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/inode.c return err; } -@@ -1839,7 +1839,7 @@ ext3_clear_blocks(handle_t *handle, stru +@@ -1848,7 +1848,7 @@ ext3_clear_blocks(handle_t *handle, stru } } @@ -386,7 +389,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/inode.c } /** -@@ -2010,7 +2010,7 @@ static void ext3_free_branches(handle_t +@@ -2019,7 +2019,7 @@ static void ext3_free_branches(handle_t ext3_journal_test_restart(handle, inode); } diff --git a/ldiskfs/kernel_patches/patches/ext3-mmp-2.6-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-mmp-2.6-rhel4.patch index 3a86d41a42865ef77be0aecc2fdc433fa7c56434..ea3b19ab7d01e2b26420077b82a7b07be121b3bb 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mmp-2.6-rhel4.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mmp-2.6-rhel4.patch @@ -18,10 +18,10 @@ Index: linux-2.6.9/fs/ext3/super.c + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + + remove_proc_entry(sb->s_id, proc_root_ext3); + sbi->s_dev_proc = NULL; sb->s_fs_info = NULL; - kfree(sbi); - return; -@@ -1431,6 +1436,314 @@ static unsigned long descriptor_loc(stru +@@ -1439,6 +1444,314 @@ static unsigned long descriptor_loc(stru return (first_data_block + has_super + (bg * sbi->s_blocks_per_group)); } diff --git a/ldiskfs/kernel_patches/patches/ext3-mmp-2.6-sles10.patch 
b/ldiskfs/kernel_patches/patches/ext3-mmp-2.6-sles10.patch index c9637f6da0739e0b4c96e1da34d6dd9e91e3c651..04635f5de21bb8e88c42c9dbb5a455b76900f10f 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mmp-2.6-sles10.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mmp-2.6-sles10.patch @@ -18,10 +18,10 @@ Index: linux-2.6.16.46-0.14/fs/ext3/super.c + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + + remove_proc_entry(sb->s_id, proc_root_ext3); + sbi->s_dev_proc = NULL; sb->s_fs_info = NULL; - kfree(sbi); - return; -@@ -1521,6 +1526,313 @@ static unsigned long descriptor_loc(stru +@@ -1529,6 +1534,313 @@ static unsigned long descriptor_loc(stru return (first_data_block + has_super + (bg * sbi->s_blocks_per_group)); } diff --git a/ldiskfs/kernel_patches/patches/ext3-mmp-2.6.18-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-mmp-2.6.18-vanilla.patch index b7c4a1c95207daf10c85325878166fecb3b610b0..26f86b8c959f035577c9ff7e591f604c7a5c6a30 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mmp-2.6.18-vanilla.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mmp-2.6.18-vanilla.patch @@ -18,10 +18,10 @@ Index: linux-2.6.18/fs/ext3/super.c + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + + remove_proc_entry(sb->s_id, proc_root_ext3); + sbi->s_dev_proc = NULL; sb->s_fs_info = NULL; - kfree(sbi); - return; -@@ -1528,6 +1533,313 @@ static ext3_fsblk_t descriptor_loc(struc +@@ -1536,6 +1541,313 @@ static ext3_fsblk_t descriptor_loc(struc return (has_super + ext3_group_first_block_no(sb, bg)); } diff --git a/ldiskfs/kernel_patches/patches/ext3-mmp-2.6.22-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-mmp-2.6.22-vanilla.patch index afb34e100d85c1f4f4365253b1b9950889c46955..c7fd5f69c060175e8336d8380db2ff4164f94a12 100644 --- a/ldiskfs/kernel_patches/patches/ext3-mmp-2.6.22-vanilla.patch +++ b/ldiskfs/kernel_patches/patches/ext3-mmp-2.6.22-vanilla.patch @@ -18,10 +18,10 @@ Index: linux-2.6.18/fs/ext3/super.c + if (sbi->s_mmp_tsk) + kthread_stop(sbi->s_mmp_tsk); + + 
remove_proc_entry(sb->s_id, proc_root_ext3); + sbi->s_dev_proc = NULL; sb->s_fs_info = NULL; - kfree(sbi); - return; -@@ -1528,6 +1533,313 @@ static ext3_fsblk_t descriptor_loc(struc +@@ -1536,6 +1541,313 @@ static ext3_fsblk_t descriptor_loc(struc return (has_super + ext3_group_first_block_no(sb, bg)); } diff --git a/ldiskfs/kernel_patches/patches/ext3-print-inum-in-htree-warning.patch b/ldiskfs/kernel_patches/patches/ext3-print-inum-in-htree-warning.patch new file mode 100644 index 0000000000000000000000000000000000000000..7796ec92cfbd65e82f488e1d03e1ef0783a8da1e --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-print-inum-in-htree-warning.patch @@ -0,0 +1,16 @@ +Index: linux-2.6.18.8/fs/ext3/namei.c +=================================================================== +--- linux-2.6.18.8.orig/fs/ext3/namei.c ++++ linux-2.6.18.8/fs/ext3/namei.c +@@ -347,8 +347,8 @@ dx_probe(struct dentry *dentry, struct i + root->info.hash_version != DX_HASH_HALF_MD4 && + root->info.hash_version != DX_HASH_LEGACY) { + ext3_warning(dir->i_sb, __FUNCTION__, +- "Unrecognised inode hash code %d", +- root->info.hash_version); ++ "Unrecognised inode hash code %d for directory " ++ "#%lu", root->info.hash_version, dir->i_ino); + brelse(bh); + *err = ERR_BAD_DX_DIR; + goto fail; + diff --git a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6-sles10.patch b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6-sles10.patch index 32a98105de4e54d1bf781fd3352b2ccba82bc993..709a0fd274b3d5ab7271299cb20e8e25a3bef68f 100644 --- a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6-sles10.patch +++ b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6-sles10.patch @@ -6,10 +6,10 @@ group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. 
-Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h +Index: linux-2.6.16.54-0.2.5/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.16.27-0.9-full.orig/include/linux/ext3_fs.h 2007-03-28 18:20:16.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h 2007-03-28 18:30:06.000000000 +0400 +--- linux-2.6.16.54-0.2.5.orig/include/linux/ext3_fs.h ++++ linux-2.6.16.54-0.2.5/include/linux/ext3_fs.h @@ -153,16 +153,22 @@ struct ext3_allocation_request { */ struct ext3_group_desc @@ -37,7 +37,7 @@ Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h /* * Macro-instructions used to manage group descriptors */ -@@ -590,6 +596,7 @@ static inline struct ext3_inode_info *EX +@@ -607,6 +613,7 @@ static inline int ext3_valid_inum(struct #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 @@ -45,7 +45,7 @@ Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 -@@ -606,6 +613,7 @@ static inline struct ext3_inode_info *EX +@@ -623,6 +630,7 @@ static inline int ext3_valid_inum(struct EXT3_FEATURE_INCOMPAT_EXTENTS) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ @@ -53,10 +53,10 @@ Index: linux-2.6.16.27-0.9-full/include/linux/ext3_fs.h EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -Index: linux-2.6.16.27-0.9-full/fs/ext3/resize.c +Index: linux-2.6.16.54-0.2.5/fs/ext3/resize.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/resize.c 2007-03-13 02:56:52.000000000 +0300 -+++ linux-2.6.16.27-0.9-full/fs/ext3/resize.c 2007-03-28 18:30:06.000000000 +0400 +--- linux-2.6.16.54-0.2.5.orig/fs/ext3/resize.c ++++ linux-2.6.16.54-0.2.5/fs/ext3/resize.c @@ -19,6 +19,7 @@ #include 
<linux/errno.h> #include <linux/slab.h> @@ -65,7 +65,33 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/resize.c #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) -@@ -818,6 +819,7 @@ int ext3_group_add(struct super_block *s +@@ -134,25 +135,6 @@ static struct buffer_head *bclean(handle + } + + /* +- * To avoid calling the atomic setbit hundreds or thousands of times, we only +- * need to use it within a single byte (to ensure we get endianness right). +- * We can use memset for the rest of the bitmap as there are no other users. +- */ +-static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +-{ +- int i; +- +- if (start_bit >= end_bit) +- return; +- +- ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); +- for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) +- ext3_set_bit(i, bitmap); +- if (i < end_bit) +- memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +-} +- +-/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to +@@ -818,6 +800,7 @@ int ext3_group_add(struct super_block *s gdp->bg_inode_table = cpu_to_le32(input->inode_table); gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); @@ -73,10 +99,10 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/resize.c /* * Make the new blocks and inodes valid next. 
We do this before -Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c +Index: linux-2.6.16.54-0.2.5/fs/ext3/super.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/super.c 2007-03-28 18:25:51.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/fs/ext3/super.c 2007-03-28 18:30:06.000000000 +0400 +--- linux-2.6.16.54-0.2.5.orig/fs/ext3/super.c ++++ linux-2.6.16.54-0.2.5/fs/ext3/super.c @@ -42,6 +42,7 @@ #include "xattr.h" #include "acl.h" @@ -85,7 +111,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); -@@ -1221,6 +1222,90 @@ static int ext3_setup_super(struct super +@@ -1220,6 +1221,90 @@ static int ext3_setup_super(struct super return res; } @@ -176,7 +202,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c /* Called at mount-time, super-block is locked */ static int ext3_check_descriptors (struct super_block * sb) { -@@ -1270,6 +1355,13 @@ static int ext3_check_descriptors (struc +@@ -1269,6 +1354,13 @@ static int ext3_check_descriptors (struc le32_to_cpu(gdp->bg_inode_table)); return 0; } @@ -190,11 +216,11 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/super.c block += EXT3_BLOCKS_PER_GROUP(sb); gdp++; } -Index: linux-2.6.16.27-0.9-full/fs/ext3/group.h +Index: linux-2.6.16.54-0.2.5/fs/ext3/group.h =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/group.h 2007-02-13 18:39:59.640066087 +0300 -+++ linux-2.6.16.27-0.9-full/fs/ext3/group.h 2007-03-28 18:30:06.000000000 +0400 -@@ -0,0 +1,29 @@ +--- /dev/null ++++ linux-2.6.16.54-0.2.5/fs/ext3/group.h +@@ -0,0 +1,30 @@ +/* + * linux/fs/ext3/group.h + * @@ -223,11 +249,12 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/group.h +extern unsigned ext3_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, int group, + struct ext3_group_desc *desc); ++extern void mark_bitmap_end(int start_bit, int 
end_bit, char *bitmap); +#endif /* _LINUX_EXT3_GROUP_H */ -Index: linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c +Index: linux-2.6.16.54-0.2.5/fs/ext3/ialloc.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/ialloc.c 2007-03-28 18:20:17.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c 2007-03-28 18:30:06.000000000 +0400 +--- linux-2.6.16.54-0.2.5.orig/fs/ext3/ialloc.c ++++ linux-2.6.16.54-0.2.5/fs/ext3/ialloc.c @@ -28,6 +28,7 @@ #include "xattr.h" @@ -245,7 +272,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ -+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) ++void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + @@ -320,7 +347,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_inc(&sbi->s_freeinodes_counter); if (is_directory) -@@ -453,7 +513,7 @@ struct inode *ext3_new_inode(handle_t *h +@@ -452,7 +512,7 @@ struct inode *ext3_new_inode(handle_t *h struct ext3_sb_info *sbi; int err = 0; struct inode *ret; @@ -329,7 +356,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) -@@ -570,11 +630,13 @@ repeat_in_this_group: +@@ -569,11 +629,13 @@ repeat_in_this_group: goto out; got: @@ -348,7 +375,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c err = -EIO; goto fail; } -@@ -582,13 +644,65 @@ got: +@@ -581,13 +643,65 @@ got: BUFFER_TRACE(bh2, "get_write_access"); err = ext3_journal_get_write_access(handle, bh2); if (err) goto fail; @@ -414,7 +441,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c spin_unlock(sb_bgl_lock(sbi, group)); BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, bh2); -@@ -610,7 
+724,7 @@ got: +@@ -609,7 +723,7 @@ got: inode->i_gid = current->fsgid; inode->i_mode = mode; @@ -423,10 +450,10 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/ialloc.c /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blksize = PAGE_SIZE; inode->i_blocks = 0; -Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c +Index: linux-2.6.16.54-0.2.5/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/mballoc.c 2007-03-28 16:03:19.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c 2007-03-28 18:30:36.000000000 +0400 +--- linux-2.6.16.54-0.2.5.orig/fs/ext3/mballoc.c ++++ linux-2.6.16.54-0.2.5/fs/ext3/mballoc.c @@ -36,6 +36,8 @@ #include <linux/seq_file.h> #include <linux/version.h> @@ -444,7 +471,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; -@@ -928,10 +931,7 @@ static int ext3_mb_init_cache(struct pag +@@ -943,10 +946,7 @@ static int ext3_mb_init_cache(struct pag if (first_group + i >= EXT3_SB(sb)->s_groups_count) break; @@ -456,7 +483,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c err = -ENOMEM; bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -@@ -946,7 +946,12 @@ static int ext3_mb_init_cache(struct pag +@@ -961,7 +961,12 @@ static int ext3_mb_init_cache(struct pag unlock_buffer(bh[i]); continue; } @@ -470,7 +497,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c get_bh(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); -@@ -1703,6 +1708,10 @@ static int ext3_mb_good_group(struct ext +@@ -1733,6 +1738,10 @@ static int ext3_mb_good_group(struct ext switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); @@ -481,7 +508,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c bits = ac->ac_sb->s_blocksize_bits + 1; for (i = ac->ac_2order; i <= bits; i++) if (grp->bb_counters[i] > 0) -@@ -1796,7 +1805,9 @@ repeat: +@@ -1826,7 +1835,9 @@ 
repeat: } ac->ac_groups_scanned++; @@ -492,7 +519,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c ext3_mb_simple_scan_group(ac, &e3b); else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe) ext3_mb_scan_aligned(ac, &e3b); -@@ -2267,12 +2278,13 @@ int ext3_mb_init_backend(struct super_bl +@@ -2306,12 +2317,13 @@ int ext3_mb_init_backend(struct super_bl i--; goto err_freebuddy; } @@ -507,8 +534,8 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &meta_group_info[j]->bb_state); -@@ -2936,9 +2948,17 @@ int ext3_mb_mark_diskspace_used(struct e - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); +@@ -2945,9 +2957,17 @@ int ext3_mb_mark_diskspace_used(struct e + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + if (gdp->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) { @@ -525,7 +552,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len); -@@ -4303,6 +4323,7 @@ do_more: +@@ -4357,6 +4377,7 @@ do_more: spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); @@ -533,10 +560,10 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -Index: linux-2.6.16.27-0.9-full/fs/ext3/balloc.c +Index: linux-2.6.16.54-0.2.5/fs/ext3/balloc.c =================================================================== ---- linux-2.6.16.27-0.9-full.orig/fs/ext3/balloc.c 2007-03-28 16:03:20.000000000 +0400 -+++ linux-2.6.16.27-0.9-full/fs/ext3/balloc.c 2007-03-28 18:30:06.000000000 +0400 +--- linux-2.6.16.54-0.2.5.orig/fs/ext3/balloc.c ++++ linux-2.6.16.54-0.2.5/fs/ext3/balloc.c @@ -21,6 +21,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> @@ -545,7 +572,7 @@ Index: 
linux-2.6.16.27-0.9-full/fs/ext3/balloc.c /* * balloc.c contains the blocks allocation and deallocation routines */ -@@ -74,6 +75,75 @@ struct ext3_group_desc * ext3_get_group_ +@@ -74,6 +75,83 @@ struct ext3_group_desc * ext3_get_group_ return desc + offset; } @@ -613,6 +640,14 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/balloc.c + for (bit = le32_to_cpu(gdp->bg_inode_table) - start, + bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++) + ext3_set_bit(bit, bh->b_data); ++ ++ /* ++ * Also if the number of blocks within the group is ++ * less than the blocksize * 8 ( which is the size ++ * of bitmap ), set rest of the block bitmap to 1 ++ */ ++ mark_bitmap_end(EXT3_BLOCKS_PER_GROUP(sb), sb->s_blocksize * 8, ++ bh->b_data); + } + + return free_blocks - sbi->s_itb_per_group - 2; @@ -621,7 +656,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/balloc.c /* * Read the bitmap for a given block_group, reading into the specified * slot in the superblock's bitmap cache. -@@ -89,7 +159,19 @@ read_block_bitmap(struct super_block *sb +@@ -89,7 +167,19 @@ read_block_bitmap(struct super_block *sb desc = ext3_get_group_desc (sb, block_group, NULL); if (!desc) goto error_out; @@ -642,7 +677,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/balloc.c if (!bh) ext3_error (sb, "read_block_bitmap", "Cannot read block bitmap - " -@@ -468,6 +550,7 @@ do_more: +@@ -468,6 +558,7 @@ do_more: desc->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + group_freed); @@ -650,7 +685,7 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/balloc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -@@ -1378,8 +1461,11 @@ allocated: +@@ -1378,8 +1469,11 @@ allocated: ret_block, goal_hits, goal_attempts); spin_lock(sb_bgl_lock(sbi, group_no)); @@ -662,13 +697,3 @@ Index: linux-2.6.16.27-0.9-full/fs/ext3/balloc.c spin_unlock(sb_bgl_lock(sbi, group_no)); percpu_counter_mod(&sbi->s_freeblocks_counter, -1); - -%diffstat - fs/ext3/balloc.c | 88 
+++++++++++++++++++++++++++++ - fs/ext3/group.h | 38 ++++++++++++ - fs/ext3/ialloc.c | 144 +++++++++++++++++++++++++++++++++++++++++++----- - fs/ext3/mballoc.c | 35 +++++++++-- - fs/ext3/resize.c | 2 - fs/ext3/super.c | 92 ++++++++++++++++++++++++++++++ - include/linux/ext3_fs.h | 16 ++++- - 7 files changed, 388 insertions(+), 27 deletions(-) diff --git a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6-suse.patch b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6-suse.patch index 80bf99b88d0c59e527bfb7c68d2d8f5d9b329694..8fccc396a201d9219cbd40d5257440f99bcf9c58 100644 --- a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6-suse.patch @@ -6,10 +6,10 @@ group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. -Index: linux-2.6.5-7.283-full/include/linux/ext3_fs.h +Index: linux-2.6.5-7.311/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.5-7.283-full.orig/include/linux/ext3_fs.h 2007-03-28 17:33:05.000000000 +0400 -+++ linux-2.6.5-7.283-full/include/linux/ext3_fs.h 2007-03-28 18:33:35.000000000 +0400 +--- linux-2.6.5-7.311.orig/include/linux/ext3_fs.h ++++ linux-2.6.5-7.311/include/linux/ext3_fs.h @@ -153,16 +153,22 @@ struct ext3_allocation_request { */ struct ext3_group_desc @@ -37,7 +37,7 @@ Index: linux-2.6.5-7.283-full/include/linux/ext3_fs.h /* * Macro-instructions used to manage group descriptors */ -@@ -458,7 +464,7 @@ struct ext3_super_block { +@@ -466,7 +472,7 @@ struct ext3_super_block { */ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ @@ -46,7 +46,7 @@ Index: linux-2.6.5-7.283-full/include/linux/ext3_fs.h /* * Journaling support valid if EXT3_FEATURE_COMPAT_HAS_JOURNAL set. 
*/ -@@ -546,6 +552,7 @@ static inline struct ext3_inode_info *EX +@@ -554,6 +560,7 @@ static inline struct ext3_inode_info *EX #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 @@ -54,7 +54,7 @@ Index: linux-2.6.5-7.283-full/include/linux/ext3_fs.h #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 -@@ -562,6 +569,7 @@ static inline struct ext3_inode_info *EX +@@ -570,6 +577,7 @@ static inline struct ext3_inode_info *EX EXT3_FEATURE_INCOMPAT_EXTENTS) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ @@ -62,10 +62,10 @@ Index: linux-2.6.5-7.283-full/include/linux/ext3_fs.h EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -Index: linux-2.6.5-7.283-full/fs/ext3/super.c +Index: linux-2.6.5-7.311/fs/ext3/super.c =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/super.c 2007-03-28 17:33:05.000000000 +0400 -+++ linux-2.6.5-7.283-full/fs/ext3/super.c 2007-03-28 18:33:35.000000000 +0400 +--- linux-2.6.5-7.311.orig/fs/ext3/super.c ++++ linux-2.6.5-7.311/fs/ext3/super.c @@ -36,6 +36,7 @@ #include <linux/quotaops.h> #include "xattr.h" @@ -74,7 +74,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/super.c static int ext3_load_journal(struct super_block *, struct ext3_super_block *); static int ext3_create_journal(struct super_block *, struct ext3_super_block *, -@@ -996,6 +997,90 @@ static int ext3_setup_super(struct super +@@ -998,6 +999,90 @@ static int ext3_setup_super(struct super return res; } @@ -165,7 +165,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/super.c static int ext3_check_descriptors (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); -@@ -1044,6 +1129,13 @@ static int ext3_check_descriptors (struc +@@ -1046,6 +1131,13 @@ static int ext3_check_descriptors (struc 
le32_to_cpu(gdp->bg_inode_table)); return 0; } @@ -179,11 +179,11 @@ Index: linux-2.6.5-7.283-full/fs/ext3/super.c block += EXT3_BLOCKS_PER_GROUP(sb); gdp++; } -Index: linux-2.6.5-7.283-full/fs/ext3/group.h +Index: linux-2.6.5-7.311/fs/ext3/group.h =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/group.h 2007-02-13 18:39:59.640066087 +0300 -+++ linux-2.6.5-7.283-full/fs/ext3/group.h 2007-03-28 18:33:35.000000000 +0400 -@@ -0,0 +1,29 @@ +--- /dev/null ++++ linux-2.6.5-7.311/fs/ext3/group.h +@@ -0,0 +1,30 @@ +/* + * linux/fs/ext3/group.h + * @@ -212,11 +212,12 @@ Index: linux-2.6.5-7.283-full/fs/ext3/group.h +extern unsigned ext3_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, int group, + struct ext3_group_desc *desc); ++extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +#endif /* _LINUX_EXT3_GROUP_H */ -Index: linux-2.6.5-7.283-full/fs/ext3/ialloc.c +Index: linux-2.6.5-7.311/fs/ext3/ialloc.c =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/ialloc.c 2007-03-28 17:33:03.000000000 +0400 -+++ linux-2.6.5-7.283-full/fs/ext3/ialloc.c 2007-03-28 18:33:35.000000000 +0400 +--- linux-2.6.5-7.311.orig/fs/ext3/ialloc.c ++++ linux-2.6.5-7.311/fs/ext3/ialloc.c @@ -28,6 +28,7 @@ #include "xattr.h" @@ -234,7 +235,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/ialloc.c + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. 
+ */ -+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) ++void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + @@ -309,7 +310,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/ialloc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_inc(&sbi->s_freeinodes_counter); if (is_directory) -@@ -454,7 +514,7 @@ struct inode *ext3_new_inode(handle_t *h +@@ -453,7 +513,7 @@ struct inode *ext3_new_inode(handle_t *h struct ext3_sb_info *sbi; int err = 0; struct inode *ret; @@ -318,7 +319,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/ialloc.c /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) -@@ -570,11 +630,13 @@ repeat_in_this_group: +@@ -569,11 +629,13 @@ repeat_in_this_group: goto out; got: @@ -337,7 +338,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/ialloc.c err = -EIO; goto fail; } -@@ -582,13 +644,65 @@ got: +@@ -581,13 +643,65 @@ got: BUFFER_TRACE(bh2, "get_write_access"); err = ext3_journal_get_write_access(handle, bh2); if (err) goto fail; @@ -403,7 +404,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/ialloc.c spin_unlock(sb_bgl_lock(sbi, group)); BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, bh2); -@@ -610,7 +724,7 @@ got: +@@ -609,7 +723,7 @@ got: inode->i_gid = current->fsgid; inode->i_mode = mode; @@ -412,10 +413,10 @@ Index: linux-2.6.5-7.283-full/fs/ext3/ialloc.c /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blksize = PAGE_SIZE; inode->i_blocks = 0; -Index: linux-2.6.5-7.283-full/fs/ext3/mballoc.c +Index: linux-2.6.5-7.311/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/mballoc.c 2007-03-28 15:46:00.000000000 +0400 -+++ linux-2.6.5-7.283-full/fs/ext3/mballoc.c 2007-03-28 18:33:35.000000000 +0400 +--- linux-2.6.5-7.311.orig/fs/ext3/mballoc.c ++++ linux-2.6.5-7.311/fs/ext3/mballoc.c @@ -36,6 +36,8 @@ #include <linux/seq_file.h> #include 
<linux/version.h> @@ -433,7 +434,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/mballoc.c unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; -@@ -928,10 +931,7 @@ static int ext3_mb_init_cache(struct pag +@@ -943,10 +946,7 @@ static int ext3_mb_init_cache(struct pag if (first_group + i >= EXT3_SB(sb)->s_groups_count) break; @@ -445,7 +446,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/mballoc.c err = -ENOMEM; bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -@@ -946,7 +946,12 @@ static int ext3_mb_init_cache(struct pag +@@ -961,7 +961,12 @@ static int ext3_mb_init_cache(struct pag unlock_buffer(bh[i]); continue; } @@ -459,7 +460,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/mballoc.c get_bh(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); -@@ -1703,6 +1708,10 @@ static int ext3_mb_good_group(struct ext +@@ -1733,6 +1738,10 @@ static int ext3_mb_good_group(struct ext switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); @@ -470,7 +471,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/mballoc.c bits = ac->ac_sb->s_blocksize_bits + 1; for (i = ac->ac_2order; i <= bits; i++) if (grp->bb_counters[i] > 0) -@@ -1796,7 +1805,9 @@ repeat: +@@ -1826,7 +1835,9 @@ repeat: } ac->ac_groups_scanned++; @@ -481,7 +482,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/mballoc.c ext3_mb_simple_scan_group(ac, &e3b); else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe) ext3_mb_scan_aligned(ac, &e3b); -@@ -2267,12 +2278,13 @@ int ext3_mb_init_backend(struct super_bl +@@ -2306,12 +2317,13 @@ int ext3_mb_init_backend(struct super_bl i--; goto err_freebuddy; } @@ -496,8 +497,8 @@ Index: linux-2.6.5-7.283-full/fs/ext3/mballoc.c set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &meta_group_info[j]->bb_state); -@@ -2936,9 +2948,17 @@ int ext3_mb_mark_diskspace_used(struct e - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); +@@ -2945,9 +2957,17 @@ int ext3_mb_mark_diskspace_used(struct e + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); 
spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + if (gdp->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) { @@ -514,7 +515,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len); -@@ -4303,6 +4323,7 @@ do_more: +@@ -4357,6 +4377,7 @@ do_more: spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); @@ -522,10 +523,10 @@ Index: linux-2.6.5-7.283-full/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -Index: linux-2.6.5-7.283-full/fs/ext3/balloc.c +Index: linux-2.6.5-7.311/fs/ext3/balloc.c =================================================================== ---- linux-2.6.5-7.283-full.orig/fs/ext3/balloc.c 2007-03-28 17:33:02.000000000 +0400 -+++ linux-2.6.5-7.283-full/fs/ext3/balloc.c 2007-03-28 18:33:35.000000000 +0400 +--- linux-2.6.5-7.311.orig/fs/ext3/balloc.c ++++ linux-2.6.5-7.311/fs/ext3/balloc.c @@ -20,6 +20,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> @@ -534,7 +535,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/balloc.c /* * balloc.c contains the blocks allocation and deallocation routines */ -@@ -72,6 +73,75 @@ struct ext3_group_desc * ext3_get_group_ +@@ -72,6 +73,83 @@ struct ext3_group_desc * ext3_get_group_ return gdp + desc; } @@ -602,6 +603,14 @@ Index: linux-2.6.5-7.283-full/fs/ext3/balloc.c + for (bit = le32_to_cpu(gdp->bg_inode_table) - start, + bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++) + ext3_set_bit(bit, bh->b_data); ++ ++ /* ++ * Also if the number of blocks within the group is ++ * less than the blocksize * 8 ( which is the size ++ * of bitmap ), set rest of the block bitmap to 1 ++ */ ++ mark_bitmap_end(EXT3_BLOCKS_PER_GROUP(sb), sb->s_blocksize * 8, ++ bh->b_data); + } + + return free_blocks - sbi->s_itb_per_group - 2; @@ -610,7 +619,7 @@ Index: 
linux-2.6.5-7.283-full/fs/ext3/balloc.c /* * Read the bitmap for a given block_group, reading into the specified * slot in the superblock's bitmap cache. -@@ -87,7 +157,19 @@ read_block_bitmap(struct super_block *sb +@@ -87,7 +165,19 @@ read_block_bitmap(struct super_block *sb desc = ext3_get_group_desc (sb, block_group, NULL); if (!desc) goto error_out; @@ -631,7 +640,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/balloc.c if (!bh) ext3_error (sb, "read_block_bitmap", "Cannot read block bitmap - " -@@ -432,6 +514,7 @@ do_more: +@@ -432,6 +522,7 @@ do_more: gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + dquot_freed_blocks); @@ -639,7 +648,7 @@ Index: linux-2.6.5-7.283-full/fs/ext3/balloc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -@@ -1372,8 +1455,11 @@ allocated: +@@ -1372,8 +1463,11 @@ allocated: ret_block, goal_hits, goal_attempts); spin_lock(sb_bgl_lock(sbi, group_no)); diff --git a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.18.patch b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.18.patch index 8f1c07730d5310a8f29bd0233dd663c4666d863e..70e5d08887e0765335bbbd27185b50d4eb89ec85 100644 --- a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.18.patch +++ b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.18.patch @@ -6,10 +6,10 @@ group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. 
-Index: linux-rhel5/include/linux/ext3_fs.h +Index: linux-2.6.18-53.1.14/include/linux/ext3_fs.h =================================================================== ---- linux-rhel5.orig/include/linux/ext3_fs.h 2007-07-18 17:32:04.000000000 +0200 -+++ linux-rhel5/include/linux/ext3_fs.h 2007-07-18 17:32:15.000000000 +0200 +--- linux-2.6.18-53.1.14.orig/include/linux/ext3_fs.h ++++ linux-2.6.18-53.1.14/include/linux/ext3_fs.h @@ -150,16 +150,22 @@ struct ext3_allocation_request { */ struct ext3_group_desc @@ -53,10 +53,10 @@ Index: linux-rhel5/include/linux/ext3_fs.h EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -Index: linux-rhel5/fs/ext3/resize.c +Index: linux-2.6.18-53.1.14/fs/ext3/resize.c =================================================================== ---- linux-rhel5.orig/fs/ext3/resize.c 2007-07-15 09:36:00.000000000 +0200 -+++ linux-rhel5/fs/ext3/resize.c 2007-07-18 17:32:15.000000000 +0200 +--- linux-2.6.18-53.1.14.orig/fs/ext3/resize.c ++++ linux-2.6.18-53.1.14/fs/ext3/resize.c @@ -18,6 +18,7 @@ #include <linux/errno.h> #include <linux/slab.h> @@ -65,7 +65,33 @@ Index: linux-rhel5/fs/ext3/resize.c #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) -@@ -834,6 +835,7 @@ int ext3_group_add(struct super_block *s +@@ -137,25 +138,6 @@ static struct buffer_head *bclean(handle + } + + /* +- * To avoid calling the atomic setbit hundreds or thousands of times, we only +- * need to use it within a single byte (to ensure we get endianness right). +- * We can use memset for the rest of the bitmap as there are no other users. 
+- */ +-static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +-{ +- int i; +- +- if (start_bit >= end_bit) +- return; +- +- ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); +- for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) +- ext3_set_bit(i, bitmap); +- if (i < end_bit) +- memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +-} +- +-/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to +@@ -834,6 +816,7 @@ int ext3_group_add(struct super_block *s gdp->bg_inode_table = cpu_to_le32(input->inode_table); gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); @@ -73,10 +99,10 @@ Index: linux-rhel5/fs/ext3/resize.c /* * Make the new blocks and inodes valid next. We do this before -Index: linux-rhel5/fs/ext3/super.c +Index: linux-2.6.18-53.1.14/fs/ext3/super.c =================================================================== ---- linux-rhel5.orig/fs/ext3/super.c 2007-07-18 17:32:06.000000000 +0200 -+++ linux-rhel5/fs/ext3/super.c 2007-07-18 17:35:03.000000000 +0200 +--- linux-2.6.18-53.1.14.orig/fs/ext3/super.c ++++ linux-2.6.18-53.1.14/fs/ext3/super.c @@ -41,6 +41,7 @@ #include "xattr.h" #include "acl.h" @@ -85,7 +111,7 @@ Index: linux-rhel5/fs/ext3/super.c static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); -@@ -1225,6 +1226,91 @@ static int ext3_setup_super(struct super +@@ -1227,6 +1228,91 @@ static int ext3_setup_super(struct super return res; } @@ -177,7 +203,7 @@ Index: linux-rhel5/fs/ext3/super.c /* Called at mount-time, super-block is locked */ static int ext3_check_descriptors (struct super_block * sb) { -@@ -1279,6 +1365,13 @@ static int ext3_check_descriptors (struc +@@ -1281,6 +1367,13 @@ static 
int ext3_check_descriptors (struc le32_to_cpu(gdp->bg_inode_table)); return 0; } @@ -191,11 +217,11 @@ Index: linux-rhel5/fs/ext3/super.c first_block += EXT3_BLOCKS_PER_GROUP(sb); gdp++; } -Index: linux-rhel5/fs/ext3/group.h +Index: linux-2.6.18-53.1.14/fs/ext3/group.h =================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ linux-rhel5/fs/ext3/group.h 2007-07-18 17:32:15.000000000 +0200 -@@ -0,0 +1,29 @@ +--- /dev/null ++++ linux-2.6.18-53.1.14/fs/ext3/group.h +@@ -0,0 +1,30 @@ +/* + * linux/fs/ext3/group.h + * @@ -224,11 +250,12 @@ Index: linux-rhel5/fs/ext3/group.h +extern unsigned ext3_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, int group, + struct ext3_group_desc *desc); ++extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +#endif /* _LINUX_EXT3_GROUP_H */ -Index: linux-rhel5/fs/ext3/ialloc.c +Index: linux-2.6.18-53.1.14/fs/ext3/ialloc.c =================================================================== ---- linux-rhel5.orig/fs/ext3/ialloc.c 2007-07-18 17:32:05.000000000 +0200 -+++ linux-rhel5/fs/ext3/ialloc.c 2007-07-18 17:32:15.000000000 +0200 +--- linux-2.6.18-53.1.14.orig/fs/ext3/ialloc.c ++++ linux-2.6.18-53.1.14/fs/ext3/ialloc.c @@ -28,6 +28,7 @@ #include "xattr.h" @@ -246,7 +273,7 @@ Index: linux-rhel5/fs/ext3/ialloc.c + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. 
+ */ -+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) ++void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + @@ -423,10 +450,10 @@ Index: linux-rhel5/fs/ext3/ialloc.c /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; -Index: linux-rhel5/fs/ext3/mballoc.c +Index: linux-2.6.18-53.1.14/fs/ext3/mballoc.c =================================================================== ---- linux-rhel5.orig/fs/ext3/mballoc.c 2007-07-18 17:32:04.000000000 +0200 -+++ linux-rhel5/fs/ext3/mballoc.c 2007-07-18 17:32:15.000000000 +0200 +--- linux-2.6.18-53.1.14.orig/fs/ext3/mballoc.c ++++ linux-2.6.18-53.1.14/fs/ext3/mballoc.c @@ -36,6 +36,8 @@ #include <linux/seq_file.h> #include <linux/version.h> @@ -444,7 +471,7 @@ Index: linux-rhel5/fs/ext3/mballoc.c unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; -@@ -943,10 +946,7 @@ static int ext3_mb_init_cache(struct pag +@@ -941,10 +944,7 @@ static int ext3_mb_init_cache(struct pag if (first_group + i >= EXT3_SB(sb)->s_groups_count) break; @@ -456,7 +483,7 @@ Index: linux-rhel5/fs/ext3/mballoc.c err = -ENOMEM; bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -@@ -961,7 +961,12 @@ static int ext3_mb_init_cache(struct pag +@@ -959,7 +959,12 @@ static int ext3_mb_init_cache(struct pag unlock_buffer(bh[i]); continue; } @@ -470,7 +497,7 @@ Index: linux-rhel5/fs/ext3/mballoc.c get_bh(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); -@@ -1732,6 +1737,10 @@ static int ext3_mb_good_group(struct ext +@@ -1731,6 +1736,10 @@ static int ext3_mb_good_group(struct ext switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); @@ -481,7 +508,7 @@ Index: linux-rhel5/fs/ext3/mballoc.c bits = ac->ac_sb->s_blocksize_bits + 1; for (i = ac->ac_2order; i <= bits; i++) if (grp->bb_counters[i] > 0) -@@ -1825,7 +1834,9 @@ repeat: +@@ -1824,7 +1833,9 @@ repeat: } 
ac->ac_groups_scanned++; @@ -507,8 +534,8 @@ Index: linux-rhel5/fs/ext3/mballoc.c set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &meta_group_info[j]->bb_state); -@@ -2958,9 +2970,17 @@ int ext3_mb_mark_diskspace_used(struct e - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); +@@ -2943,9 +2955,17 @@ int ext3_mb_mark_diskspace_used(struct e + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + if (gdp->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) { @@ -525,7 +552,7 @@ Index: linux-rhel5/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len); -@@ -4346,6 +4366,7 @@ do_more: +@@ -4355,6 +4375,7 @@ do_more: spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); @@ -533,10 +560,10 @@ Index: linux-rhel5/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -Index: linux-rhel5/fs/ext3/balloc.c +Index: linux-2.6.18-53.1.14/fs/ext3/balloc.c =================================================================== ---- linux-rhel5.orig/fs/ext3/balloc.c 2007-07-18 17:32:04.000000000 +0200 -+++ linux-rhel5/fs/ext3/balloc.c 2007-07-18 17:32:15.000000000 +0200 +--- linux-2.6.18-53.1.14.orig/fs/ext3/balloc.c ++++ linux-2.6.18-53.1.14/fs/ext3/balloc.c @@ -20,6 +20,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> @@ -545,7 +572,7 @@ Index: linux-rhel5/fs/ext3/balloc.c /* * balloc.c contains the blocks allocation and deallocation routines */ -@@ -73,6 +74,75 @@ struct ext3_group_desc * ext3_get_group_ +@@ -73,6 +74,83 @@ struct ext3_group_desc * ext3_get_group_ return desc + offset; } @@ -613,6 +640,14 @@ Index: linux-rhel5/fs/ext3/balloc.c + for (bit = le32_to_cpu(gdp->bg_inode_table) - start, + bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++) + ext3_set_bit(bit, bh->b_data); ++ ++ 
/* ++ * Also if the number of blocks within the group is ++ * less than the blocksize * 8 ( which is the size ++ * of bitmap ), set rest of the block bitmap to 1 ++ */ ++ mark_bitmap_end(EXT3_BLOCKS_PER_GROUP(sb), sb->s_blocksize * 8, ++ bh->b_data); + } + + return free_blocks - sbi->s_itb_per_group - 2; @@ -621,7 +656,7 @@ Index: linux-rhel5/fs/ext3/balloc.c /* * Read the bitmap for a given block_group, reading into the specified * slot in the superblock's bitmap cache. -@@ -88,7 +158,19 @@ read_block_bitmap(struct super_block *sb +@@ -88,7 +166,19 @@ read_block_bitmap(struct super_block *sb desc = ext3_get_group_desc (sb, block_group, NULL); if (!desc) goto error_out; @@ -642,7 +677,7 @@ Index: linux-rhel5/fs/ext3/balloc.c if (!bh) ext3_error (sb, "read_block_bitmap", "Cannot read block bitmap - " -@@ -467,6 +549,7 @@ do_more: +@@ -467,6 +557,7 @@ do_more: desc->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + group_freed); @@ -650,7 +685,7 @@ Index: linux-rhel5/fs/ext3/balloc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -@@ -1434,8 +1517,11 @@ allocated: +@@ -1434,8 +1525,11 @@ allocated: ret_block, goal_hits, goal_attempts); spin_lock(sb_bgl_lock(sbi, group_no)); diff --git a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.22-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.22-vanilla.patch index fb635428b43da9226c7d25d42d53c52f0b81cd73..1fb9c0004a6ef4c848ea152b02cd1b634fb07b75 100644 --- a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.22-vanilla.patch +++ b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.22-vanilla.patch @@ -6,11 +6,11 @@ group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. 
-Index: linux-rhel5/include/linux/ext3_fs.h +Index: linux-2.6.22.14/include/linux/ext3_fs.h =================================================================== ---- linux-rhel5.orig/include/linux/ext3_fs.h 2007-07-18 17:32:04.000000000 +0200 -+++ linux-rhel5/include/linux/ext3_fs.h 2007-07-18 17:32:15.000000000 +0200 -@@ -150,16 +150,22 @@ struct ext3_allocation_request { +--- linux-2.6.22.14.orig/include/linux/ext3_fs.h ++++ linux-2.6.22.14/include/linux/ext3_fs.h +@@ -146,16 +146,22 @@ struct ext3_allocation_request { */ struct ext3_group_desc { @@ -37,7 +37,7 @@ Index: linux-rhel5/include/linux/ext3_fs.h /* * Macro-instructions used to manage group descriptors */ -@@ -603,6 +609,7 @@ static inline int ext3_valid_inum(struct +@@ -617,6 +623,7 @@ static inline int ext3_valid_inum(struct #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 @@ -45,7 +45,7 @@ Index: linux-rhel5/include/linux/ext3_fs.h #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 -@@ -619,6 +626,7 @@ static inline int ext3_valid_inum(struct +@@ -633,6 +640,7 @@ static inline int ext3_valid_inum(struct EXT3_FEATURE_INCOMPAT_EXTENTS) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ @@ -53,11 +53,11 @@ Index: linux-rhel5/include/linux/ext3_fs.h EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -Index: linux-rhel5/fs/ext3/resize.c +Index: linux-2.6.22.14/fs/ext3/resize.c =================================================================== ---- linux-rhel5.orig/fs/ext3/resize.c 2007-07-15 09:36:00.000000000 +0200 -+++ linux-rhel5/fs/ext3/resize.c 2007-07-18 17:32:15.000000000 +0200 -@@ -18,6 +18,7 @@ +--- linux-2.6.22.14.orig/fs/ext3/resize.c ++++ linux-2.6.22.14/fs/ext3/resize.c +@@ -16,6 +16,7 @@ #include <linux/errno.h> #include <linux/slab.h> @@ -65,7 +65,33 @@ Index: 
linux-rhel5/fs/ext3/resize.c #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) -@@ -834,6 +835,7 @@ int ext3_group_add(struct super_block *s +@@ -135,25 +136,6 @@ static struct buffer_head *bclean(handle + } + + /* +- * To avoid calling the atomic setbit hundreds or thousands of times, we only +- * need to use it within a single byte (to ensure we get endianness right). +- * We can use memset for the rest of the bitmap as there are no other users. +- */ +-static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +-{ +- int i; +- +- if (start_bit >= end_bit) +- return; +- +- ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); +- for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) +- ext3_set_bit(i, bitmap); +- if (i < end_bit) +- memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +-} +- +-/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to +@@ -833,6 +815,7 @@ int ext3_group_add(struct super_block *s gdp->bg_inode_table = cpu_to_le32(input->inode_table); gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); @@ -73,10 +99,10 @@ Index: linux-rhel5/fs/ext3/resize.c /* * Make the new blocks and inodes valid next. 
We do this before -Index: linux-rhel5/fs/ext3/super.c +Index: linux-2.6.22.14/fs/ext3/super.c =================================================================== ---- linux-rhel5.orig/fs/ext3/super.c 2007-07-18 17:32:06.000000000 +0200 -+++ linux-rhel5/fs/ext3/super.c 2007-07-18 17:35:03.000000000 +0200 +--- linux-2.6.22.14.orig/fs/ext3/super.c ++++ linux-2.6.22.14/fs/ext3/super.c @@ -41,6 +41,7 @@ #include "xattr.h" #include "acl.h" @@ -85,7 +111,7 @@ Index: linux-rhel5/fs/ext3/super.c static int ext3_load_journal(struct super_block *, struct ext3_super_block *, unsigned long journal_devnum); -@@ -1225,6 +1226,91 @@ static int ext3_setup_super(struct super +@@ -1224,6 +1225,91 @@ static int ext3_setup_super(struct super return res; } @@ -177,7 +203,7 @@ Index: linux-rhel5/fs/ext3/super.c /* Called at mount-time, super-block is locked */ static int ext3_check_descriptors (struct super_block * sb) { -@@ -1279,6 +1365,13 @@ static int ext3_check_descriptors (struc +@@ -1278,6 +1364,13 @@ static int ext3_check_descriptors (struc le32_to_cpu(gdp->bg_inode_table)); return 0; } @@ -191,11 +217,11 @@ Index: linux-rhel5/fs/ext3/super.c first_block += EXT3_BLOCKS_PER_GROUP(sb); gdp++; } -Index: linux-rhel5/fs/ext3/group.h +Index: linux-2.6.22.14/fs/ext3/group.h =================================================================== ---- /dev/null 1970-01-01 00:00:00.000000000 +0000 -+++ linux-rhel5/fs/ext3/group.h 2007-07-18 17:32:15.000000000 +0200 -@@ -0,0 +1,29 @@ +--- /dev/null ++++ linux-2.6.22.14/fs/ext3/group.h +@@ -0,0 +1,30 @@ +/* + * linux/fs/ext3/group.h + * @@ -224,11 +250,12 @@ Index: linux-rhel5/fs/ext3/group.h +extern unsigned ext3_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, int group, + struct ext3_group_desc *desc); ++extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +#endif /* _LINUX_EXT3_GROUP_H */ -Index: linux-rhel5/fs/ext3/ialloc.c +Index: linux-2.6.22.14/fs/ext3/ialloc.c 
=================================================================== ---- linux-rhel5.orig/fs/ext3/ialloc.c 2007-07-18 17:32:05.000000000 +0200 -+++ linux-rhel5/fs/ext3/ialloc.c 2007-07-18 17:32:15.000000000 +0200 +--- linux-2.6.22.14.orig/fs/ext3/ialloc.c ++++ linux-2.6.22.14/fs/ext3/ialloc.c @@ -28,6 +28,7 @@ #include "xattr.h" @@ -246,7 +273,7 @@ Index: linux-rhel5/fs/ext3/ialloc.c + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ -+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) ++void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + @@ -423,10 +450,10 @@ Index: linux-rhel5/fs/ext3/ialloc.c /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; -Index: linux-rhel5/fs/ext3/mballoc.c +Index: linux-2.6.22.14/fs/ext3/mballoc.c =================================================================== ---- linux-rhel5.orig/fs/ext3/mballoc.c 2007-07-18 17:32:04.000000000 +0200 -+++ linux-rhel5/fs/ext3/mballoc.c 2007-07-18 17:32:15.000000000 +0200 +--- linux-2.6.22.14.orig/fs/ext3/mballoc.c ++++ linux-2.6.22.14/fs/ext3/mballoc.c @@ -36,6 +36,8 @@ #include <linux/seq_file.h> #include <linux/version.h> @@ -470,7 +497,7 @@ Index: linux-rhel5/fs/ext3/mballoc.c get_bh(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); -@@ -1732,6 +1737,10 @@ static int ext3_mb_good_group(struct ext +@@ -1733,6 +1738,10 @@ static int ext3_mb_good_group(struct ext switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); @@ -481,7 +508,7 @@ Index: linux-rhel5/fs/ext3/mballoc.c bits = ac->ac_sb->s_blocksize_bits + 1; for (i = ac->ac_2order; i <= bits; i++) if (grp->bb_counters[i] > 0) -@@ -1825,7 +1834,9 @@ repeat: +@@ -1826,7 +1835,9 @@ repeat: } ac->ac_groups_scanned++; @@ -492,7 +519,7 @@ Index: linux-rhel5/fs/ext3/mballoc.c 
ext3_mb_simple_scan_group(ac, &e3b); else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe) ext3_mb_scan_aligned(ac, &e3b); -@@ -2304,12 +2315,13 @@ int ext3_mb_init_backend(struct super_bl +@@ -2306,12 +2317,13 @@ int ext3_mb_init_backend(struct super_bl i--; goto err_freebuddy; } @@ -507,8 +534,8 @@ Index: linux-rhel5/fs/ext3/mballoc.c set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &meta_group_info[j]->bb_state); -@@ -2958,9 +2970,17 @@ int ext3_mb_mark_diskspace_used(struct e - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); +@@ -2945,9 +2957,17 @@ int ext3_mb_mark_diskspace_used(struct e + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + if (gdp->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) { @@ -525,7 +552,7 @@ Index: linux-rhel5/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len); -@@ -4346,6 +4366,7 @@ do_more: +@@ -4357,6 +4377,7 @@ do_more: spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); @@ -533,11 +560,11 @@ Index: linux-rhel5/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -Index: linux-rhel5/fs/ext3/balloc.c +Index: linux-2.6.22.14/fs/ext3/balloc.c =================================================================== ---- linux-rhel5.orig/fs/ext3/balloc.c 2007-07-18 17:32:04.000000000 +0200 -+++ linux-rhel5/fs/ext3/balloc.c 2007-07-18 17:32:15.000000000 +0200 -@@ -20,6 +20,7 @@ +--- linux-2.6.22.14.orig/fs/ext3/balloc.c ++++ linux-2.6.22.14/fs/ext3/balloc.c +@@ -20,10 +20,88 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> @@ -545,9 +572,6 @@ Index: linux-rhel5/fs/ext3/balloc.c /* * balloc.c contains the blocks allocation and deallocation routines */ -@@ -73,6 +74,75 @@ struct ext3_group_desc * ext3_get_group_ - return desc + offset; - } 
+/* Initializes an uninitialized block bitmap if given, and returns the + * number of blocks free in the group. */ @@ -613,15 +637,23 @@ Index: linux-rhel5/fs/ext3/balloc.c + for (bit = le32_to_cpu(gdp->bg_inode_table) - start, + bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++) + ext3_set_bit(bit, bh->b_data); ++ ++ /* ++ * Also if the number of blocks within the group is ++ * less than the blocksize * 8 ( which is the size ++ * of bitmap ), set rest of the block bitmap to 1 ++ */ ++ mark_bitmap_end(EXT3_BLOCKS_PER_GROUP(sb), sb->s_blocksize * 8, ++ bh->b_data); + } + + return free_blocks - sbi->s_itb_per_group - 2; +} + /* - * Read the bitmap for a given block_group, reading into the specified - * slot in the superblock's bitmap cache. -@@ -88,7 +158,19 @@ read_block_bitmap(struct super_block *sb + * The free blocks are managed by bitmaps. A file system contains several + * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap +@@ -99,7 +177,19 @@ read_block_bitmap(struct super_block *sb desc = ext3_get_group_desc (sb, block_group, NULL); if (!desc) goto error_out; @@ -642,7 +674,7 @@ Index: linux-rhel5/fs/ext3/balloc.c if (!bh) ext3_error (sb, "read_block_bitmap", "Cannot read block bitmap - " -@@ -467,6 +549,7 @@ do_more: +@@ -571,6 +661,7 @@ do_more: desc->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + group_freed); @@ -650,7 +682,7 @@ Index: linux-rhel5/fs/ext3/balloc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -@@ -1434,8 +1517,11 @@ allocated: +@@ -1637,8 +1728,11 @@ allocated: ret_block, goal_hits, goal_attempts); spin_lock(sb_bgl_lock(sbi, group_no)); diff --git a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.9.patch b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.9.patch index b9566649d13a56aa4b25e0cbe8b39dca59d8725f..2b02b6821488f82399460e80ab75756a48cb651b 100644 --- a/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.9.patch +++ 
b/ldiskfs/kernel_patches/patches/ext3-uninit-2.6.9.patch @@ -6,10 +6,10 @@ group descriptor to avoid reading or scanning them at e2fsck time. A checksum of each group descriptor is used to ensure that corruption in the group descriptor's bit flags does not cause incorrect operation. -Index: linux-2.6.9-full/include/linux/ext3_fs.h +Index: linux-2.6.9-67.0.15/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.9-full.orig/include/linux/ext3_fs.h 2007-03-28 18:35:41.000000000 +0400 -+++ linux-2.6.9-full/include/linux/ext3_fs.h 2007-03-28 18:36:16.000000000 +0400 +--- linux-2.6.9-67.0.15.orig/include/linux/ext3_fs.h ++++ linux-2.6.9-67.0.15/include/linux/ext3_fs.h @@ -153,16 +153,22 @@ struct ext3_allocation_request { */ struct ext3_group_desc @@ -37,7 +37,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h /* * Macro-instructions used to manage group descriptors */ -@@ -572,6 +578,7 @@ static inline struct ext3_inode_info *EX +@@ -580,6 +586,7 @@ static inline struct ext3_inode_info *EX #define EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT3_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT3_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 @@ -45,7 +45,7 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT3_FEATURE_INCOMPAT_COMPRESSION 0x0001 -@@ -588,6 +595,7 @@ static inline struct ext3_inode_info *EX +@@ -596,6 +603,7 @@ static inline struct ext3_inode_info *EX EXT3_FEATURE_INCOMPAT_EXTENTS) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT3_FEATURE_RO_COMPAT_LARGE_FILE| \ @@ -53,10 +53,10 @@ Index: linux-2.6.9-full/include/linux/ext3_fs.h EXT4_FEATURE_RO_COMPAT_DIR_NLINK| \ EXT3_FEATURE_RO_COMPAT_BTREE_DIR) -Index: linux-2.6.9-full/fs/ext3/resize.c +Index: linux-2.6.9-67.0.15/fs/ext3/resize.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/resize.c 2006-03-10 
18:20:03.000000000 +0300 -+++ linux-2.6.9-full/fs/ext3/resize.c 2007-03-28 18:36:16.000000000 +0400 +--- linux-2.6.9-67.0.15.orig/fs/ext3/resize.c ++++ linux-2.6.9-67.0.15/fs/ext3/resize.c @@ -19,6 +19,7 @@ #include <linux/errno.h> #include <linux/slab.h> @@ -65,7 +65,33 @@ Index: linux-2.6.9-full/fs/ext3/resize.c #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) -@@ -807,6 +808,7 @@ int ext3_group_add(struct super_block *s +@@ -132,25 +133,6 @@ static struct buffer_head *bclean(handle + } + + /* +- * To avoid calling the atomic setbit hundreds or thousands of times, we only +- * need to use it within a single byte (to ensure we get endianness right). +- * We can use memset for the rest of the bitmap as there are no other users. +- */ +-static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +-{ +- int i; +- +- if (start_bit >= end_bit) +- return; +- +- ext3_debug("mark end bits +%d through +%d used\n", start_bit, end_bit); +- for (i = start_bit; i < ((start_bit + 7) & ~7UL); i++) +- ext3_set_bit(i, bitmap); +- if (i < end_bit) +- memset(bitmap + (i >> 3), 0xff, (end_bit - i) >> 3); +-} +- +-/* + * Set up the block and inode bitmaps, and the inode table for the new group. + * This doesn't need to be part of the main transaction, since we are only + * changing blocks outside the actual filesystem. We still do journaling to +@@ -807,6 +789,7 @@ int ext3_group_add(struct super_block *s gdp->bg_inode_table = cpu_to_le32(input->inode_table); gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); gdp->bg_free_inodes_count = cpu_to_le16(EXT3_INODES_PER_GROUP(sb)); @@ -73,19 +99,19 @@ Index: linux-2.6.9-full/fs/ext3/resize.c /* * Make the new blocks and inodes valid next. 
We do this before -Index: linux-2.6.9-full/fs/ext3/super.c +Index: linux-2.6.9-67.0.15/fs/ext3/super.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/super.c 2007-03-28 18:35:42.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/super.c 2007-03-28 18:36:16.000000000 +0400 +--- linux-2.6.9-67.0.15.orig/fs/ext3/super.c ++++ linux-2.6.9-67.0.15/fs/ext3/super.c @@ -38,6 +38,7 @@ #include <asm/uaccess.h> #include "xattr.h" #include "acl.h" +#include "group.h" - static int ext3_load_journal(struct super_block *, struct ext3_super_block *, - unsigned long journal_devnum); -@@ -1090,6 +1091,90 @@ static int ext3_setup_super(struct super + static int ext3_load_journal(struct super_block *, struct ext3_super_block *); + static int ext3_create_journal(struct super_block *, struct ext3_super_block *, +@@ -1130,6 +1131,90 @@ static int ext3_setup_super(struct super return res; } @@ -176,7 +202,7 @@ Index: linux-2.6.9-full/fs/ext3/super.c /* Called at mount-time, super-block is locked */ static int ext3_check_descriptors (struct super_block * sb) { -@@ -1139,6 +1224,13 @@ static int ext3_check_descriptors (struc +@@ -1179,6 +1264,13 @@ static int ext3_check_descriptors (struc le32_to_cpu(gdp->bg_inode_table)); return 0; } @@ -190,11 +216,11 @@ Index: linux-2.6.9-full/fs/ext3/super.c block += EXT3_BLOCKS_PER_GROUP(sb); gdp++; } -Index: linux-2.6.9-full/fs/ext3/group.h +Index: linux-2.6.9-67.0.15/fs/ext3/group.h =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/group.h 2007-02-13 18:39:59.640066087 +0300 -+++ linux-2.6.9-full/fs/ext3/group.h 2007-03-28 18:36:16.000000000 +0400 -@@ -0,0 +1,29 @@ +--- /dev/null ++++ linux-2.6.9-67.0.15/fs/ext3/group.h +@@ -0,0 +1,30 @@ +/* + * linux/fs/ext3/group.h + * @@ -223,11 +249,12 @@ Index: linux-2.6.9-full/fs/ext3/group.h +extern unsigned ext3_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, int group, + struct 
ext3_group_desc *desc); ++extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); +#endif /* _LINUX_EXT3_GROUP_H */ -Index: linux-2.6.9-full/fs/ext3/ialloc.c +Index: linux-2.6.9-67.0.15/fs/ext3/ialloc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/ialloc.c 2007-03-28 18:35:38.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/ialloc.c 2007-03-28 18:36:16.000000000 +0400 +--- linux-2.6.9-67.0.15.orig/fs/ext3/ialloc.c ++++ linux-2.6.9-67.0.15/fs/ext3/ialloc.c @@ -28,6 +28,7 @@ #include "xattr.h" @@ -245,7 +272,7 @@ Index: linux-2.6.9-full/fs/ext3/ialloc.c + * need to use it within a single byte (to ensure we get endianness right). + * We can use memset for the rest of the bitmap as there are no other users. + */ -+static void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) ++void mark_bitmap_end(int start_bit, int end_bit, char *bitmap) +{ + int i; + @@ -320,7 +347,7 @@ Index: linux-2.6.9-full/fs/ext3/ialloc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_inc(&sbi->s_freeinodes_counter); if (is_directory) -@@ -453,7 +513,7 @@ struct inode *ext3_new_inode(handle_t *h +@@ -452,7 +512,7 @@ struct inode *ext3_new_inode(handle_t *h struct ext3_sb_info *sbi; int err = 0; struct inode *ret; @@ -329,7 +356,7 @@ Index: linux-2.6.9-full/fs/ext3/ialloc.c /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) -@@ -566,11 +626,13 @@ repeat_in_this_group: +@@ -568,11 +628,13 @@ repeat_in_this_group: goto out; got: @@ -348,7 +375,7 @@ Index: linux-2.6.9-full/fs/ext3/ialloc.c err = -EIO; goto fail; } -@@ -578,13 +640,65 @@ got: +@@ -580,13 +642,65 @@ got: BUFFER_TRACE(bh2, "get_write_access"); err = ext3_journal_get_write_access(handle, bh2); if (err) goto fail; @@ -414,7 +441,7 @@ Index: linux-2.6.9-full/fs/ext3/ialloc.c spin_unlock(sb_bgl_lock(sbi, group)); BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); err = ext3_journal_dirty_metadata(handle, bh2); -@@ -606,7 
+720,7 @@ got: +@@ -608,7 +722,7 @@ got: inode->i_gid = current->fsgid; inode->i_mode = mode; @@ -423,10 +450,10 @@ Index: linux-2.6.9-full/fs/ext3/ialloc.c /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blksize = PAGE_SIZE; inode->i_blocks = 0; -Index: linux-2.6.9-full/fs/ext3/mballoc.c +Index: linux-2.6.9-67.0.15/fs/ext3/mballoc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/mballoc.c 2007-03-28 15:42:45.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/mballoc.c 2007-03-28 18:36:16.000000000 +0400 +--- linux-2.6.9-67.0.15.orig/fs/ext3/mballoc.c ++++ linux-2.6.9-67.0.15/fs/ext3/mballoc.c @@ -36,6 +36,8 @@ #include <linux/seq_file.h> #include <linux/version.h> @@ -444,7 +471,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; -@@ -928,10 +931,7 @@ static int ext3_mb_init_cache(struct pag +@@ -943,10 +946,7 @@ static int ext3_mb_init_cache(struct pag if (first_group + i >= EXT3_SB(sb)->s_groups_count) break; @@ -456,7 +483,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c err = -ENOMEM; bh[i] = sb_getblk(sb, le32_to_cpu(desc->bg_block_bitmap)); -@@ -946,7 +946,12 @@ static int ext3_mb_init_cache(struct pag +@@ -961,7 +961,12 @@ static int ext3_mb_init_cache(struct pag unlock_buffer(bh[i]); continue; } @@ -470,7 +497,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c get_bh(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); -@@ -1703,6 +1708,10 @@ static int ext3_mb_good_group(struct ext +@@ -1733,6 +1738,10 @@ static int ext3_mb_good_group(struct ext switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); @@ -481,7 +508,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c bits = ac->ac_sb->s_blocksize_bits + 1; for (i = ac->ac_2order; i <= bits; i++) if (grp->bb_counters[i] > 0) -@@ -1796,7 +1805,9 @@ repeat: +@@ -1826,7 +1835,9 @@ repeat: } ac->ac_groups_scanned++; @@ -492,7 +519,7 @@ Index: 
linux-2.6.9-full/fs/ext3/mballoc.c ext3_mb_simple_scan_group(ac, &e3b); else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe) ext3_mb_scan_aligned(ac, &e3b); -@@ -2267,12 +2278,13 @@ int ext3_mb_init_backend(struct super_bl +@@ -2306,12 +2317,13 @@ int ext3_mb_init_backend(struct super_bl i--; goto err_freebuddy; } @@ -507,8 +534,8 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c set_bit(EXT3_GROUP_INFO_NEED_INIT_BIT, &meta_group_info[j]->bb_state); -@@ -2936,9 +2948,17 @@ int ext3_mb_mark_diskspace_used(struct e - mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); +@@ -2945,9 +2957,17 @@ int ext3_mb_mark_diskspace_used(struct e + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + if (gdp->bg_flags & cpu_to_le16(EXT3_BG_BLOCK_UNINIT)) { @@ -525,7 +552,7 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len); -@@ -4303,6 +4323,7 @@ do_more: +@@ -4357,6 +4377,7 @@ do_more: spin_lock(sb_bgl_lock(sbi, block_group)); gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); @@ -533,10 +560,10 @@ Index: linux-2.6.9-full/fs/ext3/mballoc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -Index: linux-2.6.9-full/fs/ext3/balloc.c +Index: linux-2.6.9-67.0.15/fs/ext3/balloc.c =================================================================== ---- linux-2.6.9-full.orig/fs/ext3/balloc.c 2007-03-28 15:45:41.000000000 +0400 -+++ linux-2.6.9-full/fs/ext3/balloc.c 2007-03-28 18:36:16.000000000 +0400 +--- linux-2.6.9-67.0.15.orig/fs/ext3/balloc.c ++++ linux-2.6.9-67.0.15/fs/ext3/balloc.c @@ -20,6 +20,7 @@ #include <linux/quotaops.h> #include <linux/buffer_head.h> @@ -545,7 +572,7 @@ Index: linux-2.6.9-full/fs/ext3/balloc.c /* * balloc.c contains the blocks allocation and deallocation routines */ -@@ -73,6 +74,75 @@ struct 
ext3_group_desc * ext3_get_group_ +@@ -73,6 +74,83 @@ struct ext3_group_desc * ext3_get_group_ return gdp + desc; } @@ -613,6 +640,14 @@ Index: linux-2.6.9-full/fs/ext3/balloc.c + for (bit = le32_to_cpu(gdp->bg_inode_table) - start, + bit_max = bit + sbi->s_itb_per_group; bit < bit_max; bit++) + ext3_set_bit(bit, bh->b_data); ++ ++ /* ++ * Also if the number of blocks within the group is ++ * less than the blocksize * 8 ( which is the size ++ * of bitmap ), set rest of the block bitmap to 1 ++ */ ++ mark_bitmap_end(EXT3_BLOCKS_PER_GROUP(sb), sb->s_blocksize * 8, ++ bh->b_data); + } + + return free_blocks - sbi->s_itb_per_group - 2; @@ -621,7 +656,7 @@ Index: linux-2.6.9-full/fs/ext3/balloc.c /* * Read the bitmap for a given block_group, reading into the specified * slot in the superblock's bitmap cache. -@@ -88,7 +158,19 @@ read_block_bitmap(struct super_block *sb +@@ -88,7 +166,19 @@ read_block_bitmap(struct super_block *sb desc = ext3_get_group_desc (sb, block_group, NULL); if (!desc) goto error_out; @@ -642,7 +677,7 @@ Index: linux-2.6.9-full/fs/ext3/balloc.c if (!bh) ext3_error (sb, "read_block_bitmap", "Cannot read block bitmap - " -@@ -429,6 +511,7 @@ do_more: +@@ -429,6 +519,7 @@ do_more: gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + *pdquot_freed_blocks); @@ -650,7 +685,7 @@ Index: linux-2.6.9-full/fs/ext3/balloc.c spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_mod(&sbi->s_freeblocks_counter, count); -@@ -1330,8 +1413,11 @@ allocated: +@@ -1330,8 +1421,11 @@ allocated: ret_block, goal_hits, goal_attempts); spin_lock(sb_bgl_lock(sbi, group_no)); diff --git a/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-2.6-sles10.patch b/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-2.6-sles10.patch new file mode 100644 index 0000000000000000000000000000000000000000..5b5b9074d49ff1f3e0ae9cbd6ac34eb3111908f2 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-2.6-sles10.patch @@ 
-0,0 +1,33 @@ +Index: linux-2.6.16.54-0.2.5/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.16.54-0.2.5.orig/include/linux/ext3_fs.h ++++ linux-2.6.16.54-0.2.5/include/linux/ext3_fs.h +@@ -913,6 +913,13 @@ struct mmp_struct { + #define EXT3_MMP_MIN_CHECK_INTERVAL 5 + + /* ++ * Indicates that ctime should not be updated in ext3_xattr_set_handle() ++ */ ++#ifndef XATTR_NO_CTIME ++#define XATTR_NO_CTIME 0x80 ++#endif ++ ++/* + * Function prototypes + */ + +Index: linux-2.6.16.54-0.2.5/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.16.54-0.2.5.orig/fs/ext3/xattr.c ++++ linux-2.6.16.54-0.2.5/fs/ext3/xattr.c +@@ -1030,8 +1030,8 @@ ext3_xattr_set_handle(handle_t *handle, + } + if (!error) { + ext3_xattr_update_super_block(handle, inode->i_sb); +- inode->i_ctime = ext3_current_time(inode); +- ++ if (!(flags & XATTR_NO_CTIME)) ++ inode->i_ctime = ext3_current_time(inode); + error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); + if (!value) + EXT3_I(inode)->i_state &= ~EXT3_STATE_NO_EXPAND; diff --git a/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-2.6.22-vanilla.patch b/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-2.6.22-vanilla.patch new file mode 100644 index 0000000000000000000000000000000000000000..2fe2f47b9e3b610ac35328549ffd705ff6e0bef4 --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-2.6.22-vanilla.patch @@ -0,0 +1,32 @@ +Index: linux-2.6.22.14/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.22.14.orig/include/linux/ext3_fs.h ++++ linux-2.6.22.14/include/linux/ext3_fs.h +@@ -923,6 +923,13 @@ struct mmp_struct { + #define EXT3_MMP_MIN_CHECK_INTERVAL 5 + + /* ++ * Indicates that ctime should not be updated in ext3_xattr_set_handle() ++ */ ++#ifndef XATTR_NO_CTIME ++#define XATTR_NO_CTIME 0x80 ++#endif ++ ++/* + * Function prototypes + */ + 
+Index: linux-2.6.22.14/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.22.14.orig/fs/ext3/xattr.c ++++ linux-2.6.22.14/fs/ext3/xattr.c +@@ -1039,7 +1039,8 @@ ext3_xattr_set_handle(handle_t *handle, + } + if (!error) { + ext3_xattr_update_super_block(handle, inode->i_sb); +- inode->i_ctime = ext3_current_time(inode); ++ if (!(flags & XATTR_NO_CTIME)) ++ inode->i_ctime = ext3_current_time(inode); + error = ext3_mark_iloc_dirty(handle, inode, &is.iloc); + if (!value) + EXT3_I(inode)->i_state &= ~EXT3_STATE_NO_EXPAND; diff --git a/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-rhel4.patch b/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-rhel4.patch new file mode 100644 index 0000000000000000000000000000000000000000..b9e18023493983bcd016f69832e5e0d1cf114bba --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-rhel4.patch @@ -0,0 +1,65 @@ +Index: linux-2.6.9-67.0.20/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.9-67.0.20.orig/include/linux/ext3_fs.h ++++ linux-2.6.9-67.0.20/include/linux/ext3_fs.h +@@ -873,6 +873,13 @@ struct mmp_struct { + #define EXT3_MMP_MIN_CHECK_INTERVAL 5 + + /* ++ * Indicates that ctime should not be updated in ext3_xattr_set_handle() ++ */ ++#ifndef XATTR_NO_CTIME ++#define XATTR_NO_CTIME 0x80 ++#endif ++ ++/* + * Function prototypes + */ + +Index: linux-2.6.9-67.0.20/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.9-67.0.20.orig/fs/ext3/xattr.c ++++ linux-2.6.9-67.0.20/fs/ext3/xattr.c +@@ -91,7 +91,7 @@ + + static int ext3_xattr_set_handle2(handle_t *, struct inode *, + struct buffer_head *, +- struct ext3_xattr_header *); ++ struct ext3_xattr_header *, int); + + static int ext3_xattr_cache_insert(struct buffer_head *); + static struct buffer_head *ext3_xattr_cache_find(handle_t *, struct inode *, +@@ -1215,12 +1215,12 @@ skip_replace: 
+ /* This block is now empty. */ + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ +- error = ext3_xattr_set_handle2(handle, inode, bh, NULL); ++ error = ext3_xattr_set_handle2(handle, inode, bh, NULL, flags); + } else { + ext3_xattr_rehash(header, here); + if (bh && header == HDR(bh)) + unlock_buffer(bh); /* we were modifying in-place. */ +- error = ext3_xattr_set_handle2(handle, inode, bh, header); ++ error = ext3_xattr_set_handle2(handle, inode, bh, header, flags); + } + + cleanup: +@@ -1237,7 +1237,7 @@ cleanup: + static int + ext3_xattr_set_handle2(handle_t *handle, struct inode *inode, + struct buffer_head *old_bh, +- struct ext3_xattr_header *header) ++ struct ext3_xattr_header *header, int flags) + { + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; +@@ -1311,7 +1311,8 @@ getblk_failed: + + /* Update the inode. */ + EXT3_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; +- inode->i_ctime = ext3_current_time(inode); ++ if (!(flags & XATTR_NO_CTIME)) ++ inode->i_ctime = ext3_current_time(inode); + ext3_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + handle->h_sync = 1; diff --git a/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-suse.patch b/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-suse.patch new file mode 100644 index 0000000000000000000000000000000000000000..63015849ac11395955261ae36187da841cd6caba --- /dev/null +++ b/ldiskfs/kernel_patches/patches/ext3-xattr-no-update-ctime-suse.patch @@ -0,0 +1,59 @@ +Index: linux-2.6.5-7.311/include/linux/ext3_fs.h +=================================================================== +--- linux-2.6.5-7.311.orig/include/linux/ext3_fs.h ++++ linux-2.6.5-7.311/include/linux/ext3_fs.h +@@ -752,6 +752,13 @@ struct dir_private_info { + #define ERR_BAD_DX_DIR -75000 + + /* ++ * Indicates that ctime should not be updated in ext3_xattr_set_handle() ++ */ ++#ifndef XATTR_NO_CTIME ++#define XATTR_NO_CTIME 0x80 ++#endif ++ ++/* + * 
Function prototypes + */ + +Index: linux-2.6.5-7.311/fs/ext3/xattr.c +=================================================================== +--- linux-2.6.5-7.311.orig/fs/ext3/xattr.c ++++ linux-2.6.5-7.311/fs/ext3/xattr.c +@@ -89,7 +89,7 @@ + + static int ext3_xattr_set_handle2(handle_t *, struct inode *, + struct buffer_head *, +- struct ext3_xattr_header *); ++ struct ext3_xattr_header *, int); + + static int ext3_xattr_cache_insert(struct buffer_head *); + static struct buffer_head *ext3_xattr_cache_find(struct inode *, +@@ -1302,7 +1302,7 @@ skip_replace: + } + error = ext3_xattr_set_handle2(handle, inode, bh, + IS_LAST_ENTRY(ENTRY(header+1)) ? +- NULL : header); ++ NULL : header, flags); + + cleanup: + brelse(bh); +@@ -1318,7 +1318,7 @@ cleanup: + static int + ext3_xattr_set_handle2(handle_t *handle, struct inode *inode, + struct buffer_head *old_bh, +- struct ext3_xattr_header *header) ++ struct ext3_xattr_header *header, int flags) + { + struct super_block *sb = inode->i_sb; + struct buffer_head *new_bh = NULL; +@@ -1401,7 +1401,8 @@ getblk_failed: + + /* Update the inode. */ + EXT3_I(inode)->i_file_acl = new_bh ? 
new_bh->b_blocknr : 0; +- inode->i_ctime = CURRENT_TIME; ++ if (!(flags & XATTR_NO_CTIME)) ++ inode->i_ctime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); + if (IS_SYNC(inode)) + handle->h_sync = 1; diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series index de9e3e3b9d0088c37cf6d8e87d8f6eeb99f22c1d..68227ca1440be173bd774409515072d8f4bf870d 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel4.series @@ -23,3 +23,6 @@ ext3-unlink-race.patch ext3-mmp-2.6-rhel4.patch ext3-fiemap-2.6-sles10.patch ext3-external-journal-2.6.9.patch +ext3-max-dir-size.patch +ext3-print-inum-in-htree-warning.patch +ext3-xattr-no-update-ctime-rhel4.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series index 163bf25e57a88e73c7a11892c62cccd1ce3972c4..f0eb3a97fc203b62bd7a9bfd79c0d0004895c3f8 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel5.series @@ -13,8 +13,13 @@ ext3-filterdata-sles10.patch ext3-uninit-2.6.18.patch ext3-nanosecond-2.6.18-vanilla.patch ext3-inode-version-2.6.18-vanilla.patch +ext3-ea-expand-lose-block.patch ext3-mmp-2.6.18-vanilla.patch ext3-unlink-race.patch ext3-fiemap-2.6.18-vanilla.patch ext3-statfs-2.6-rhel5.patch ext3-lookup-dotdot-2.6.9.patch +ext3-max-dir-size.patch +ext3-print-inum-in-htree-warning.patch +ext3-block-bitmap-validation-2.6-rhel5.patch +ext3-xattr-no-update-ctime-2.6.22-vanilla.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series index aab3e71dc4db8b7c11a312bde4e9a9a9a93519be..779e1346940bf495f19b8c8aa3f99800c264ac8e 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles10.series @@ -18,7 +18,12 @@ 
ext3-disable-write-bar-by-default-2.6-sles10.patch ext3-uninit-2.6-sles10.patch ext3-nanosecond-2.6-sles10.patch ext3-inode-version-2.6-sles10.patch +ext3-ea-expand-lose-block.patch ext3-mmp-2.6-sles10.patch ext3-fiemap-2.6-sles10.patch ext3-statfs-2.6-sles10.patch ext3-lookup-dotdot-2.6.9.patch +ext3-max-dir-size.patch +ext3-print-inum-in-htree-warning.patch +ext3-block-bitmap-validation-2.6-sles10.patch +ext3-xattr-no-update-ctime-2.6-sles10.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series index 2b89fb422e80ab13543c24cbfc4d3624541e2929..1f728c01b646ec3003321409baae410618c97416 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-suse.series @@ -22,3 +22,6 @@ ext3-uninit-2.6-suse.patch ext3-nanosecond-2.6-suse.patch ext3-fiemap-stub-suse.patch ext3-external-journal-2.6.5.patch +ext3-max-dir-size-2.6.5-suse.patch +ext3-print-inum-in-htree-warning.patch +ext3-xattr-no-update-ctime-suse.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series index 2fcdb4832ab295b8cd5b73db4c86ccfadf308a2b..a74d105ec1b35cd30ba143d487525c8f24856f73 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6.18-vanilla.series @@ -14,7 +14,11 @@ ext3-16tb-overflow-fixes.patch ext3-uninit-2.6.18.patch ext3-nanosecond-2.6.18-vanilla.patch ext3-inode-version-2.6.18-vanilla.patch +ext3-ea-expand-lose-block.patch ext3-mmp-2.6.18-vanilla.patch ext3-handle-directory-corruption-better.patch ext3-fiemap-2.6.18-vanilla.patch ext3-lookup-dotdot-2.6.9.patch +ext3-max-dir-size.patch +ext3-print-inum-in-htree-warning.patch +ext3-xattr-no-update-ctime-2.6.22-vanilla.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6.22-vanilla.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6.22-vanilla.series index 
5bf60dcb88297cbed56a9b2ffb3496f3f6a97c3e..46226a5129bd65cd0a5897491d46cf909decb919 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6.22-vanilla.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6.22-vanilla.series @@ -13,9 +13,13 @@ ext3-filterdata-sles10.patch ext3-uninit-2.6.22-vanilla.patch ext3-nanosecond-2.6.22-vanilla.patch ext3-inode-version-2.6.18-vanilla.patch +ext3-ea-expand-lose-block.patch ext3-mmp-2.6.22-vanilla.patch ext3-fiemap-2.6.22-vanilla.patch ext3-statfs-2.6.22.patch ext3-lookup-dotdot-2.6.9.patch ext3-unlink-race.patch ext3-export-journal-api.patch +ext3-max-dir-size.patch +ext3-print-inum-in-htree-warning.patch +ext3-xattr-no-update-ctime-2.6.22-vanilla.patch diff --git a/libcfs/.empty b/libcfs/.empty new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/libcfs/autoconf/.empty/.empty b/libcfs/autoconf/.empty/.empty new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/lnet/ChangeLog b/lnet/ChangeLog index 30ec25f10146745021f17a6a5cfe42ce7914d7c1..7241c42fa62c3af517c3e6d41586aaae1f22625f 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -1,3 +1,42 @@ +tbd Sun Microsystems, Inc. + * version 1.6.6 + * Support for networks: + socklnd - any kernel supported by Lustre, + qswlnd - Qsnet kernel modules 5.20 and later, + openiblnd - IbGold 1.8.2, + o2iblnd - OFED 1.1, 1.2.0, 1.2.5, and 1.3 + viblnd - Voltaire ibhost 3.4.5 and later, + ciblnd - Topspin 3.2.0, + iiblnd - Infiniserv 3.3 + PathBits patch, + gmlnd - GM 2.1.22 and later, + mxlnd - MX 1.2.1 or later, + ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x + +Severity : +Bugzilla : +Description: +Details : + +Severity : normal +Bugzilla : 16102 +Description: LNET Selftest results in Soft lockup on OSS CPU +Details : only hits when 8 or more o2ib clients involved and a session is + torn down with 'lst end_session' without preceding 'lst stop'. 
+ +Severity : minor +Bugzilla : 16321 +Description: concurrent_sends in IB LNDs should not be changeable at run time +Details : concurrent_sends in IB LNDs should not be changeable at run time + +Severity : normal +Bugzilla : 15272 +Description: ptl_send_rpc hits LASSERT when ptl_send_buf fails +Details : only hits under out-of-memory situations + + +------------------------------------------------------------------------------- + + 04-26-2008 Sun Microsystems, Inc. * version 1.6.5 * Support for networks: @@ -12,6 +51,16 @@ mxlnd - MX 1.2.1 or later, ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x +Severity : normal +Bugzilla : 14322 +Description: excessive debug information removed +Details : excessive debug information removed + +Severity : major +Bugzilla : 15712 +Description: ksocknal_create_conn() hit ASSERTION during connection race +Details : ksocknal_create_conn() hit ASSERTION during connection race + Severity : major Bugzilla : 13983 Description: ksocknal_send_hello() hit ASSERTION while connecting race diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index c016f84086342808dda91722922558103381cfc0..5fab577e300709877a49ab7edbd12fe3e9f064a3 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -576,6 +576,25 @@ else O2IBLND="" O2IBCPPFLAGS="" ]) + # we know at this point that the found OFED source is good + O2IB_SYMVER="" + if test $ENABLEO2IB -eq 3 ; then + # OFED default rpm not handle sles10 Modules.symvers name + for name in Module.symvers Modules.symvers; do + if test -f $O2IBPATH/$name; then + O2IB_SYMVER=$name; + break; + fi + done + if test -n $O2IB_SYMVER ; then + AC_MSG_NOTICE([adding $O2IBPATH/Module.symvers to $PWD/$SYMVERFILE]) + # strip out the existing symbols versions first + egrep -v $(echo $(awk '{ print $2 }' $O2IBPATH/$O2IB_SYMVER) | tr ' ' '|') $PWD/$SYMVERFILE > $PWD/$SYMVERFILE.old + cat $PWD/$SYMVERFILE.old $O2IBPATH/$O2IB_SYMVER > $PWD/$SYMVERFILE + else + AC_MSG_ERROR([an external 
source tree was specified for o2iblnd however I could not find a $O2IBPATH/Module.symvers there]) + fi + fi # version checking is a hack and isn't reliable, # we need verify it with each new ofed release @@ -631,7 +650,7 @@ AC_ARG_WITH([openib], if test $ENABLEOPENIB -eq 0; then AC_MSG_RESULT([disabled]) elif test ! \( -f ${OPENIBPATH}/include/ts_ib_core.h -a \ - -f ${OPENIBPATH}/include/ts_ib_cm.h -a\ + -f ${OPENIBPATH}/include/ts_ib_cm.h -a \ -f ${OPENIBPATH}/include/ts_ib_sa_client.h \); then AC_MSG_RESULT([no]) case $ENABLEOPENIB in @@ -1058,11 +1077,11 @@ AC_DEFINE(HAVE_SHOW_TASK, 1, [show_task is exported]) # check userland __u64 type AC_DEFUN([LN_U64_LONG_LONG], -[AC_MSG_CHECKING([check u64 is long long type]) +[AC_MSG_CHECKING([u64 is long long type]) tmp_flags="$CFLAGS" CFLAGS="$CFLAGS -Werror" AC_COMPILE_IFELSE([ - #include <asm/types.h> + #include <linux/types.h> int main(void) { unsigned long long *data1; __u64 *data2; @@ -1075,10 +1094,59 @@ AC_COMPILE_IFELSE([ AC_DEFINE(HAVE_U64_LONG_LONG, 1, [__u64 is long long type]) ],[ + AC_MSG_RESULT([no]) +]) +CFLAGS="$tmp_flags" +]) + +# check userland size_t type +AC_DEFUN([LN_SIZE_T_LONG], +[AC_MSG_CHECKING([size_t is unsigned long type]) +tmp_flags="$CFLAGS" +CFLAGS="$CFLAGS -Werror" +AC_COMPILE_IFELSE([ + #include <linux/types.h> + int main(void) { + unsigned long *data1; + size_t *data2; + + data1 = data2; + return 0; + } +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_SIZE_T_LONG, 1, + [size_t is long type]) +],[ + AC_MSG_RESULT([no]) +]) +CFLAGS="$tmp_flags" +]) + +AC_DEFUN([LN_SSIZE_T_LONG], +[AC_MSG_CHECKING([ssize_t is signed long type]) +tmp_flags="$CFLAGS" +CFLAGS="$CFLAGS -Werror" +AC_COMPILE_IFELSE([ + #include <linux/types.h> + int main(void) { + long *data1; + ssize_t *data2; + + data1 = data2; + return 0; + } +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_SSIZE_T_LONG, 1, + [ssize_t is long type]) +],[ + AC_MSG_RESULT([no]) ]) CFLAGS="$tmp_flags" ]) + # LN_TASKLIST_LOCK # 2.6.18 remove tasklist_lock 
export AC_DEFUN([LN_TASKLIST_LOCK], @@ -1229,6 +1297,8 @@ LN_STRUCT_PAGE_LIST LN_STRUCT_SIGHAND LN_FUNC_SHOW_TASK LN_U64_LONG_LONG +LN_SSIZE_T_LONG +LN_SIZE_T_LONG # 2.6.18 LN_TASKLIST_LOCK # 2.6.19 diff --git a/lnet/include/libcfs/kp30.h b/lnet/include/libcfs/kp30.h index 0869f67bc1dcba9b74651ad72d3c6612bc137924..dcd599b7be57d3d3a3072cd991f55b9784d52d0f 100644 --- a/lnet/include/libcfs/kp30.h +++ b/lnet/include/libcfs/kp30.h @@ -163,13 +163,13 @@ do { \ "%s:%d\n", s, __FILE__, __LINE__); \ break; \ } \ + libcfs_kmem_dec((ptr), s); \ + CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ + s, (ptr), atomic_read(&libcfs_kmemory)); \ if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ cfs_free_large(ptr); \ else \ cfs_free(ptr); \ - libcfs_kmem_dec((ptr), s); \ - CDEBUG(D_MALLOC, "kfreed '" #ptr "': %d at %p (tot %d).\n", \ - s, (ptr), atomic_read(&libcfs_kmemory)); \ } while (0) /******************************************************************************/ diff --git a/lnet/include/libcfs/libcfs.h b/lnet/include/libcfs/libcfs.h index 82d293b6cde7546581ea08bcb388d7b8b873e9db..80518b1ee5dd3671c41e8fce2facd94abd5c626d 100644 --- a/lnet/include/libcfs/libcfs.h +++ b/lnet/include/libcfs/libcfs.h @@ -514,7 +514,7 @@ static inline void cfs_slow_warning(cfs_time_t now, int seconds, char *msg) { if (cfs_time_after(cfs_time_current(), cfs_time_add(now, cfs_time_seconds(15)))) - CERROR("slow %s %lu sec\n", msg, + CERROR("slow %s "CFS_TIME_T" sec\n", msg, cfs_duration_sec(cfs_time_sub(cfs_time_current(),now))); } diff --git a/lnet/include/libcfs/linux/kp30.h b/lnet/include/libcfs/linux/kp30.h index 7de45d6ee7eb4db1c53b54db837ecd896ff3289a..19355ed6c0bcae75b2b19ec12a3d60845e2a4eb1 100644 --- a/lnet/include/libcfs/linux/kp30.h +++ b/lnet/include/libcfs/linux/kp30.h @@ -346,23 +346,30 @@ extern int lwt_snapshot (cycles_t *now, int *ncpu, int *total_size, # define LPD64 "%Ld" # define LPX64 "%#Lx" # define LPF64 "L" -# define LPSZ "%lu" -# define LPSSZ "%ld" #elif (_LWORDSIZE == 
32) # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" # define LPF64 "L" -# define LPSZ "%u" -# define LPSSZ "%d" #elif (_LWORDSIZE == 64) # define LPU64 "%lu" # define LPD64 "%ld" # define LPX64 "%#lx" # define LPF64 "l" +#endif + +#ifdef HAVE_SIZE_T_LONG # define LPSZ "%lu" +#else +# define LPSZ "%u" +#endif + +#ifdef HAVE_SSIZE_T_LONG # define LPSSZ "%ld" +#else +# define LPSSZ "%d" #endif + #ifndef LPU64 # error "No word size defined" #endif diff --git a/lnet/include/libcfs/linux/libcfs.h b/lnet/include/libcfs/linux/libcfs.h index a313389109e004afa036427e15ec1667f047a2c9..c873c2fc2fdd82dc6401c56cbea00f8564537d80 100644 --- a/lnet/include/libcfs/linux/libcfs.h +++ b/lnet/include/libcfs/linux/libcfs.h @@ -15,8 +15,8 @@ #endif #include <stdarg.h> -#include <libcfs/linux/linux-mem.h> #include <libcfs/linux/linux-time.h> +#include <libcfs/linux/linux-mem.h> #include <libcfs/linux/linux-prim.h> #include <libcfs/linux/linux-lock.h> #include <libcfs/linux/linux-fs.h> diff --git a/lnet/include/libcfs/linux/linux-fs.h b/lnet/include/libcfs/linux/linux-fs.h index fa162c0e8d4fc5cb345e2c964285089d657cb9d2..7573322dfe62b6b8d774cd9873d5c22a0f5727c0 100644 --- a/lnet/include/libcfs/linux/linux-fs.h +++ b/lnet/include/libcfs/linux/linux-fs.h @@ -46,6 +46,7 @@ typedef struct file cfs_file_t; typedef struct dentry cfs_dentry_t; +typedef struct dirent64 cfs_dirent_t; #ifdef __KERNEL__ #define cfs_filp_size(f) (i_size_read((f)->f_dentry->d_inode)) diff --git a/lnet/include/libcfs/linux/linux-time.h b/lnet/include/libcfs/linux/linux-time.h index 626defd4707ac1df8f0572fd7b7fcd435298a68b..3d4cdf54201bb0e7263938ad2bd03bb75eb0792c 100644 --- a/lnet/include/libcfs/linux/linux-time.h +++ b/lnet/include/libcfs/linux/linux-time.h @@ -307,7 +307,11 @@ static inline int cfs_time_beforeq_64(__u64 t1, __u64 t2) /* * Liblustre. time(2) based implementation. 
*/ + +#define CFS_TIME_T "%lu" + #include <libcfs/user-time.h> + #endif /* __KERNEL__ */ /* __LIBCFS_LINUX_LINUX_TIME_H__ */ diff --git a/lnet/include/libcfs/user-prim.h b/lnet/include/libcfs/user-prim.h index 19396406a2d923b556d24d40528551238862f5f7..43c1aeb4eb36895a2c2c2e05e9155648e9290b5d 100644 --- a/lnet/include/libcfs/user-prim.h +++ b/lnet/include/libcfs/user-prim.h @@ -52,28 +52,6 @@ #include <pthread.h> #endif -#ifndef PAGE_SIZE - -#define PAGE_SIZE (getpagesize()) -static __inline__ int getpageshift() -{ - int pagesize = getpagesize(); -#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)) - /* unsigned int is 32 bits on all our architectures */ - return (__builtin_clz(pagesize) ^ 31); -#else - register int pageshift = -1; - while (pagesize) { pagesize >>= 1; pageshift++; } - return pageshift; -#endif -} - -#undef PAGE_MASK -#define PAGE_MASK (~(PAGE_SIZE-1)) -#undef PAGE_SHIFT -#define PAGE_SHIFT (getpageshift()) - -#endif /* * Wait Queue. No-op implementation. @@ -134,10 +112,21 @@ struct page { typedef struct page cfs_page_t; +#ifndef PAGE_SIZE + +/* 4K */ +#define CFS_PAGE_SHIFT 12 +#define CFS_PAGE_SIZE (1UL << CFS_PAGE_SHIFT) +#define CFS_PAGE_MASK (~((__u64)CFS_PAGE_SIZE-1)) + +#else + #define CFS_PAGE_SIZE PAGE_SIZE #define CFS_PAGE_SHIFT PAGE_SHIFT #define CFS_PAGE_MASK (~((__u64)CFS_PAGE_SIZE-1)) +#endif + cfs_page_t *cfs_alloc_page(unsigned int flags); void cfs_free_page(cfs_page_t *pg); void *cfs_page_address(cfs_page_t *pg); diff --git a/lnet/include/libcfs/user-time.h b/lnet/include/libcfs/user-time.h index 3fb801da9d3d5c68716abc2633eeaf32a39482f0..874b7da4fa100f110fe15ef26a60484f56dcf7fb 100644 --- a/lnet/include/libcfs/user-time.h +++ b/lnet/include/libcfs/user-time.h @@ -183,7 +183,10 @@ static inline cfs_duration_t cfs_time_sub(cfs_time_t t1, cfs_time_t t2) #define cfs_time_before_64 cfs_time_before #define cfs_time_beforeq_64 cfs_time_beforeq -#define CFS_TIME_T "%lu" +#ifndef CFS_TIME_T +#define CFS_TIME_T "%u" +#endif + 
#define CFS_DURATION_T "%ld" /* !__KERNEL__ */ diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index 37dc5d49f1acc41942817d628b3df7fafec07be9..a93354babb31969154fed7a3d371572e8957c751 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -548,6 +548,7 @@ lnet_remotenet_t *lnet_find_net_locked (__u32 net); int lnet_islocalnid(lnet_nid_t nid); int lnet_islocalnet(__u32 net); +void lnet_build_unlink_event(lnet_libmd_t *md, lnet_event_t *ev); void lnet_enq_event_locked(lnet_eq_t *eq, lnet_event_t *ev); void lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, unsigned int offset, unsigned int len); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index c03810e89d66a37bdd58a42e4df5b128fc4e7772..7a06a286e3a21f29f8e35e164b50b2c0e81df3a3 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -320,7 +320,7 @@ typedef struct lnet_lnd * for success and do NOT give back a receive credit; that has to wait * until lnd_recv() gets called. On failure return < 0 and * release resources; lnd_recv() will not be called. 
*/ - int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, + int (*lnd_eager_recv)(struct lnet_ni *ni, void *private, lnet_msg_t *msg, void **new_privatep); /* notification of peer health */ diff --git a/lnet/include/lnet/types.h b/lnet/include/lnet/types.h index f459b1e41709b4fc92b893a57596289d216429d4..d080c9690016f8d7f6e534931f72013e13958290 100644 --- a/lnet/include/lnet/types.h +++ b/lnet/include/lnet/types.h @@ -1,9 +1,12 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ #ifndef __LNET_TYPES_H__ #define __LNET_TYPES_H__ #include <libcfs/libcfs.h> -#define LNET_RESERVED_PORTAL 0 /* portals reserved for lnet's own use */ +#define LNET_RESERVED_PORTAL 0 /* portals reserved for lnet's own use */ typedef __u64 lnet_nid_t; typedef __u32 lnet_pid_t; @@ -35,7 +38,7 @@ typedef lnet_handle_any_t lnet_handle_me_t; static inline int LNetHandleIsEqual (lnet_handle_any_t h1, lnet_handle_any_t h2) { - return (h1.cookie == h2.cookie); + return (h1.cookie == h2.cookie); } typedef struct { @@ -92,7 +95,7 @@ typedef struct { #define LNET_MD_TRUNCATE (1 << 4) #define LNET_MD_ACK_DISABLE (1 << 5) #define LNET_MD_IOVEC (1 << 6) -#define LNET_MD_MAX_SIZE (1 << 7) +#define LNET_MD_MAX_SIZE (1 << 7) #define LNET_MD_KIOV (1 << 8) /* For compatibility with Cray Portals */ @@ -104,9 +107,9 @@ typedef struct { typedef struct iovec lnet_md_iovec_t; typedef struct { - cfs_page_t *kiov_page; - unsigned int kiov_len; - unsigned int kiov_offset; + cfs_page_t *kiov_page; + unsigned int kiov_len; + unsigned int kiov_offset; } lnet_kiov_t; typedef enum { @@ -114,13 +117,13 @@ typedef enum { LNET_EVENT_PUT, LNET_EVENT_REPLY, LNET_EVENT_ACK, - LNET_EVENT_SEND, - LNET_EVENT_UNLINK, + LNET_EVENT_SEND, + LNET_EVENT_UNLINK, } lnet_event_kind_t; -#define LNET_SEQ_BASETYPE long +#define LNET_SEQ_BASETYPE long typedef unsigned LNET_SEQ_BASETYPE lnet_seq_t; -#define LNET_SEQ_GT(a,b) (((signed LNET_SEQ_BASETYPE)((a) - (b))) 
> 0) +#define LNET_SEQ_GT(a,b) (((signed LNET_SEQ_BASETYPE)((a) - (b))) > 0) /* XXX * cygwin need the pragma line, not clear if it's needed in other places. diff --git a/lnet/klnds/iiblnd/iiblnd_modparams.c b/lnet/klnds/iiblnd/iiblnd_modparams.c index 908314112ceded7b03cd07764763e949fca0ec04..dfe4568dff0361164a8409636e278cfe1e351a5f 100644 --- a/lnet/klnds/iiblnd/iiblnd_modparams.c +++ b/lnet/klnds/iiblnd/iiblnd_modparams.c @@ -212,7 +212,7 @@ static cfs_sysctl_table_t kibnal_ctl_table[] = { .procname = "concurrent_sends", .data = &concurrent_sends, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0444, .proc_handler = &proc_dointvec }, {0} @@ -221,7 +221,7 @@ static cfs_sysctl_table_t kibnal_ctl_table[] = { static cfs_sysctl_table_t kibnal_top_ctl_table[] = { { .ctl_name = 203, - .procname = "openibnal", + .procname = "iibnal", .data = NULL, .maxlen = 0, .mode = 0555, diff --git a/lnet/klnds/o2iblnd/Makefile.in b/lnet/klnds/o2iblnd/Makefile.in index 52a194d19dc611e70d89873358ab0bfe0576f393..569c266a2d68c1e9591839548c35407b69adac03 100644 --- a/lnet/klnds/o2iblnd/Makefile.in +++ b/lnet/klnds/o2iblnd/Makefile.in @@ -1,6 +1,8 @@ MODULES := ko2iblnd ko2iblnd-objs := o2iblnd.o o2iblnd_cb.o o2iblnd_modparams.o -EXTRA_POST_CFLAGS := @O2IBCPPFLAGS@ +# Need to make sure we use PRE, not POST here so that an external OFED +# source pool overrides any in-kernel OFED sources +EXTRA_PRE_CFLAGS := @O2IBCPPFLAGS@ @INCLUDE_RULES@ diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index b8a994a4122d89c43d3619620211240874e15561..5d4001022d05e3d8df38debe0e18568e73c483f5 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -191,14 +191,14 @@ kiblnd_unpack_msg(kib_msg_t *msg, int nob) __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrags); } - + n = msg->ibm_u.putack.ibpam_rd.rd_nfrags; if (n <= 0 || n > IBLND_MAX_RDMA_FRAGS) { CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", n, 
IBLND_MAX_RDMA_FRAGS); return -EPROTO; } - + if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) { CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])); @@ -314,7 +314,7 @@ kiblnd_create_peer (lnet_ni_t *ni, kib_peer_t **peerp, lnet_nid_t nid) /* always called with a ref on ni, which prevents ni being shutdown */ LASSERT (net->ibn_shutdown == 0); - + /* npeers only grows with the global lock held */ atomic_inc(&net->ibn_npeers); @@ -742,7 +742,7 @@ kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, int state) CERROR("Can't request completion notificiation: %d\n", rc); goto failed_2; } - + memset(init_qp_attr, 0, sizeof(*init_qp_attr)); init_qp_attr->event_handler = kiblnd_qp_event; init_qp_attr->qp_context = conn; @@ -1185,20 +1185,20 @@ kiblnd_alloc_tx_descs (lnet_ni_t *ni) return -ENOMEM; } #else - LIBCFS_ALLOC(tx->tx_wrq, - (1 + IBLND_MAX_RDMA_FRAGS) * + LIBCFS_ALLOC(tx->tx_wrq, + (1 + IBLND_MAX_RDMA_FRAGS) * sizeof(*tx->tx_wrq)); if (tx->tx_wrq == NULL) return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_sge, - (1 + IBLND_MAX_RDMA_FRAGS) * + + LIBCFS_ALLOC(tx->tx_sge, + (1 + IBLND_MAX_RDMA_FRAGS) * sizeof(*tx->tx_sge)); if (tx->tx_sge == NULL) return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_rd, - offsetof(kib_rdma_desc_t, + + LIBCFS_ALLOC(tx->tx_rd, + offsetof(kib_rdma_desc_t, rd_frags[IBLND_MAX_RDMA_FRAGS])); if (tx->tx_rd == NULL) return -ENOMEM; diff --git a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index f7926c7e34de614acc69dfd2100a2f4d5c67a50e..7da4289087b9d1aaa2e99b8650a31a36a0cb9b47 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -497,13 +497,13 @@ static inline kib_conn_t * kiblnd_get_conn_locked (kib_peer_t *peer) { LASSERT (!list_empty(&peer->ibp_conns)); - + /* just return the first connection */ return list_entry(peer->ibp_conns.next, kib_conn_t, ibc_list); } static inline int -kiblnd_send_keepalive(kib_conn_t *conn) 
+kiblnd_send_keepalive(kib_conn_t *conn) { return (*kiblnd_tunables.kib_keepalive > 0) && time_after(jiffies, conn->ibc_last_send + diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index dfe69198cd1222bd35c9bd2f21ec1e93680dd169..7881b499e059a0ec7585d419e7a5626141874ac8 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -505,7 +505,7 @@ kiblnd_rx_complete (kib_rx_t *rx, int status, int nob) LASSERT (net != NULL); LASSERT (rx->rx_nob < 0); /* was posted */ rx->rx_nob = 0; /* isn't now */ - + if (conn->ibc_state > IBLND_CONN_ESTABLISHED) goto ignore; @@ -1258,7 +1258,7 @@ kiblnd_init_rdma (lnet_ni_t *ni, kib_tx_t *tx, int type, dstfrag++; dstidx++; } - + tx->tx_nwrq++; } @@ -1872,7 +1872,7 @@ kiblnd_peer_notify (kib_peer_t *peer) time_t last_alive = 0; int error = 0; unsigned long flags; - + read_lock_irqsave(&kiblnd_data.kib_global_lock, flags); if (list_empty(&peer->ibp_conns) && @@ -1881,14 +1881,14 @@ kiblnd_peer_notify (kib_peer_t *peer) peer->ibp_error != 0) { error = peer->ibp_error; peer->ibp_error = 0; - + last_alive = cfs_time_current_sec() - cfs_duration_sec(cfs_time_current() - peer->ibp_last_alive); } - + read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags); - + if (error != 0) lnet_notify(peer->ibp_ni, peer->ibp_nid, 0, last_alive); @@ -2780,14 +2780,14 @@ kiblnd_cm_callback(struct rdma_cm_id *cmid, struct rdma_cm_event *event) kiblnd_conn_decref(conn); return 0; - case RDMA_CM_EVENT_DEVICE_REMOVAL: - LCONSOLE_ERROR_MSG(0x131, + case RDMA_CM_EVENT_DEVICE_REMOVAL: + LCONSOLE_ERROR_MSG(0x131, "Received notification of device removal\n" "Please shutdown LNET to allow this to proceed\n"); /* Can't remove network from underneath LNET for now, so I have * to ignore this */ - return 0; - } + return 0; + } } int @@ -2807,7 +2807,7 @@ kiblnd_check_txs (kib_conn_t *conn, struct list_head *txs) } else { LASSERT (!tx->tx_queued); LASSERT (tx->tx_waiting || tx->tx_sending != 0); - } + } if 
(time_after_eq (jiffies, tx->tx_deadline)) { timed_out = 1; diff --git a/lnet/klnds/o2iblnd/o2iblnd_modparams.c b/lnet/klnds/o2iblnd/o2iblnd_modparams.c index ce65801a2dfea9e984ac7690c9bfdabef6bff014..0624aa944d0b27b8fb29109319b7f80397fa8ac1 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_modparams.c +++ b/lnet/klnds/o2iblnd/o2iblnd_modparams.c @@ -199,7 +199,7 @@ static cfs_sysctl_table_t kiblnd_ctl_table[] = { .procname = "concurrent_sends", .data = &concurrent_sends, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0444, .proc_handler = &proc_dointvec }, { diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index a04bb487452115391ae2abe160d4c4c09817902e..710ebd2e94fc82333879432d03f041c13ecd45a3 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -1045,7 +1045,9 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, conn->ksnc_peer = NULL; conn->ksnc_route = NULL; conn->ksnc_sock = sock; - atomic_set (&conn->ksnc_sock_refcount, 1); /* 1 ref for conn */ + /* 2 ref, 1 for conn, another extra ref prevents socket + * being closed before establishment of connection */ + atomic_set (&conn->ksnc_sock_refcount, 2); conn->ksnc_type = type; ksocknal_lib_save_callback(sock, conn); atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */ @@ -1154,6 +1156,14 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, } } + if (peer->ksnp_closing || + (active && route->ksnr_deleted)) { + /* peer/route got closed under me */ + rc = -ESTALE; + warn = "peer/route removed"; + goto failed_2; + } + if (peer->ksnp_proto == NULL) { /* Never connected before. * NB recv_hello may have returned EPROTO to signal my peer @@ -1191,40 +1201,6 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, goto failed_2; } - write_unlock_bh(global_lock); - - /* No more race (or won the race), has compatible version with peer */ - if (active) { - /* additional routes after interface exchange? 
*/ - ksocknal_create_routes(peer, conn->ksnc_port, - hello->kshm_ips, hello->kshm_nips); - } else { - hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, - hello->kshm_nips); - rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); - } - - /* setup the socket AFTER I've received hello (it disables - * SO_LINGER). I might call back to the acceptor who may want - * to send a protocol version response and then close the - * socket; this ensures the socket only tears down after the - * response has been sent. */ - if (rc == 0) - rc = ksocknal_lib_setup_sock(sock); - - write_lock_bh(global_lock); - - if (rc != 0) - goto failed_2; - - if (peer->ksnp_closing || - (active && route->ksnr_deleted)) { - /* peer/route got closed under me */ - rc = -ESTALE; - warn = "peer/route removed"; - goto failed_2; - } - /* Refuse to duplicate an existing connection, unless this is a * loopback connection */ if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) { @@ -1301,21 +1277,17 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, ksocknal_queue_tx_locked (tx, conn); } - /* NB my callbacks block while I hold ksnd_global_lock */ - ksocknal_lib_set_callback(sock, conn); - - if (!active) - peer->ksnp_accepting--; + write_unlock_bh (global_lock); - write_unlock_bh(global_lock); + /* We've now got a new connection. Any errors from here on are just + * like "normal" comms errors and we close the connection normally. + * NB (a) we still have to send the reply HELLO for passive + * connections, + * (b) normal I/O on the conn is blocked until I setup and call the + * socket callbacks. + */ - if (ksocknal_connsock_addref(conn) == 0) { - ksocknal_lib_bind_irq (irq); - /* Allow I/O to proceed. 
*/ - ksocknal_read_callback(conn); - ksocknal_write_callback(conn); - ksocknal_connsock_decref(conn); - } + ksocknal_lib_bind_irq (irq); CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d" " incarnation:"LPD64" sched[%d]/%d\n", @@ -1324,11 +1296,51 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route, conn->ksnc_port, incarnation, (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq); + if (active) { + /* additional routes after interface exchange? */ + ksocknal_create_routes(peer, conn->ksnc_port, + hello->kshm_ips, hello->kshm_nips); + } else { + hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips, + hello->kshm_nips); + rc = ksocknal_send_hello(ni, conn, peerid.nid, hello); + } + LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t, kshm_ips[LNET_MAX_INTERFACES])); + /* setup the socket AFTER I've received hello (it disables + * SO_LINGER). I might call back to the acceptor who may want + * to send a protocol version response and then close the + * socket; this ensures the socket only tears down after the + * response has been sent. */ + if (rc == 0) + rc = ksocknal_lib_setup_sock(sock); + + write_lock_bh(global_lock); + + /* NB my callbacks block while I hold ksnd_global_lock */ + ksocknal_lib_set_callback(sock, conn); + + if (!active) + peer->ksnp_accepting--; + + write_unlock_bh(global_lock); + + if (rc != 0) { + write_lock_bh(global_lock); + ksocknal_close_conn_locked(conn, rc); + write_unlock_bh(global_lock); + } else if (ksocknal_connsock_addref(conn) == 0) { + /* Allow I/O to proceed. 
*/ + ksocknal_read_callback(conn); + ksocknal_write_callback(conn); + ksocknal_connsock_decref(conn); + } + + ksocknal_connsock_decref(conn); ksocknal_conn_decref(conn); - return 0; + return rc; failed_2: if (!peer->ksnp_closing && diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 8217d6b7358492c392f5ac07005324e553bf1bad..bf4cd0886bec4a05a6cd9d6cc5ec109815f4c8ed 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -2104,8 +2104,7 @@ ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn, LASSERT (0 <= hello->kshm_nips && hello->kshm_nips <= LNET_MAX_INTERFACES); - /* No need for getconnsock/putconnsock */ - LASSERT (!conn->ksnc_closing); + /* rely on caller to hold a ref on socket so it wouldn't disappear */ LASSERT (conn->ksnc_proto != NULL); srcnid = lnet_ptlcompat_srcnid(ni->ni_nid, peer_nid); diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c index 0ab15bff1f960c8ae3d022b97539385ef8a3ecaa..b733b7b59e9f7141653c6d7b8ccd3a15785ad04f 100644 --- a/lnet/klnds/viblnd/viblnd.c +++ b/lnet/klnds/viblnd/viblnd.c @@ -174,7 +174,7 @@ kibnal_init_msg(kib_msg_t *msg, int type, int body_nob) } void -kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, +kibnal_pack_msg(kib_msg_t *msg, __u32 version, int credits, lnet_nid_t dstnid, __u64 dststamp, __u64 seq) { /* CAVEAT EMPTOR! 
all message fields not set here should have been @@ -243,7 +243,7 @@ kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) msg_version != IBNAL_MSG_VERSION) return -EPROTO; } else if (msg_version != expected_version) { - CERROR("Bad version: %x(%x expected)\n", + CERROR("Bad version: %x(%x expected)\n", msg_version, expected_version); return -EPROTO; } @@ -269,7 +269,7 @@ kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) return -EPROTO; } msg->ibm_cksum = msg_cksum; - + if (flip) { /* leave magic unflipped as a clue to peer endianness */ msg->ibm_version = msg_version; @@ -282,7 +282,7 @@ kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) __swab64s(&msg->ibm_dststamp); __swab64s(&msg->ibm_seq); } - + if (msg->ibm_srcnid == LNET_NID_ANY) { CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid)); return -EPROTO; @@ -292,7 +292,7 @@ kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) default: CERROR("Unknown message type %x\n", msg->ibm_type); return -EPROTO; - + case IBNAL_MSG_NOOP: break; @@ -329,14 +329,14 @@ kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key); __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag); } - + n = msg->ibm_u.putack.ibpam_rd.rd_nfrag; if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", + CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n", n, IBNAL_MAX_RDMA_FRAGS); return -EPROTO; } - + if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) { CERROR("Short PUT_ACK: %d(%d)\n", msg_nob, (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])); @@ -365,7 +365,7 @@ kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob); __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); } -#else +#else if (flip) { __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key); __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag); @@ -373,17 +373,17 @@ 
kibnal_unpack_msg(kib_msg_t *msg, __u32 expected_version, int nob) n = msg->ibm_u.get.ibgm_rd.rd_nfrag; if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) { - CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", + CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n", n, IBNAL_MAX_RDMA_FRAGS); return -EPROTO; } - + if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) { CERROR("Short GET_REQ: %d(%d)\n", msg_nob, (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])); return -EPROTO; } - + if (flip) for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) { __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob); @@ -431,25 +431,25 @@ kibnal_start_listener (lnet_ni_t *ni) LASSERT (kibnal_data.kib_listen_handle == NULL); - kibnal_data.kib_listen_handle = + kibnal_data.kib_listen_handle = cm_create_cep(cm_cep_transp_rc); if (kibnal_data.kib_listen_handle == NULL) { CERROR ("Can't create listen CEP\n"); return -ENOMEM; } - CDEBUG(D_NET, "Created CEP %p for listening\n", + CDEBUG(D_NET, "Created CEP %p for listening\n", kibnal_data.kib_listen_handle); memset(&info, 0, sizeof(info)); - info.listen_addr.end_pt.sid = + info.listen_addr.end_pt.sid = (__u64)(*kibnal_tunables.kib_service_number); cmrc = cm_listen(kibnal_data.kib_listen_handle, &info, kibnal_listen_callback, NULL); if (cmrc == cm_stat_success) return 0; - + CERROR ("cm_listen error: %d\n", cmrc); cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); @@ -465,13 +465,13 @@ kibnal_stop_listener(lnet_ni_t *ni) cm_return_t cmrc; LASSERT (kibnal_data.kib_listen_handle != NULL); - + cmrc = cm_cancel(kibnal_data.kib_listen_handle); if (cmrc != cm_stat_success) CERROR ("Error %d stopping listener\n", cmrc); cfs_pause(cfs_time_seconds(1)/10); /* ensure no more callbacks */ - + cmrc = cm_destroy_cep(kibnal_data.kib_listen_handle); if (cmrc != vv_return_ok) CERROR ("Error %d destroying CEP\n", cmrc); @@ -519,18 +519,18 @@ kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid) /* npeers only grows with the global 
lock held */ atomic_inc(&kibnal_data.kib_npeers); } - + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); if (rc != 0) { CERROR("Can't create peer: %s\n", - (rc == -ESHUTDOWN) ? "shutting down" : + (rc == -ESHUTDOWN) ? "shutting down" : "too many peers"); LIBCFS_FREE(peer, sizeof(*peer)); } else { *peerp = peer; } - + return rc; } @@ -544,7 +544,7 @@ kibnal_destroy_peer (kib_peer_t *peer) LASSERT (peer->ibp_accepting == 0); LASSERT (list_empty (&peer->ibp_conns)); LASSERT (list_empty (&peer->ibp_tx_queue)); - + LIBCFS_FREE (peer, sizeof (*peer)); /* NB a peer's connections keep a reference on their peer until @@ -643,7 +643,7 @@ kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip) CDEBUG(D_NET, "%s at %u.%u.%u.%u\n", libcfs_nid2str(nid), HIPQUAD(ip)); - + if (nid == LNET_NID_ANY) return (-EINVAL); @@ -669,7 +669,7 @@ kibnal_add_persistent_peer (lnet_nid_t nid, __u32 ip) peer->ibp_ip = ip; peer->ibp_persistence++; - + write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); return (0); } @@ -814,16 +814,16 @@ kibnal_debug_conn (kib_conn_t *conn) { struct list_head *tmp; int i; - + spin_lock(&conn->ibc_lock); - - CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n", - atomic_read(&conn->ibc_refcount), conn, + + CDEBUG(D_CONSOLE, "conn[%d] %p -> %s: \n", + atomic_read(&conn->ibc_refcount), conn, libcfs_nid2str(conn->ibc_peer->ibp_nid)); CDEBUG(D_CONSOLE, " txseq "LPD64" rxseq "LPD64" state %d \n", conn->ibc_txseq, conn->ibc_rxseq, conn->ibc_state); CDEBUG(D_CONSOLE, " nposted %d cred %d o_cred %d r_cred %d\n", - conn->ibc_nsends_posted, conn->ibc_credits, + conn->ibc_nsends_posted, conn->ibc_credits, conn->ibc_outstanding_credits, conn->ibc_reserved_credits); CDEBUG(D_CONSOLE, " disc %d comms_err %d\n", conn->ibc_disconnect, conn->ibc_comms_error); @@ -831,7 +831,7 @@ kibnal_debug_conn (kib_conn_t *conn) CDEBUG(D_CONSOLE, " early_rxs:\n"); list_for_each(tmp, &conn->ibc_early_rxs) kibnal_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); - + CDEBUG(D_CONSOLE, " 
tx_queue_nocred:\n"); list_for_each(tmp, &conn->ibc_tx_queue_nocred) kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); @@ -843,11 +843,11 @@ kibnal_debug_conn (kib_conn_t *conn) CDEBUG(D_CONSOLE, " tx_queue:\n"); list_for_each(tmp, &conn->ibc_tx_queue) kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); - + CDEBUG(D_CONSOLE, " active_txs:\n"); list_for_each(tmp, &conn->ibc_active_txs) kibnal_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); - + CDEBUG(D_CONSOLE, " rxs:\n"); for (i = 0; i < IBNAL_RX_MSGS; i++) kibnal_debug_rx(&conn->ibc_rxs[i]); @@ -859,20 +859,20 @@ int kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) { static vv_qp_attr_t attr; - + kib_connvars_t *cv = conn->ibc_connvars; vv_return_t vvrc; - + /* Only called by connd => static OK */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); memset(&attr, 0, sizeof(attr)); - + switch (new_state) { default: LBUG(); - + case vv_qp_state_init: { struct vv_qp_modify_init_st *init = &attr.modify.params.init; @@ -882,7 +882,7 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) init->access_control = vv_acc_r_mem_read | vv_acc_r_mem_write; /* XXX vv_acc_l_mem_write ? 
*/ - attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | + attr.modify.vv_qp_attr_mask = VV_QP_AT_P_KEY_IX | VV_QP_AT_PHY_PORT_NUM | VV_QP_AT_ACCESS_CON_F; break; @@ -911,9 +911,9 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) // XXX sdp sets VV_QP_AT_OP_F but no actual optional options - attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | + attr.modify.vv_qp_attr_mask = VV_QP_AT_ADD_VEC | VV_QP_AT_DEST_QP | - VV_QP_AT_R_PSN | + VV_QP_AT_R_PSN | VV_QP_AT_MIN_RNR_NAK_T | VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM | VV_QP_AT_OP_F; @@ -927,7 +927,7 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) rts->retry_num = *kibnal_tunables.kib_retry_cnt; rts->rnr_num = *kibnal_tunables.kib_rnr_cnt; rts->dest_out_rdma_r_atom_num = IBNAL_OUS_DST_RD; - + attr.modify.vv_qp_attr_mask = VV_QP_AT_S_PSN | VV_QP_AT_L_ACK_T | VV_QP_AT_RETRY_NUM | @@ -940,18 +940,18 @@ kibnal_set_qp_state (kib_conn_t *conn, vv_qp_state_t new_state) attr.modify.vv_qp_attr_mask = 0; break; } - + attr.modify.qp_modify_into_state = new_state; attr.modify.vv_qp_attr_mask |= VV_QP_AT_STATE; - + vvrc = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &attr, NULL); if (vvrc != vv_return_ok) { - CERROR("Can't modify qp -> %s state to %d: %d\n", + CERROR("Can't modify qp -> %s state to %d: %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), new_state, vvrc); return -EIO; } - + return 0; } @@ -971,7 +971,7 @@ kibnal_create_conn (cm_cep_handle_t cep) /* Only the connd creates conns => single threaded */ LASSERT(!in_interrupt()); LASSERT(current == kibnal_data.kib_connd); - + LIBCFS_ALLOC(conn, sizeof (*conn)); if (conn == NULL) { CERROR ("Can't allocate connection\n"); @@ -989,7 +989,7 @@ kibnal_create_conn (cm_cep_handle_t cep) INIT_LIST_HEAD (&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD (&conn->ibc_active_txs); spin_lock_init (&conn->ibc_lock); - + atomic_inc (&kibnal_data.kib_nconns); /* well not really, but I call destroy() on failure, which decrements */ @@ -1023,7 +1023,7 @@ kibnal_create_conn 
(cm_cep_handle_t cep) vv_r_key_t r_key; rx->rx_conn = conn; - rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + page_offset); vvrc = vv_get_gen_mr_attrib(kibnal_data.kib_hca, @@ -1052,7 +1052,7 @@ kibnal_create_conn (cm_cep_handle_t cep) reqattr.create.qp_type = vv_qp_type_r_conn; reqattr.create.cq_send_h = kibnal_data.kib_cq; reqattr.create.cq_receive_h = kibnal_data.kib_cq; - reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) * + reqattr.create.send_max_outstand_wr = (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends); reqattr.create.receive_max_outstand_wr = IBNAL_RX_MSGS; reqattr.create.max_scatgat_per_send_wr = 1; @@ -1072,13 +1072,13 @@ kibnal_create_conn (cm_cep_handle_t cep) conn->ibc_state = IBNAL_CONN_INIT_QP; conn->ibc_connvars->cv_local_qpn = rspattr.create_return.qp_num; - if (rspattr.create_return.receive_max_outstand_wr < + if (rspattr.create_return.receive_max_outstand_wr < IBNAL_RX_MSGS || - rspattr.create_return.send_max_outstand_wr < + rspattr.create_return.send_max_outstand_wr < (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends)) { CERROR("Insufficient rx/tx work items: wanted %d/%d got %d/%d\n", - IBNAL_RX_MSGS, - (1 + IBNAL_MAX_RDMA_FRAGS) * + IBNAL_RX_MSGS, + (1 + IBNAL_MAX_RDMA_FRAGS) * (*kibnal_tunables.kib_concurrent_sends), rspattr.create_return.receive_max_outstand_wr, rspattr.create_return.send_max_outstand_wr); @@ -1091,7 +1091,7 @@ kibnal_create_conn (cm_cep_handle_t cep) /* 1 ref for caller */ atomic_set (&conn->ibc_refcount, 1); return (conn); - + failed: kibnal_destroy_conn (conn); return (NULL); @@ -1105,7 +1105,7 @@ kibnal_destroy_conn (kib_conn_t *conn) /* Only the connd does this (i.e. 
single threaded) */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); - + CDEBUG (D_NET, "connection %p\n", conn); LASSERT (atomic_read (&conn->ibc_refcount) == 0); @@ -1137,16 +1137,16 @@ kibnal_destroy_conn (kib_conn_t *conn) if (vvrc != vv_return_ok) CERROR("Can't destroy QP: %d\n", vvrc); /* fall through */ - + case IBNAL_CONN_INIT_NOTHING: break; } - if (conn->ibc_rx_pages != NULL) + if (conn->ibc_rx_pages != NULL) kibnal_free_pages(conn->ibc_rx_pages); if (conn->ibc_rxs != NULL) - LIBCFS_FREE(conn->ibc_rxs, + LIBCFS_FREE(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof(kib_rx_t)); if (conn->ibc_connvars != NULL) @@ -1195,7 +1195,7 @@ kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) CDEBUG(D_NET, "Closing stale conn -> %s incarnation:"LPX64"("LPX64")\n", libcfs_nid2str(peer->ibp_nid), conn->ibc_incarnation, incarnation); - + count++; kibnal_close_conn_locked (conn, -ESTALE); } @@ -1245,7 +1245,7 @@ kibnal_close_matching_conns (lnet_nid_t nid) /* wildcards always succeed */ if (nid == LNET_NID_ANY) return (0); - + return (count == 0 ? 
-ENOENT : 0); } @@ -1318,11 +1318,11 @@ kibnal_free_pages (kib_pages_t *p) { int npages = p->ibp_npages; int i; - + for (i = 0; i < npages; i++) if (p->ibp_pages[i] != NULL) __free_page(p->ibp_pages[i]); - + LIBCFS_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); } @@ -1340,7 +1340,7 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); p->ibp_npages = npages; - + for (i = 0; i < npages; i++) { p->ibp_pages[i] = alloc_page (GFP_KERNEL); if (p->ibp_pages[i] == NULL) { @@ -1355,15 +1355,15 @@ kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) } int -kibnal_alloc_tx_descs (void) +kibnal_alloc_tx_descs (void) { int i; - + LIBCFS_ALLOC (kibnal_data.kib_tx_descs, IBNAL_TX_MSGS() * sizeof(kib_tx_t)); if (kibnal_data.kib_tx_descs == NULL) return -ENOMEM; - + memset(kibnal_data.kib_tx_descs, 0, IBNAL_TX_MSGS() * sizeof(kib_tx_t)); @@ -1376,20 +1376,20 @@ kibnal_alloc_tx_descs (void) if (tx->tx_pages == NULL) return -ENOMEM; #else - LIBCFS_ALLOC(tx->tx_wrq, - (1 + IBNAL_MAX_RDMA_FRAGS) * + LIBCFS_ALLOC(tx->tx_wrq, + (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_wrq)); if (tx->tx_wrq == NULL) return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_gl, - (1 + IBNAL_MAX_RDMA_FRAGS) * + + LIBCFS_ALLOC(tx->tx_gl, + (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_gl)); if (tx->tx_gl == NULL) return -ENOMEM; - - LIBCFS_ALLOC(tx->tx_rd, - offsetof(kib_rdma_desc_t, + + LIBCFS_ALLOC(tx->tx_rd, + offsetof(kib_rdma_desc_t, rd_frags[IBNAL_MAX_RDMA_FRAGS])); if (tx->tx_rd == NULL) return -ENOMEM; @@ -1400,7 +1400,7 @@ kibnal_alloc_tx_descs (void) } void -kibnal_free_tx_descs (void) +kibnal_free_tx_descs (void) { int i; @@ -1416,18 +1416,18 @@ kibnal_free_tx_descs (void) sizeof(*tx->tx_pages)); #else if (tx->tx_wrq != NULL) - LIBCFS_FREE(tx->tx_wrq, - (1 + IBNAL_MAX_RDMA_FRAGS) * + LIBCFS_FREE(tx->tx_wrq, + (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_wrq)); if (tx->tx_gl != NULL) - LIBCFS_FREE(tx->tx_gl, - (1 + 
IBNAL_MAX_RDMA_FRAGS) * + LIBCFS_FREE(tx->tx_gl, + (1 + IBNAL_MAX_RDMA_FRAGS) * sizeof(*tx->tx_gl)); if (tx->tx_rd != NULL) - LIBCFS_FREE(tx->tx_rd, - offsetof(kib_rdma_desc_t, + LIBCFS_FREE(tx->tx_rd, + offsetof(kib_rdma_desc_t, rd_frags[IBNAL_MAX_RDMA_FRAGS])); #endif } @@ -1438,7 +1438,7 @@ kibnal_free_tx_descs (void) #if IBNAL_USE_FMR void -kibnal_free_fmrs (int n) +kibnal_free_fmrs (int n) { int i; vv_return_t vvrc; @@ -1477,7 +1477,7 @@ kibnal_setup_tx_descs (void) /* No fancy arithmetic when we do the buffer calculations */ CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); - rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES(), 0); if (rc != 0) return (rc); @@ -1533,7 +1533,7 @@ kibnal_setup_tx_descs (void) LASSERT (ipage <= IBNAL_TX_MSG_PAGES()); } } - + return (0); } @@ -1545,7 +1545,7 @@ kibnal_shutdown (lnet_ni_t *ni) LASSERT (ni == kibnal_data.kib_ni); LASSERT (ni->ni_data == &kibnal_data); - + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", atomic_read (&libcfs_kmemory)); @@ -1597,7 +1597,7 @@ kibnal_shutdown (lnet_ni_t *ni) kibnal_async_callback); if (vvrc != vv_return_ok) CERROR("vv_dell_async_event_cb error: %d\n", vvrc); - + /* fall through */ case IBNAL_INIT_HCA: @@ -1632,7 +1632,7 @@ kibnal_shutdown (lnet_ni_t *ni) cfs_pause(cfs_time_seconds(1)); } /* fall through */ - + case IBNAL_INIT_NOTHING: break; } @@ -1641,7 +1641,7 @@ kibnal_shutdown (lnet_ni_t *ni) if (kibnal_data.kib_peers != NULL) LIBCFS_FREE (kibnal_data.kib_peers, - sizeof (struct list_head) * + sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", @@ -1856,18 +1856,18 @@ kibnal_startup (lnet_ni_t *ni) /* Found a suitable port. Get its GUID and PKEY. 
*/ tbl_count = 1; - vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, + vvrc = vv_get_port_gid_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_gid); if (vvrc != vv_return_ok) { CERROR("vv_get_port_gid_tbl failed " - "for %s port %d: %d\n", + "for %s port %d: %d\n", hca_name, port_num, vvrc); continue; } tbl_count = 1; - vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, + vvrc = vv_get_port_partition_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_pkey); if (vvrc != vv_return_ok) { @@ -1895,8 +1895,8 @@ kibnal_startup (lnet_ni_t *ni) } CDEBUG(D_NET, "Using %s port %d - GID="LPX64":"LPX64"\n", - hca_name, kibnal_data.kib_port, - kibnal_data.kib_port_gid.scope.g.subnet, + hca_name, kibnal_data.kib_port, + kibnal_data.kib_port_gid.scope.g.subnet, kibnal_data.kib_port_gid.scope.g.eui64); /*****************************************************/ @@ -1930,7 +1930,7 @@ kibnal_startup (lnet_ni_t *ni) __u32 nentries; vvrc = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(), - kibnal_cq_callback, + kibnal_cq_callback, NULL, /* context */ &kibnal_data.kib_cq, &nentries); if (vvrc != 0) { @@ -1942,13 +1942,13 @@ kibnal_startup (lnet_ni_t *ni) kibnal_data.kib_init = IBNAL_INIT_CQ; if (nentries < IBNAL_CQ_ENTRIES()) { - CERROR ("CQ only has %d entries, need %d\n", + CERROR ("CQ only has %d entries, need %d\n", nentries, IBNAL_CQ_ENTRIES()); goto failed; } - vvrc = vv_request_completion_notification(kibnal_data.kib_hca, - kibnal_data.kib_cq, + vvrc = vv_request_completion_notification(kibnal_data.kib_hca, + kibnal_data.kib_cq, vv_next_solicit_unsolicit_event); if (vvrc != 0) { CERROR ("Failed to re-arm completion queue: %d\n", rc); @@ -1970,7 +1970,7 @@ kibnal_startup (lnet_ni_t *ni) failed: CDEBUG(D_NET, "kibnal_startup failed\n"); - kibnal_shutdown (ni); + kibnal_shutdown (ni); return (-ENETDOWN); } @@ -1988,9 +1988,9 @@ kibnal_module_init (void) vibnal_assert_wire_constants(); - CLASSERT (offsetof(kib_msg_t, ibm_u) + 
sizeof(kib_connparams_t) + CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) <= cm_REQ_priv_data_len); - CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) + CLASSERT (offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t) <= cm_REP_priv_data_len); CLASSERT (sizeof(kib_msg_t) <= IBNAL_MSG_SIZE); #if !IBNAL_USE_FMR diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index d468673b3a3ac9ffcb666b4921801e90ab4a2dc5..0c7e323cee628fbbb12ba023de424402f7428770 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -160,7 +160,7 @@ kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) LASSERT (conn->ibc_state >= IBNAL_CONN_INIT); LASSERT (rx->rx_nob >= 0); /* not posted */ - CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", + CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n", rx->rx_wrq.scatgat_list->length, rx->rx_wrq.scatgat_list->l_key, KIBNAL_SG2ADDR(rx->rx_wrq.scatgat_list->v_address)); @@ -194,10 +194,10 @@ kibnal_post_rx (kib_rx_t *rx, int credit, int rsrvd_credit) spin_unlock(&conn->ibc_lock); - CERROR ("post rx -> %s failed %d\n", + CERROR ("post rx -> %s failed %d\n", libcfs_nid2str(conn->ibc_peer->ibp_nid), vvrc); rc = -EIO; - kibnal_close_conn(rx->rx_conn, rc); + kibnal_close_conn(conn, rc); /* No more posts for this rx; so lose its ref */ kibnal_conn_decref(conn); return rc; @@ -1739,7 +1739,7 @@ kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, case IBNAL_MSG_PUT_REQ: if (mlen == 0) { lnet_finalize(ni, lntmsg, 0); - kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0, + kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, 0, rxmsg->ibm_u.putreq.ibprm_cookie); break; } @@ -1769,7 +1769,7 @@ kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, libcfs_nid2str(conn->ibc_peer->ibp_nid), rc); kibnal_tx_done(tx); /* tell peer it's over */ - kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc, + kibnal_send_completion(conn, IBNAL_MSG_PUT_NAK, 
rc, rxmsg->ibm_u.putreq.ibprm_cookie); break; } @@ -1801,8 +1801,7 @@ kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, kibnal_reply(ni, rx, lntmsg); } else { /* GET didn't match anything */ - kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE, - -ENODATA, + kibnal_send_completion(conn, IBNAL_MSG_GET_DONE, -ENODATA, rxmsg->ibm_u.get.ibgm_cookie); } break; @@ -2477,7 +2476,7 @@ kibnal_recv_connreq(cm_cep_handle_t *cep, cm_request_data_t *cmreq) write_unlock_irqrestore(g_lock, flags); CWARN("Conn race %s\n", - libcfs_nid2str(peer2->ibp_nid)); + libcfs_nid2str(rxmsg.ibm_srcnid)); kibnal_peer_decref(peer); reason = IBNAL_REJECT_CONN_RACE; @@ -3055,7 +3054,7 @@ kibnal_arp_done (kib_conn_t *conn) path->pkey, &cv->cv_pkey_index); if (vvrc != vv_return_ok) { CWARN("pkey2pkey_index failed for %s @ %u.%u.%u.%u: %d\n", - libcfs_nid2str(peer->ibp_nid), + libcfs_nid2str(peer->ibp_nid), HIPQUAD(peer->ibp_ip), vvrc); goto failed; } @@ -3085,7 +3084,7 @@ kibnal_arp_done (kib_conn_t *conn) &path->slid); if (vvrc != vv_return_ok) { CWARN("port_num2base_lid failed for %s @ %u.%u.%u.%u: %d\n", - libcfs_nid2str(peer->ibp_ip), + libcfs_nid2str(peer->ibp_ip), HIPQUAD(peer->ibp_ip), vvrc); goto failed; } diff --git a/lnet/klnds/viblnd/viblnd_modparams.c b/lnet/klnds/viblnd/viblnd_modparams.c index f31c8dc2e51cad3e27c6d1b9e8fc939a6ea01c82..064d17bd971ac47d743399f174cb88cc2cf60c3c 100644 --- a/lnet/klnds/viblnd/viblnd_modparams.c +++ b/lnet/klnds/viblnd/viblnd_modparams.c @@ -272,7 +272,7 @@ static cfs_sysctl_table_t kibnal_ctl_table[] = { .procname = "concurrent_sends", .data = &concurrent_sends, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0444, .proc_handler = &proc_dointvec }, #if IBNAL_USE_FMR diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index e3b1cab9e56b5da29f5a6092c04636c0bf81a4ea..9810bdb900ed85f21333af7ae003596dfd3b739a 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -102,6 +102,9 @@ char debug_file_path[1024] = 
"/r/tmp/lustre-log"; #else char debug_file_path[1024] = "/tmp/lustre-log"; #endif +CFS_MODULE_PARM(debug_file_path, "s", charp, 0644, + "Path for dumping debug logs, " + "set 'NONE' to prevent log dumping"); int libcfs_panic_in_progress; @@ -402,11 +405,14 @@ void libcfs_debug_dumplog_internal(void *arg) CFS_PUSH_JOURNAL; - snprintf(debug_file_name, sizeof(debug_file_path) - 1, "%s.%ld.%ld", - debug_file_path, cfs_time_current_sec(), (long)arg); - printk(KERN_ALERT "LustreError: dumping log to %s\n", debug_file_name); - tracefile_dump_all_pages(debug_file_name); - + if (strncmp(debug_file_path, "NONE", 4) != 0) { + snprintf(debug_file_name, sizeof(debug_file_name) - 1, + "%s.%ld.%ld", debug_file_path, cfs_time_current_sec(), + (long)arg); + printk(KERN_ALERT "LustreError: dumping log to %s\n", + debug_file_name); + tracefile_dump_all_pages(debug_file_name); + } CFS_POP_JOURNAL; } @@ -647,11 +653,11 @@ int libcfs_debug_init(unsigned long bufsize) debug_filename = getenv("LIBLUSTRE_DEBUG_FILE"); if (debug_filename) - strncpy(debug_file_name,debug_filename,sizeof(debug_file_path)); + strncpy(debug_file_name,debug_filename,sizeof(debug_file_name)); if (debug_file_name[0] == '\0' && debug_file_path[0] != '\0') snprintf(debug_file_name, sizeof(debug_file_name) - 1, - "%s-%s-%lu.log", debug_file_path, source_nid, time(0)); + "%s-%s-"CFS_TIME_T".log", debug_file_path, source_nid, time(0)); if (strcmp(debug_file_name, "stdout") == 0 || strcmp(debug_file_name, "-") == 0) { @@ -719,8 +725,9 @@ libcfs_debug_vmsg2(cfs_debug_limit_state_t *cdls, int nob; int remain; va_list ap; - char buf[PAGE_SIZE]; /* size 4096 used for compatimble with linux, - * where message can`t be exceed PAGE_SIZE */ + char buf[CFS_PAGE_SIZE]; /* size 4096 used for compatimble + * with linux, where message can`t + * be exceed PAGE_SIZE */ int console = 0; char *prefix = "Lustre"; @@ -813,7 +820,7 @@ out_file: gettimeofday(&tv, NULL); - fprintf(debug_file_fd, "%lu.%06lu:%u:%s:(%s:%d:%s()): %s", + 
fprintf(debug_file_fd, CFS_TIME_T".%06lu:%u:%s:(%s:%d:%s()): %s", tv.tv_sec, tv.tv_usec, source_pid, source_nid, file, line, fn, buf); diff --git a/lnet/libcfs/user-prim.c b/lnet/libcfs/user-prim.c index 58f6b378a0eb21c1bd9a8152d645a4d59f048312..ffa32c193d501a76dd8430ef4e57f10d152ead0a 100644 --- a/lnet/libcfs/user-prim.c +++ b/lnet/libcfs/user-prim.c @@ -31,6 +31,9 @@ #ifndef __KERNEL__ +#include <libcfs/libcfs.h> +#include <libcfs/kp30.h> + #include <sys/mman.h> #ifndef __CYGWIN__ #include <stdint.h> @@ -48,10 +51,9 @@ #include <signal.h> #include <errno.h> #include <sys/stat.h> +#ifdef HAVE_SYS_VFS_H #include <sys/vfs.h> - -#include <libcfs/libcfs.h> -#include <libcfs/kp30.h> +#endif /* * Sleep channel. No-op implementation. @@ -288,6 +290,11 @@ void cfs_daemonize(char *str) return; } +int cfs_daemonize_ctxt(char *str) +{ + return 0; +} + cfs_sigset_t cfs_block_allsigs(void) { cfs_sigset_t all; diff --git a/lnet/libcfs/user-tcpip.c b/lnet/libcfs/user-tcpip.c index a76edb3fb57d79726ac54d0e1df294d9b3800b1e..e0cedb9921909b0c7b1cefc563e7f3a0632ab1f9 100644 --- a/lnet/libcfs/user-tcpip.c +++ b/lnet/libcfs/user-tcpip.c @@ -19,9 +19,15 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#ifndef __KERNEL__ +#if !defined(__KERNEL__) || !defined(REDSTORM) + +#include <libcfs/libcfs.h> +#include <libcfs/kp30.h> #include <sys/socket.h> +#ifdef HAVE_NETINET_IN_H +#include <netinet/in.h> +#endif #include <netinet/tcp.h> #include <sys/ioctl.h> #include <unistd.h> @@ -38,9 +44,6 @@ #include <sys/syscall.h> #endif -#include <libcfs/libcfs.h> -#include <libcfs/kp30.h> - /* * Functions to get network interfaces info */ @@ -49,8 +52,8 @@ int libcfs_sock_ioctl(int cmd, unsigned long arg) { int fd, rc; - - fd = socket(AF_INET, SOCK_STREAM, 0); + + fd = socket(AF_INET, SOCK_STREAM, 0); if (fd < 0) { rc = -errno; @@ -600,4 +603,4 @@ int libcfs_sock_readv(int fd, const struct iovec *vector, int count) return rc; } -#endif /* !__KERNEL__ */ +#endif /* !__KERNEL__ || !defined(REDSTORM) */ diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 4fd9adb3d20f19edc03af8ffad8ba244d7d79226..8a86dcc72fe7bdff691a1f589a56fc778131248e 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -858,6 +858,7 @@ lnet_count_acceptor_nis (lnet_ni_t **first_ni) * *first_ni so the acceptor can pass it connections "blind" to retain * binary compatibility. 
*/ int count = 0; +#if defined(__KERNEL__) || defined(HAVE_LIBPTHREAD) struct list_head *tmp; lnet_ni_t *ni; @@ -876,6 +877,8 @@ lnet_count_acceptor_nis (lnet_ni_t **first_ni) } LNET_UNLOCK(); + +#endif /* defined(__KERNEL__) || defined(HAVE_LIBPTHREAD) */ return count; } diff --git a/lnet/lnet/lib-eq.c b/lnet/lnet/lib-eq.c index 5bae602fa88efd61097f6564ae07a61faf13efb1..83c841a0d7cdbe8e78ed6a506bde69ad433f5bda 100644 --- a/lnet/lnet/lib-eq.c +++ b/lnet/lnet/lib-eq.c @@ -26,14 +26,14 @@ #include <lnet/lib-lnet.h> int -LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, +LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, lnet_handle_eq_t *handle) { lnet_eq_t *eq; LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - + /* We need count to be a power of 2 so that when eq_{enq,deq}_seq * overflow, they don't skip entries, so the queue has the same * apparant capacity at all times */ @@ -48,7 +48,7 @@ LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, if (count == 0) /* catch bad parameter / overflow on roundup */ return (-EINVAL); - + eq = lnet_eq_alloc(); if (eq == NULL) return (-ENOMEM); @@ -92,7 +92,7 @@ LNetEQFree(lnet_handle_eq_t eqh) LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - + LNET_LOCK(); eq = lnet_handle2eq(&eqh); @@ -102,6 +102,8 @@ LNetEQFree(lnet_handle_eq_t eqh) } if (eq->eq_refcount != 0) { + CDEBUG(D_NET, "Event queue (%d) busy on destroy.\n", + eq->eq_refcount); LNET_UNLOCK(); return (-EBUSY); } @@ -160,7 +162,7 @@ LNetEQGet (lnet_handle_eq_t eventq, lnet_event_t *event) { int which; - return LNetEQPoll(&eventq, 1, 0, + return LNetEQPoll(&eventq, 1, 0, event, &which); } @@ -216,10 +218,10 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, RETURN(rc); } } - + #ifdef __KERNEL__ if (timeout_ms == 0) { - LNET_UNLOCK (); + LNET_UNLOCK(); RETURN (0); } @@ -231,19 +233,19 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, if (timeout_ms < 0) { cfs_waitq_wait (&wl, 
CFS_TASK_INTERRUPTIBLE); - } else { + } else { struct timeval tv; now = cfs_time_current(); cfs_waitq_timedwait(&wl, CFS_TASK_INTERRUPTIBLE, cfs_time_seconds(timeout_ms)/1000); - cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), - &tv); + cfs_duration_usec(cfs_time_sub(cfs_time_current(), now), + &tv); timeout_ms -= tv.tv_sec * 1000 + tv.tv_usec / 1000; if (timeout_ms < 0) timeout_ms = 0; } - + LNET_LOCK(); cfs_waitq_del(&the_lnet.ln_waitq, &wl); #else @@ -259,7 +261,7 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, gettimeofday(&then, NULL); (eqwaitni->ni_lnd->lnd_wait)(eqwaitni, timeout_ms); - + gettimeofday(&now, NULL); timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 + (now.tv_usec - then.tv_usec) / 1000; @@ -289,26 +291,26 @@ LNetEQPoll (lnet_handle_eq_t *eventqs, int neq, int timeout_ms, LBUG(); # else if (timeout_ms < 0) { - pthread_cond_wait(&the_lnet.ln_cond, + pthread_cond_wait(&the_lnet.ln_cond, &the_lnet.ln_lock); } else { gettimeofday(&then, NULL); - + ts.tv_sec = then.tv_sec + timeout_ms/1000; - ts.tv_nsec = then.tv_usec * 1000 + + ts.tv_nsec = then.tv_usec * 1000 + (timeout_ms%1000) * 1000000; if (ts.tv_nsec >= 1000000000) { ts.tv_sec++; ts.tv_nsec -= 1000000000; } - + pthread_cond_timedwait(&the_lnet.ln_cond, &the_lnet.ln_lock, &ts); - + gettimeofday(&now, NULL); timeout_ms -= (now.tv_sec - then.tv_sec) * 1000 + (now.tv_usec - then.tv_usec) / 1000; - + if (timeout_ms < 0) timeout_ms = 0; } diff --git a/lnet/lnet/lib-md.c b/lnet/lnet/lib-md.c index ecd8f0734e251ca521f3486851193746e2e98854..731db566bc3726191fd02c607d6fa2b9f995f95e 100644 --- a/lnet/lnet/lib-md.c +++ b/lnet/lnet/lib-md.c @@ -284,7 +284,7 @@ LNetMDUnlink (lnet_handle_md_t mdh) LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - + LNET_LOCK(); md = lnet_handle2md(&mdh); @@ -299,14 +299,7 @@ LNetMDUnlink (lnet_handle_md_t mdh) if (md->md_eq != NULL && md->md_refcount == 0) { - memset(&ev, 0, sizeof(ev)); - - ev.type = LNET_EVENT_UNLINK; - ev.status = 
0; - ev.unlinked = 1; - lnet_md_deconstruct(md, &ev.md); - lnet_md2handle(&ev.md_handle, md); - + lnet_build_unlink_event(md, &ev); lnet_enq_event_locked(md->md_eq, &ev); } diff --git a/lnet/lnet/lib-me.c b/lnet/lnet/lib-me.c index edfb8a830c61322455b99d762482da49ddfde98a..c5b12866c935db6c3672627f6e4f69851440f814 100644 --- a/lnet/lnet/lib-me.c +++ b/lnet/lnet/lib-me.c @@ -119,25 +119,33 @@ LNetMEInsert(lnet_handle_me_t current_meh, int LNetMEUnlink(lnet_handle_me_t meh) { - lnet_me_t *me; - int rc; + lnet_me_t *me; + lnet_libmd_t *md; + lnet_event_t ev; - LASSERT (the_lnet.ln_init); + LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - + LNET_LOCK(); me = lnet_handle2me(&meh); if (me == NULL) { - rc = -ENOENT; - } else { - lnet_me_unlink(me); - rc = 0; + LNET_UNLOCK(); + return -ENOENT; } - LNET_UNLOCK(); + md = me->me_md; + if (md != NULL && + md->md_eq != NULL && + md->md_refcount == 0) { + lnet_build_unlink_event(md, &ev); + lnet_enq_event_locked(md->md_eq, &ev); + } - return (rc); + lnet_me_unlink(me); + + LNET_UNLOCK(); + return 0; } /* call with LNET_LOCK please */ diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index c174762a25b528e398a23b50474e083ba7c7bf9d..a4a1a4c9e09581d0bacb86882e276548f3ecdca6 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -40,7 +40,7 @@ static void lnet_drop_delayed_put(lnet_msg_t *msg, char *reason); static int lnet_try_match_md (int index, int op_mask, lnet_process_id_t src, - unsigned int rlength, unsigned int roffset, + unsigned int rlength, unsigned int roffset, __u64 match_bits, lnet_libmd_t *md, lnet_msg_t *msg, unsigned int *mlength_out, unsigned int *offset_out) { @@ -90,7 +90,7 @@ lnet_try_match_md (int index, int op_mask, lnet_process_id_t src, } else if ((md->md_options & LNET_MD_TRUNCATE) == 0) { /* this packet _really_ is too big */ CERROR("Matching packet from %s, match "LPU64 - " length %d too big: %d left, %d allowed\n", + " length %d too big: %d left, %d allowed\n", 
libcfs_id2str(src), match_bits, rlength, md->md_length - offset, mlength); @@ -163,20 +163,20 @@ lnet_match_md(int index, int op_mask, lnet_process_id_t src, LASSERT (me == md->md_me); - rc = lnet_try_match_md(index, op_mask, src, rlength, + rc = lnet_try_match_md(index, op_mask, src, rlength, roffset, match_bits, md, msg, mlength_out, offset_out); switch (rc) { default: LBUG(); - + case LNET_MATCHMD_NONE: continue; - + case LNET_MATCHMD_OK: *md_out = md; return LNET_MATCHMD_OK; - + case LNET_MATCHMD_DROP: return LNET_MATCHMD_DROP; } @@ -186,7 +186,7 @@ lnet_match_md(int index, int op_mask, lnet_process_id_t src, if (op_mask == LNET_MD_OP_GET || (ptl->ptl_options & LNET_PTL_LAZY) == 0) return LNET_MATCHMD_DROP; - + return LNET_MATCHMD_NONE; } @@ -199,7 +199,7 @@ lnet_fail_nid (lnet_nid_t nid, unsigned int threshold) struct list_head cull; LASSERT (the_lnet.ln_init); - + if (threshold != 0) { /* Adding a new entry */ LIBCFS_ALLOC(tp, sizeof(*tp)); @@ -330,7 +330,7 @@ lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset, ndiov--; LASSERT (ndiov > 0); } - + /* skip complete frags before 'soffset' */ LASSERT (nsiov > 0); while (soffset >= siov->iov_len) { @@ -358,7 +358,7 @@ lnet_copy_iov2iov (unsigned int ndiov, struct iovec *diov, unsigned int doffset, ndiov--; doffset = 0; } - + if (siov->iov_len > soffset + this_nob) { soffset += this_nob; } else { @@ -747,7 +747,7 @@ lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, LASSERT (!in_interrupt ()); LASSERT (mlen == 0 || msg != NULL); - + if (msg != NULL) { LASSERT(msg->msg_receiving); LASSERT(!msg->msg_sending); @@ -762,12 +762,12 @@ lnet_ni_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed, niov = msg->msg_niov; iov = msg->msg_iov; kiov = msg->msg_kiov; - + LASSERT (niov > 0); LASSERT ((iov == NULL) != (kiov == NULL)); } } - + rc = (ni->ni_lnd->lnd_recv)(ni, private, msg, delayed, niov, iov, kiov, offset, mlen, rlen); if (rc < 0) @@ -779,16 +779,16 @@ 
lnet_compare_routers(lnet_peer_t *p1, lnet_peer_t *p2) { if (p1->lp_txqnob < p2->lp_txqnob) return 1; - + if (p1->lp_txqnob > p2->lp_txqnob) return -1; - + if (p1->lp_txcredits > p2->lp_txcredits) return 1; - + if (p1->lp_txcredits < p2->lp_txcredits) return -1; - + return 0; } @@ -834,7 +834,7 @@ lnet_prep_send(lnet_msg_t *msg, int type, lnet_process_id_t target, } void -lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) +lnet_ni_send(lnet_ni_t *ni, lnet_msg_t *msg) { void *priv = msg->msg_private; int rc; @@ -860,14 +860,14 @@ lnet_eager_recv_locked(lnet_msg_t *msg) LASSERT (msg->msg_receiving); LASSERT (!msg->msg_sending); - + peer = msg->msg_rxpeer; ni = peer->lp_ni; if (ni->ni_lnd->lnd_eager_recv != NULL) { LNET_UNLOCK(); - - rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, + + rc = (ni->ni_lnd->lnd_eager_recv)(ni, msg->msg_private, msg, &msg->msg_private); if (rc != 0) { CERROR("recv from %s / send to %s aborted: " @@ -912,7 +912,7 @@ lnet_post_send_locked (lnet_msg_t *msg, int do_send) return EAGAIN; } } - + if (!msg->msg_txcredit) { LASSERT ((ni->ni_txcredits < 0) == !list_empty(&ni->ni_txq)); @@ -943,23 +943,23 @@ lnet_commit_routedmsg (lnet_msg_t *msg) { /* ALWAYS called holding the LNET_LOCK */ LASSERT (msg->msg_routing); - + the_lnet.ln_counters.msgs_alloc++; - if (the_lnet.ln_counters.msgs_alloc > + if (the_lnet.ln_counters.msgs_alloc > the_lnet.ln_counters.msgs_max) - the_lnet.ln_counters.msgs_max = + the_lnet.ln_counters.msgs_max = the_lnet.ln_counters.msgs_alloc; the_lnet.ln_counters.route_count++; the_lnet.ln_counters.route_length += msg->msg_len; - + LASSERT (!msg->msg_onactivelist); msg->msg_onactivelist = 1; list_add (&msg->msg_activelist, &the_lnet.ln_active_msgs); } lnet_rtrbufpool_t * -lnet_msg2bufpool(lnet_msg_t *msg) +lnet_msg2bufpool(lnet_msg_t *msg) { lnet_rtrbufpool_t *rbp = &the_lnet.ln_rtrpools[0]; @@ -994,12 +994,12 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) if (!msg->msg_peerrtrcredit) { LASSERT 
((lp->lp_rtrcredits < 0) == !list_empty(&lp->lp_rtrq)); - + msg->msg_peerrtrcredit = 1; lp->lp_rtrcredits--; if (lp->lp_rtrcredits < lp->lp_minrtrcredits) lp->lp_minrtrcredits = lp->lp_rtrcredits; - + if (lp->lp_rtrcredits < 0) { /* must have checked eager_recv before here */ LASSERT (msg->msg_delayed); @@ -1007,7 +1007,7 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) return EAGAIN; } } - + rbp = lnet_msg2bufpool(msg); if (!msg->msg_rtrcredit) { @@ -1025,11 +1025,11 @@ lnet_post_routed_recv_locked (lnet_msg_t *msg, int do_recv) return EAGAIN; } } - + LASSERT (!list_empty(&rbp->rbp_bufs)); rb = list_entry(rbp->rbp_bufs.next, lnet_rtrbuf_t, rb_list); list_del(&rb->rb_list); - + msg->msg_niov = rbp->rbp_npages; msg->msg_kiov = &rb->rb_kiov[0]; @@ -1081,7 +1081,7 @@ lnet_return_credits_locked (lnet_msg_t *msg) txpeer->lp_txcredits++; if (txpeer->lp_txcredits <= 0) { - msg2 = list_entry(txpeer->lp_txq.next, + msg2 = list_entry(txpeer->lp_txq.next, lnet_msg_t, msg_list); list_del(&msg2->msg_list); @@ -1097,7 +1097,7 @@ lnet_return_credits_locked (lnet_msg_t *msg) lnet_peer_decref_locked(txpeer); } -#ifdef __KERNEL__ +#ifdef __KERNEL__ if (msg->msg_rtrcredit) { /* give back global router credits */ lnet_rtrbuf_t *rb; @@ -1107,32 +1107,32 @@ lnet_return_credits_locked (lnet_msg_t *msg) * there until it gets one allocated, or aborts the wait * itself */ LASSERT (msg->msg_kiov != NULL); - + rb = list_entry(msg->msg_kiov, lnet_rtrbuf_t, rb_kiov[0]); rbp = rb->rb_pool; LASSERT (rbp == lnet_msg2bufpool(msg)); msg->msg_kiov = NULL; msg->msg_rtrcredit = 0; - + LASSERT((rbp->rbp_credits < 0) == !list_empty(&rbp->rbp_msgs)); LASSERT((rbp->rbp_credits > 0) == !list_empty(&rbp->rbp_bufs)); list_add(&rb->rb_list, &rbp->rbp_bufs); rbp->rbp_credits++; if (rbp->rbp_credits <= 0) { - msg2 = list_entry(rbp->rbp_msgs.next, + msg2 = list_entry(rbp->rbp_msgs.next, lnet_msg_t, msg_list); list_del(&msg2->msg_list); - + (void) lnet_post_routed_recv_locked(msg2, 1); } } - + if 
(msg->msg_peerrtrcredit) { /* give back peer router credits */ msg->msg_peerrtrcredit = 0; - + LASSERT((rxpeer->lp_rtrcredits < 0) == !list_empty(&rxpeer->lp_rtrq)); rxpeer->lp_rtrcredits++; @@ -1140,7 +1140,7 @@ lnet_return_credits_locked (lnet_msg_t *msg) msg2 = list_entry(rxpeer->lp_rtrq.next, lnet_msg_t, msg_list); list_del(&msg2->msg_list); - + (void) lnet_post_routed_recv_locked(msg2, 1); } } @@ -1197,7 +1197,7 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) LASSERT (!msg->msg_routing); } - /* Is this for someone on a local network? */ + /* Is this for someone on a local network? */ local_ni = lnet_net2ni_locked(LNET_NIDNET(dst_nid)); if (local_ni != NULL) { @@ -1221,7 +1221,7 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) src_nid = lnet_ptlcompat_srcnid(src_nid, dst_nid); msg->msg_hdr.src_nid = cpu_to_le64(src_nid); } - + if (src_ni == the_lnet.ln_loni) { /* No send credit hassles with LOLND */ LNET_UNLOCK(); @@ -1229,7 +1229,7 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) lnet_ni_decref(src_ni); return 0; } - + rc = lnet_nid2peer_locked(&lp, dst_nid); lnet_ni_decref_locked(src_ni); /* lp has ref on src_ni; lose mine */ if (rc != 0) { @@ -1270,7 +1270,7 @@ lnet_send(lnet_nid_t src_nid, lnet_msg_t *msg) if (src_ni != NULL) lnet_ni_decref_locked(src_ni); LNET_UNLOCK(); - CERROR("No route to %s (all routers down)\n", + CERROR("No route to %s (all routers down)\n", libcfs_id2str(msg->msg_target)); return -EHOSTUNREACH; } @@ -1355,12 +1355,12 @@ lnet_drop_message (lnet_ni_t *ni, void *private, unsigned int nob) the_lnet.ln_counters.drop_count++; the_lnet.ln_counters.drop_length += nob; LNET_UNLOCK(); - + lnet_ni_recv(ni, private, NULL, 0, 0, 0, nob); } static void -lnet_drop_delayed_put(lnet_msg_t *msg, char *reason) +lnet_drop_delayed_put(lnet_msg_t *msg, char *reason) { LASSERT (msg->msg_md == NULL); LASSERT (msg->msg_delayed); @@ -1369,11 +1369,11 @@ lnet_drop_delayed_put(lnet_msg_t *msg, char *reason) CWARN("Dropping delayed PUT from %s portal %d 
match "LPU64 " offset %d length %d: %s\n", - libcfs_id2str((lnet_process_id_t){ + libcfs_id2str((lnet_process_id_t){ .nid = msg->msg_hdr.src_nid, .pid = msg->msg_hdr.src_pid}), - msg->msg_hdr.msg.put.ptl_index, - msg->msg_hdr.msg.put.match_bits, + msg->msg_hdr.msg.put.ptl_index, + msg->msg_hdr.msg.put.match_bits, msg->msg_hdr.msg.put.offset, msg->msg_hdr.payload_length, reason); @@ -1382,16 +1382,16 @@ lnet_drop_delayed_put(lnet_msg_t *msg, char *reason) * called lnet_drop_message(), so I just hang onto msg as well * until that's done */ - lnet_drop_message(msg->msg_rxpeer->lp_ni, + lnet_drop_message(msg->msg_rxpeer->lp_ni, msg->msg_private, msg->msg_len); LNET_LOCK(); lnet_peer_decref_locked(msg->msg_rxpeer); msg->msg_rxpeer = NULL; - + lnet_msg_free(msg); - + LNET_UNLOCK(); } @@ -1444,7 +1444,7 @@ LNetClearLazyPortal(int portal) ptl->ptl_options &= ~LNET_PTL_LAZY; LNET_UNLOCK(); - + while (!list_empty(&zombies)) { msg = list_entry(zombies.next, lnet_msg_t, msg_list); list_del(&msg->msg_list); @@ -1480,10 +1480,10 @@ lnet_recv_put(lnet_libmd_t *md, lnet_msg_t *msg, int delayed, * it back into the ACK during lnet_finalize() */ msg->msg_ack = (!lnet_is_wire_handle_none(&hdr->msg.put.ack_wmd) && (md->md_options & LNET_MD_ACK_DISABLE) == 0); - - lnet_ni_recv(msg->msg_rxpeer->lp_ni, - msg->msg_private, - msg, delayed, offset, mlength, + + lnet_ni_recv(msg->msg_rxpeer->lp_ni, + msg->msg_private, + msg, delayed, offset, mlength, hdr->payload_length); } @@ -1527,16 +1527,16 @@ lnet_match_blocked_msg(lnet_libmd_t *md) src.pid = hdr->src_pid; rc = lnet_try_match_md(index, LNET_MD_OP_PUT, src, - hdr->payload_length, - hdr->msg.put.offset, - hdr->msg.put.match_bits, + hdr->payload_length, + hdr->msg.put.offset, + hdr->msg.put.match_bits, md, msg, &mlength, &offset); if (rc == LNET_MATCHMD_NONE) continue; - + /* Hurrah! 
This _is_ a match */ - list_del(&msg->msg_list); + list_del(&msg->msg_list); ptl->ptl_msgq_version++; if (rc == LNET_MATCHMD_OK) { @@ -1545,8 +1545,8 @@ lnet_match_blocked_msg(lnet_libmd_t *md) CDEBUG(D_NET, "Resuming delayed PUT from %s portal %d " "match "LPU64" offset %d length %d.\n", libcfs_id2str(src), - hdr->msg.put.ptl_index, - hdr->msg.put.match_bits, + hdr->msg.put.ptl_index, + hdr->msg.put.match_bits, hdr->msg.put.offset, hdr->payload_length); } else { @@ -1564,7 +1564,7 @@ lnet_match_blocked_msg(lnet_libmd_t *md) list_for_each_safe (entry, tmp, &drops) { msg = list_entry(entry, lnet_msg_t, msg_list); - list_del(&msg->msg_list); + list_del(&msg->msg_list); lnet_drop_delayed_put(msg, "Bad match"); } @@ -1572,7 +1572,7 @@ lnet_match_blocked_msg(lnet_libmd_t *md) list_for_each_safe (entry, tmp, &matches) { msg = list_entry(entry, lnet_msg_t, msg_list); - list_del(&msg->msg_list); + list_del(&msg->msg_list); /* md won't disappear under me, since each msg * holds a ref on it */ @@ -1613,26 +1613,26 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) switch (rc) { default: LBUG(); - + case LNET_MATCHMD_OK: LNET_UNLOCK(); lnet_recv_put(md, msg, 0, offset, mlength); return 0; - + case LNET_MATCHMD_NONE: rc = lnet_eager_recv_locked(msg); if (rc == 0 && !the_lnet.ln_shutdown) { - list_add_tail(&msg->msg_list, + list_add_tail(&msg->msg_list, &the_lnet.ln_portals[index].ptl_msgq); the_lnet.ln_portals[index].ptl_msgq_version++; CDEBUG(D_NET, "Delaying PUT from %s portal %d match " LPU64" offset %d length %d: no match \n", - libcfs_id2str(src), index, - hdr->msg.put.match_bits, + libcfs_id2str(src), index, + hdr->msg.put.match_bits, hdr->msg.put.offset, rlength); - + LNET_UNLOCK(); return 0; } @@ -1642,8 +1642,8 @@ lnet_parse_put(lnet_ni_t *ni, lnet_msg_t *msg) CDEBUG(D_NETERROR, "Dropping PUT from %s portal %d match "LPU64 " offset %d length %d: %d\n", - libcfs_id2str(src), index, - hdr->msg.put.match_bits, + libcfs_id2str(src), index, + hdr->msg.put.match_bits, 
hdr->msg.put.offset, rlength, rc); LNET_UNLOCK(); @@ -1679,9 +1679,9 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) CDEBUG(D_NETERROR, "Dropping GET from %s portal %d match "LPU64 " offset %d length %d\n", - libcfs_id2str(src), - hdr->msg.get.ptl_index, - hdr->msg.get.match_bits, + libcfs_id2str(src), + hdr->msg.get.ptl_index, + hdr->msg.get.match_bits, hdr->msg.get.src_offset, hdr->msg.get.sink_length); LNET_UNLOCK(); @@ -1689,7 +1689,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) } LASSERT (rc == LNET_MATCHMD_OK); - + the_lnet.ln_counters.send_count++; the_lnet.ln_counters.send_length += mlength; @@ -1715,7 +1715,7 @@ lnet_parse_get(lnet_ni_t *ni, lnet_msg_t *msg, int rdma_get) lnet_ni_recv(ni, msg->msg_private, NULL, 0, 0, 0, 0); msg->msg_receiving = 0; - + rc = lnet_send(ni->ni_nid, msg); if (rc < 0) { /* didn't get as far as lnet_ni_send() */ @@ -1956,20 +1956,20 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, case LNET_MSG_GET: if (payload_length > 0) { CERROR("%s, src %s: bad %s payload %d (0 expected)\n", - libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), + libcfs_nid2str(from_nid), + libcfs_nid2str(src_nid), lnet_msgtyp2str(type), payload_length); return -EPROTO; } break; - + case LNET_MSG_PUT: case LNET_MSG_REPLY: if (payload_length > (for_me ? LNET_MAX_PAYLOAD : LNET_MTU)) { CERROR("%s, src %s: bad %s payload %d " - "(%d max expected)\n", + "(%d max expected)\n", libcfs_nid2str(from_nid), - libcfs_nid2str(src_nid), + libcfs_nid2str(src_nid), lnet_msgtyp2str(type), payload_length, for_me ? LNET_MAX_PAYLOAD : LNET_MTU); @@ -2030,7 +2030,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, libcfs_nid2str(dest_nid)); return -EPROTO; } - + if (!the_lnet.ln_routing) { CERROR ("%s, src %s: Dropping message for %s " "(routing not enabled)\n", @@ -2062,7 +2062,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, } /* msg zeroed in lnet_msg_alloc; i.e. 
flags all clear, pointers NULL etc */ - + msg->msg_type = type; msg->msg_private = private; msg->msg_receiving = 1; @@ -2100,7 +2100,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, goto free_drop; } } - + lnet_commit_routedmsg(msg); rc = lnet_post_routed_recv_locked(msg, 0); LNET_UNLOCK(); @@ -2120,7 +2120,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, msg->msg_hdr.payload_length = payload_length; msg->msg_ev.sender = from_nid; - + switch (type) { case LNET_MSG_ACK: rc = lnet_parse_ack(ni, msg); @@ -2141,7 +2141,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, if (rc == 0) return 0; - + LASSERT (rc == ENOENT); free_drop: @@ -2162,7 +2162,7 @@ lnet_parse(lnet_ni_t *ni, lnet_hdr_t *hdr, lnet_nid_t from_nid, int LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, lnet_process_id_t target, unsigned int portal, - __u64 match_bits, unsigned int offset, + __u64 match_bits, unsigned int offset, __u64 hdr_data) { lnet_msg_t *msg; @@ -2171,7 +2171,7 @@ LNetPut(lnet_nid_t self, lnet_handle_md_t mdh, lnet_ack_req_t ack, LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ fail_peer (target.nid, 1)) /* shall we now? */ { @@ -2333,7 +2333,7 @@ lnet_set_reply_msg_len(lnet_ni_t *ni, lnet_msg_t *reply, unsigned int len) /* NB I trusted my peer to RDMA. If she tells me she's written beyond * the end of my buffer, I might as well be dead. */ LASSERT (len <= reply->msg_ev.mlength); - + reply->msg_ev.mlength = len; } @@ -2348,7 +2348,7 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, LASSERT (the_lnet.ln_init); LASSERT (the_lnet.ln_refcount > 0); - + if (!list_empty (&the_lnet.ln_test_peers) && /* normally we don't */ fail_peer (target.nid, 1)) /* shall we now? 
*/ { @@ -2426,7 +2426,7 @@ LNetGet(lnet_nid_t self, lnet_handle_md_t mdh, int LNetDist (lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) { - struct list_head *e; + struct list_head *e; lnet_ni_t *ni; lnet_route_t *route; lnet_remotenet_t *rnet; @@ -2446,7 +2446,7 @@ LNetDist (lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) list_for_each (e, &the_lnet.ln_nis) { ni = list_entry(e, lnet_ni_t, ni_list); - + if (ni->ni_nid == dstnid || (the_lnet.ln_ptlcompat > 0 && LNET_NIDNET(dstnid) == 0 && @@ -2481,7 +2481,7 @@ LNetDist (lnet_nid_t dstnid, lnet_nid_t *srcnidp, __u32 *orderp) } list_for_each (e, &the_lnet.ln_remote_nets) { - rnet = list_entry(e, lnet_remotenet_t, lrn_list); + rnet = list_entry(e, lnet_remotenet_t, lrn_list); if (rnet->lrn_net == dstnet) { LASSERT (!list_empty(&rnet->lrn_routes)); @@ -2517,9 +2517,9 @@ LNetSetAsync(lnet_process_id_t id, int nasync) int maxnids = 256; int rc = 0; int rc2; - + /* Target on a local network? */ - + ni = lnet_net2ni(LNET_NIDNET(id.nid)); if (ni != NULL) { if (ni->ni_lnd->lnd_setasync != NULL) @@ -2546,7 +2546,7 @@ LNetSetAsync(lnet_process_id_t id, int nasync) maxnids *= 2; goto again; } - + route = list_entry(tmp, lnet_route_t, lr_list); nids[nnids++] = route->lr_gateway->lp_nid; } @@ -2561,7 +2561,7 @@ LNetSetAsync(lnet_process_id_t id, int nasync) ni = lnet_net2ni(LNET_NIDNET(id.nid)); if (ni == NULL) continue; - + if (ni->ni_lnd->lnd_setasync != NULL) { rc2 = (ni->ni_lnd->lnd_setasync)(ni, id, nasync); if (rc2 != 0) diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index c46ad1a4c6a4ccad7b89113484d492e87a09ec70..84dbbeea2962495713daefc54ba99add378d540e 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -26,6 +26,18 @@ #include <lnet/lib-lnet.h> +void +lnet_build_unlink_event (lnet_libmd_t *md, lnet_event_t *ev) +{ + memset(ev, 0, sizeof(*ev)); + + ev->status = 0; + ev->unlinked = 1; + ev->type = LNET_EVENT_UNLINK; + lnet_md_deconstruct(md, &ev->md); + lnet_md2handle(&ev->md_handle, md); +} + 
void lnet_enq_event_locked (lnet_eq_t *eq, lnet_event_t *ev) { @@ -79,18 +91,18 @@ lnet_complete_msg_locked(lnet_msg_t *msg) msg->msg_ack = 0; LNET_UNLOCK(); - + LASSERT(msg->msg_ev.type == LNET_EVENT_PUT); LASSERT(!msg->msg_routing); ack_wmd = msg->msg_hdr.msg.put.ack_wmd; - + lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.initiator, 0, 0); msg->msg_hdr.msg.ack.dst_wmd = ack_wmd; msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits; msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength); - + rc = lnet_send(msg->msg_ev.target.nid, msg); LNET_LOCK(); @@ -167,12 +179,12 @@ lnet_finalize (lnet_ni_t *ni, lnet_msg_t *msg, int status) LASSERT (md->md_refcount >= 0); unlink = lnet_md_unlinkable(md); - + msg->msg_ev.unlinked = unlink; - + if (md->md_eq != NULL) lnet_enq_event_locked(md->md_eq, &msg->msg_ev); - + if (unlink) lnet_md_unlink(md); diff --git a/lnet/lnet/router.c b/lnet/lnet/router.c index 01b635aa8343a74dcfa2923b5c1a883748481845..881f4adc06463c6147802c590e021184f8cf0c5a 100644 --- a/lnet/lnet/router.c +++ b/lnet/lnet/router.c @@ -94,14 +94,14 @@ lnet_notify_locked(lnet_peer_t *lp, int notifylnd, int alive, time_t when) } void -lnet_do_notify (lnet_peer_t *lp) +lnet_do_notify (lnet_peer_t *lp) { lnet_ni_t *ni = lp->lp_ni; int alive; int notifylnd; - + LNET_LOCK(); - + /* Notify only in 1 thread at any time to ensure ordered notification. 
* NB individual events can be missed; the only guarantee is that you * always get the most recent news */ @@ -112,7 +112,7 @@ lnet_do_notify (lnet_peer_t *lp) } lp->lp_notifying = 1; - + while (lp->lp_notify) { alive = lp->lp_alive; notifylnd = lp->lp_notifylnd; @@ -173,7 +173,7 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) CDEBUG(D_NET, "Auto-down disabled\n"); return 0; } - + LNET_LOCK(); lp = lnet_find_peer_locked(nid); @@ -187,9 +187,9 @@ lnet_notify (lnet_ni_t *ni, lnet_nid_t nid, int alive, time_t when) lnet_notify_locked(lp, ni == NULL, alive, when); LNET_UNLOCK(); - + lnet_do_notify(lp); - + LNET_LOCK(); lnet_peer_decref_locked(lp); @@ -271,11 +271,11 @@ int lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) { struct list_head zombies; - struct list_head *e; - lnet_remotenet_t *rnet; - lnet_remotenet_t *rnet2; - lnet_route_t *route; - lnet_route_t *route2; + struct list_head *e; + lnet_remotenet_t *rnet; + lnet_remotenet_t *rnet2; + lnet_route_t *route; + lnet_route_t *route2; lnet_ni_t *ni; int add_route; int rc; @@ -363,11 +363,11 @@ lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) LASSERT (route2->lr_gateway->lp_nid != gateway); } } - + if (add_route) { ni = route->lr_gateway->lp_ni; lnet_ni_addref_locked(ni); - + LASSERT (rc == 0); list_add_tail(&route->lr_list, &rnet2->lrn_routes); the_lnet.ln_remote_nets_version++; @@ -393,7 +393,7 @@ lnet_add_route (__u32 net, unsigned int hops, lnet_nid_t gateway) while (!list_empty(&zombies)) { route = list_entry(zombies.next, lnet_route_t, lr_list); list_del(&route->lr_list); - + LNET_LOCK(); lnet_rtr_decref_locked(route->lr_gateway); lnet_peer_decref_locked(route->lr_gateway); @@ -427,7 +427,7 @@ lnet_check_routes (void) else if (route->lr_gateway->lp_ni != route2->lr_gateway->lp_ni) { LNET_UNLOCK(); - + CERROR("Routes to %s via %s and %s not supported\n", libcfs_net2str(rnet->lrn_net), libcfs_nid2str(route->lr_gateway->lp_nid), @@ -436,7 +436,7 @@ 
lnet_check_routes (void) } } } - + LNET_UNLOCK(); return 0; } @@ -509,8 +509,8 @@ int lnet_get_route (int idx, __u32 *net, __u32 *hops, lnet_nid_t *gateway, __u32 *alive) { - struct list_head *e1; - struct list_head *e2; + struct list_head *e1; + struct list_head *e2; lnet_remotenet_t *rnet; lnet_route_t *route; @@ -551,14 +551,14 @@ lnet_router_checker_event (lnet_event_t *event) /* The router checker thread has unlinked the rc_md * and exited. */ LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKING); - the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKED; - mutex_up(&the_lnet.ln_rc_signal); + the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKED; + mutex_up(&the_lnet.ln_rc_signal); return; } - LASSERT (event->type == LNET_EVENT_SEND || + LASSERT (event->type == LNET_EVENT_SEND || event->type == LNET_EVENT_REPLY); - + nid = (event->type == LNET_EVENT_SEND) ? event->target.nid : event->initiator.nid; @@ -575,7 +575,7 @@ lnet_router_checker_event (lnet_event_t *event) if (lnet_isrouter(lp) && /* ignore if no longer a router */ (event->status != 0 || event->type == LNET_EVENT_REPLY)) { - + /* A successful REPLY means the router is up. 
If _any_ comms * to the router fail I assume it's down (this will happen if * we ping alive routers to try to detect router death before @@ -611,8 +611,8 @@ lnet_router_checker(void *arg) lnet_process_id_t rtr_id; int secs; - cfs_daemonize("router_checker"); - cfs_block_allsigs(); + cfs_daemonize("router_checker"); + cfs_block_allsigs(); rtr_id.pid = LUSTRE_SRV_LNET_PID; @@ -638,7 +638,7 @@ lnet_router_checker(void *arg) the_lnet.ln_rc_state = LNET_RC_STATE_RUNNING; mutex_up(&the_lnet.ln_rc_signal); /* let my parent go */ - while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { + while (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING) { __u64 version; LNET_LOCK(); @@ -668,7 +668,7 @@ rescan: } if (secs <= 0) secs = 0; - + if (secs != 0 && !rtr->lp_ping_notsent && now > rtr->lp_ping_timestamp + secs) { @@ -690,7 +690,7 @@ rescan: LNET_RESERVED_PORTAL, LNET_PROTO_PING_MATCHBITS, 0); } - + LNET_LOCK(); lnet_peer_decref_locked(rtr); @@ -708,17 +708,16 @@ rescan: set_current_state(CFS_TASK_INTERRUPTIBLE); cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, cfs_time_seconds(1)); - } + } LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_STOPTHREAD); the_lnet.ln_rc_state = LNET_RC_STATE_UNLINKING; - + rc = LNetMDUnlink(mdh); LASSERT (rc == 0); /* The unlink event callback will signal final completion */ - - return 0; + return 0; } @@ -731,11 +730,11 @@ lnet_wait_known_routerstate(void) for (;;) { LNET_LOCK(); - + all_known = 1; list_for_each (entry, &the_lnet.ln_routers) { rtr = list_entry(entry, lnet_peer_t, lp_rtr_list); - + if (rtr->lp_alive_count == 0) { all_known = 0; break; @@ -763,14 +762,14 @@ lnet_router_checker_stop(void) return; the_lnet.ln_rc_state = LNET_RC_STATE_STOPTHREAD; - /* block until event callback signals exit */ - mutex_down(&the_lnet.ln_rc_signal); + /* block until event callback signals exit */ + mutex_down(&the_lnet.ln_rc_signal); LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_UNLINKED); rc = LNetEQFree(the_lnet.ln_rc_eqh); LASSERT (rc == 0); - + 
the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; } @@ -788,12 +787,12 @@ lnet_router_checker_start(void) "\n"); return -EINVAL; } - + if (live_router_check_interval <= 0 && dead_router_check_interval <= 0) return 0; - init_mutex_locked(&the_lnet.ln_rc_signal); + init_mutex_locked(&the_lnet.ln_rc_signal); /* EQ size doesn't matter; the callback is guaranteed to get every * event */ @@ -804,20 +803,20 @@ lnet_router_checker_start(void) return -ENOMEM; } - rc = (int)cfs_kernel_thread(lnet_router_checker, NULL, 0); - if (rc < 0) { - CERROR("Can't start router checker thread: %d\n", rc); + rc = (int)cfs_kernel_thread(lnet_router_checker, NULL, 0); + if (rc < 0) { + CERROR("Can't start router checker thread: %d\n", rc); goto failed; - } + } - mutex_down(&the_lnet.ln_rc_signal); /* wait for checker to startup */ + mutex_down(&the_lnet.ln_rc_signal); /* wait for checker to startup */ rc = the_lnet.ln_rc_state; if (rc < 0) { the_lnet.ln_rc_state = LNET_RC_STATE_SHUTDOWN; goto failed; } - + LASSERT (the_lnet.ln_rc_state == LNET_RC_STATE_RUNNING); if (check_routers_before_use) { @@ -826,9 +825,9 @@ lnet_router_checker_start(void) * may have to a previous instance of me. 
*/ lnet_wait_known_routerstate(); } - + return 0; - + failed: rc = LNetEQFree(the_lnet.ln_rc_eqh); LASSERT (rc == 0); @@ -915,7 +914,7 @@ lnet_rtrpool_alloc_bufs(lnet_rtrbufpool_t *rbp, int nbufs) LASSERT (rbp->rbp_nbuffers == nbufs); return 0; } - + for (i = 0; i < nbufs; i++) { rb = lnet_new_rtrbuf(rbp); @@ -974,7 +973,7 @@ int lnet_alloc_rtrpools(int im_a_router) { int rc; - + if (!strcmp(forwarding, "")) { /* not set either way */ if (!im_a_router) @@ -989,7 +988,7 @@ lnet_alloc_rtrpools(int im_a_router) "'enabled' or 'disabled'\n"); return -EINVAL; } - + if (tiny_router_buffers <= 0) { LCONSOLE_ERROR_MSG(0x10c, "tiny_router_buffers=%d invalid when " "routing enabled\n", tiny_router_buffers); @@ -1029,7 +1028,7 @@ lnet_alloc_rtrpools(int im_a_router) LNET_LOCK(); the_lnet.ln_routing = 1; LNET_UNLOCK(); - + return 0; failed: diff --git a/lnet/selftest/rpc.c b/lnet/selftest/rpc.c index 2b17e55766bba0c5c5bdcca5750b9c56bb1bb705..4d8d6530294858f39c1059c15e3bad9bcfdc38fd 100644 --- a/lnet/selftest/rpc.c +++ b/lnet/selftest/rpc.c @@ -11,6 +11,8 @@ #include "selftest.h" +#define SRPC_PEER_HASH_SIZE 101 /* # peer lists */ + typedef enum { SRPC_STATE_NONE, SRPC_STATE_NI_INIT, @@ -20,9 +22,6 @@ typedef enum { SRPC_STATE_STOPPING, } srpc_state_t; -#define SRPC_PEER_HASH_SIZE 101 /* # peer lists */ -#define SRPC_PEER_CREDITS 16 /* >= most LND's default peer credit */ - struct smoketest_rpc { spinlock_t rpc_glock; /* global lock */ srpc_service_t *rpc_services[SRPC_SERVICE_MAX_ID + 1]; @@ -33,6 +32,10 @@ struct smoketest_rpc { __u64 rpc_matchbits; /* matchbits counter */ } srpc_data; +static int srpc_peer_credits = 16; +CFS_MODULE_PARM(srpc_peer_credits, "i", int, 0444, + "# in-flight RPCs per peer (16 by default)"); + /* forward ref's */ int srpc_handle_rpc (swi_workitem_t *wi); @@ -171,7 +174,7 @@ srpc_create_peer (lnet_nid_t nid) memset(peer, 0, sizeof(srpc_peer_t)); peer->stp_nid = nid; - peer->stp_credits = SRPC_PEER_CREDITS; + peer->stp_credits = srpc_peer_credits; 
spin_lock_init(&peer->stp_lock); CFS_INIT_LIST_HEAD(&peer->stp_rpcq); @@ -198,8 +201,8 @@ srpc_find_peer_locked (lnet_nid_t nid) static srpc_peer_t * srpc_nid2peer (lnet_nid_t nid) { - srpc_peer_t *peer; - srpc_peer_t *new_peer; + srpc_peer_t *peer; + srpc_peer_t *new_peer; spin_lock(&srpc_data.rpc_glock); peer = srpc_find_peer_locked(nid); @@ -207,7 +210,7 @@ srpc_nid2peer (lnet_nid_t nid) if (peer != NULL) return peer; - + new_peer = srpc_create_peer(nid); spin_lock(&srpc_data.rpc_glock); @@ -225,7 +228,7 @@ srpc_nid2peer (lnet_nid_t nid) spin_unlock(&srpc_data.rpc_glock); return NULL; } - + list_add_tail(&new_peer->stp_list, srpc_nid2peerlist(nid)); spin_unlock(&srpc_data.rpc_glock); return new_peer; @@ -375,7 +378,7 @@ srpc_post_passive_rdma(int portal, __u64 matchbits, void *buf, } int -srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, +srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, int options, lnet_process_id_t peer, lnet_nid_t self, lnet_handle_md_t *mdh, srpc_event_t *ev) { @@ -399,11 +402,11 @@ srpc_post_active_rdma(int portal, __u64 matchbits, void *buf, int len, /* this is kind of an abuse of the LNET_MD_OP_{PUT,GET} options. * they're only meaningful for MDs attached to an ME (i.e. passive * buffers... 
*/ - if ((options & LNET_MD_OP_PUT) != 0) { + if ((options & LNET_MD_OP_PUT) != 0) { rc = LNetPut(self, *mdh, LNET_NOACK_REQ, peer, portal, matchbits, 0, 0); } else { - LASSERT ((options & LNET_MD_OP_GET) != 0); + LASSERT ((options & LNET_MD_OP_GET) != 0); rc = LNetGet(self, *mdh, peer, portal, matchbits, 0); } @@ -438,7 +441,7 @@ srpc_post_active_rqtbuf(lnet_process_id_t peer, int service, void *buf, else portal = SRPC_FRAMEWORK_REQUEST_PORTAL; - rc = srpc_post_active_rdma(portal, service, buf, len, + rc = srpc_post_active_rdma(portal, service, buf, len, LNET_MD_OP_PUT, peer, LNET_NID_ANY, mdh, ev); return rc; @@ -506,7 +509,7 @@ srpc_service_post_buffer (srpc_service_t *sv, srpc_buffer_t *buf) spin_unlock(&sv->sv_lock); LIBCFS_FREE(buf, sizeof(*buf)); spin_lock(&sv->sv_lock); - return rc; + return rc; } int @@ -918,7 +921,8 @@ srpc_handle_rpc (swi_workitem_t *wi) } } case SWI_STATE_BULK_STARTED: - LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired); + /* we cannot LASSERT ev_fired right here because it + * may be set only upon an event with unlinked==1 */ if (rpc->srpc_bulk != NULL) { rc = ev->ev_status; @@ -927,11 +931,20 @@ srpc_handle_rpc (swi_workitem_t *wi) rc = (*sv->sv_bulk_ready) (rpc, rc); if (rc != 0) { - srpc_server_rpc_done(rpc, rc); - return 1; + if (ev->ev_fired) { + srpc_server_rpc_done(rpc, rc); + return 1; + } + + rpc->srpc_status = rc; + wi->wi_state = SWI_STATE_BULK_ERRORED; + LNetMDUnlink(rpc->srpc_bulk->bk_mdh); + return 0; /* wait for UNLINK event */ } } + LASSERT (rpc->srpc_bulk == NULL || ev->ev_fired); + wi->wi_state = SWI_STATE_REPLY_SUBMITTED; rc = srpc_send_reply(rpc); if (rc == 0) @@ -945,6 +958,13 @@ srpc_handle_rpc (swi_workitem_t *wi) wi->wi_state = SWI_STATE_DONE; srpc_server_rpc_done(rpc, ev->ev_status); return 1; + + case SWI_STATE_BULK_ERRORED: + LASSERT (rpc->srpc_bulk != NULL && ev->ev_fired); + LASSERT (rpc->srpc_status != 0); + + srpc_server_rpc_done(rpc, rpc->srpc_status); + return 1; } return 0; @@ -982,20 +1002,20 @@ 
srpc_add_client_rpc_timer (srpc_client_rpc_t *rpc) CFS_INIT_LIST_HEAD(&timer->stt_list); timer->stt_data = rpc; timer->stt_func = srpc_client_rpc_expired; - timer->stt_expires = cfs_time_add(rpc->crpc_timeout, + timer->stt_expires = cfs_time_add(rpc->crpc_timeout, cfs_time_current_sec()); stt_add_timer(timer); return; } -/* +/* * Called with rpc->crpc_lock held. * * Upon exit the RPC expiry timer is not queued and the handler is not * running on any CPU. */ void srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc) -{ +{ /* timer not planted or already exploded */ if (rpc->crpc_timeout == 0) return; @@ -1007,7 +1027,7 @@ srpc_del_client_rpc_timer (srpc_client_rpc_t *rpc) while (rpc->crpc_timeout != 0) { spin_unlock(&rpc->crpc_lock); - cfs_schedule(); + cfs_schedule(); spin_lock(&rpc->crpc_lock); } @@ -1075,7 +1095,7 @@ srpc_client_rpc_done (srpc_client_rpc_t *rpc, int status) * No one can schedule me now since: * - RPC timer has been defused. * - all LNet events have been fired. - * - crpc_closed has been set, preventing srpc_abort_rpc from + * - crpc_closed has been set, preventing srpc_abort_rpc from * scheduling me. * Cancel pending schedules and prevent future schedule attempts: */ @@ -1133,7 +1153,7 @@ srpc_send_rpc (swi_workitem_t *wi) case SWI_STATE_REQUEST_SUBMITTED: /* CAVEAT EMPTOR: rqtev, rpyev, and bulkev may come in any - * order; however, they're processed in a strict order: + * order; however, they're processed in a strict order: * rqt, rpy, and bulk. 
*/ if (!rpc->crpc_reqstev.ev_fired) break; @@ -1150,7 +1170,7 @@ srpc_send_rpc (swi_workitem_t *wi) rc = rpc->crpc_replyev.ev_status; if (rc != 0) break; - if ((reply->msg_type != type && + if ((reply->msg_type != type && reply->msg_type != __swab32(type)) || (reply->msg_magic != SRPC_MSG_MAGIC && reply->msg_magic != __swab32(SRPC_MSG_MAGIC))) { @@ -1219,7 +1239,7 @@ srpc_create_client_rpc (lnet_process_id_t peer, int service, { srpc_client_rpc_t *rpc; - LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t, + LIBCFS_ALLOC(rpc, offsetof(srpc_client_rpc_t, crpc_bulk.bk_iovs[nbulkiov])); if (rpc == NULL) return NULL; @@ -1368,7 +1388,7 @@ srpc_send_reply (srpc_server_rpc_t *rpc) } /* when in kernel always called with LNET_LOCK() held, and in thread context */ -void +void srpc_lnet_ev_handler (lnet_event_t *ev) { srpc_event_t *rpcev = ev->md.user_ptr; @@ -1378,6 +1398,7 @@ srpc_lnet_ev_handler (lnet_event_t *ev) srpc_service_t *sv; srpc_msg_t *msg; srpc_msg_type_t type; + int fired_flag = 1; LASSERT (!in_interrupt()); @@ -1410,7 +1431,7 @@ srpc_lnet_ev_handler (lnet_event_t *ev) LASSERT (rpcev->ev_fired == 0); rpcev->ev_fired = 1; - rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? -EINTR : ev->status; swi_schedule_workitem(&crpc->crpc_wi); @@ -1438,7 +1459,7 @@ srpc_lnet_ev_handler (lnet_event_t *ev) LASSERT (sv->sv_nposted_msg >= 0); if (sv->sv_shuttingdown) { - /* Leave buffer on sv->sv_posted_msgq since + /* Leave buffer on sv->sv_posted_msgq since * srpc_finish_service needs to traverse it. 
*/ spin_unlock(&sv->sv_lock); break; @@ -1449,7 +1470,7 @@ srpc_lnet_ev_handler (lnet_event_t *ev) type = srpc_service2request(sv->sv_id); if (ev->status != 0 || ev->mlength != sizeof(*msg) || - (msg->msg_type != type && + (msg->msg_type != type && msg->msg_type != __swab32(type)) || (msg->msg_magic != SRPC_MSG_MAGIC && msg->msg_magic != __swab32(SRPC_MSG_MAGIC))) { @@ -1499,10 +1520,13 @@ srpc_lnet_ev_handler (lnet_event_t *ev) ev->type == LNET_EVENT_REPLY || ev->type == LNET_EVENT_UNLINK); - if (ev->type == LNET_EVENT_SEND && - ev->status == 0 && !ev->unlinked) - break; /* wait for the final LNET_EVENT_REPLY */ - + if (ev->type == LNET_EVENT_SEND && !ev->unlinked) { + if (ev->status == 0) + break; /* wait for the final LNET_EVENT_REPLY */ + else + fired_flag = 0; /* LNET_EVENT_REPLY may arrive + (optimized GET case) */ + } case SRPC_BULK_PUT_SENT: if (ev->status == 0 && ev->type != LNET_EVENT_UNLINK) { spin_lock(&srpc_data.rpc_glock); @@ -1521,9 +1545,12 @@ srpc_lnet_ev_handler (lnet_event_t *ev) LASSERT (rpcev == &srpc->srpc_ev); spin_lock(&sv->sv_lock); - rpcev->ev_fired = 1; - rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? + if (fired_flag) + rpcev->ev_fired = 1; + + rpcev->ev_status = (ev->type == LNET_EVENT_UNLINK) ? -EINTR : ev->status; + srpc_schedule_server_rpc(srpc); spin_unlock(&sv->sv_lock); break; @@ -1544,15 +1571,15 @@ srpc_check_event (int timeout) rc = LNetEQPoll(&srpc_data.rpc_lnet_eq, 1, timeout * 1000, &ev, &i); if (rc == 0) return 0; - + LASSERT (rc == -EOVERFLOW || rc == 1); - + /* We can't affort to miss any events... */ if (rc == -EOVERFLOW) { CERROR ("Dropped an event!!!\n"); abort(); } - + srpc_lnet_ev_handler(&ev); return 1; } @@ -1565,6 +1592,18 @@ srpc_startup (void) int i; int rc; +#ifndef __KERNEL__ + char *s; + + s = getenv("SRPC_PEER_CREDITS"); + srpc_peer_credits = (s != NULL) ? 
atoi(s) : srpc_peer_credits; +#endif + + if (srpc_peer_credits <= 0) { + CERROR("Peer credits must be positive: %d\n", srpc_peer_credits); + return -EINVAL; + } + memset(&srpc_data, 0, sizeof(struct smoketest_rpc)); spin_lock_init(&srpc_data.rpc_glock); @@ -1687,7 +1726,7 @@ srpc_shutdown (void) LASSERT (list_empty(&peer->stp_rpcq)); LASSERT (list_empty(&peer->stp_ctl_rpcq)); - LASSERT (peer->stp_credits == SRPC_PEER_CREDITS); + LASSERT (peer->stp_credits == srpc_peer_credits); LIBCFS_FREE(peer, sizeof(srpc_peer_t)); } diff --git a/lnet/selftest/selftest.h b/lnet/selftest/selftest.h index dd977265a4d480ca3cc453ed203898b218223c1e..ee4e8e8c06f8a179d29bc1788be9a80ef772d402 100644 --- a/lnet/selftest/selftest.h +++ b/lnet/selftest/selftest.h @@ -11,8 +11,31 @@ #define LNET_ONLY #ifndef __KERNEL__ -#include <liblustre.h> /* userland spinlock_t and atomic_t */ + +/* XXX workaround XXX */ +#ifdef HAVE_SYS_TYPES_H +#include <sys/types.h> #endif + +/* TODO: remove these when libcfs provides proper primitives for userspace + * + * Dummy implementations of spinlock_t and atomic_t work since userspace + * selftest is completely single-threaded, even using multi-threaded usocklnd. 
+ */ +typedef struct { } spinlock_t; +static inline void spin_lock(spinlock_t *l) {return;} +static inline void spin_unlock(spinlock_t *l) {return;} +static inline void spin_lock_init(spinlock_t *l) {return;} + +typedef struct { volatile int counter; } atomic_t; +#define atomic_read(a) ((a)->counter) +#define atomic_set(a,b) do {(a)->counter = b; } while (0) +#define atomic_dec_and_test(a) ((--((a)->counter)) == 0) +#define atomic_inc(a) (((a)->counter)++) +#define atomic_dec(a) do { (a)->counter--; } while (0) + +#endif + #include <libcfs/kp30.h> #include <libcfs/libcfs.h> #include <lnet/lnet.h> @@ -35,6 +58,7 @@ #define SWI_STATE_REQUEST_SENT 4 #define SWI_STATE_REPLY_RECEIVED 5 #define SWI_STATE_BULK_STARTED 6 +#define SWI_STATE_BULK_ERRORED 7 #define SWI_STATE_DONE 10 /* forward refs */ @@ -50,11 +74,11 @@ struct sfw_test_instance; * serialized with respect to itself. * - no CPU affinity, a workitem does not necessarily run on the same CPU * that schedules it. However, this might change in the future. - * - if a workitem is scheduled again before it has a chance to run, it + * - if a workitem is scheduled again before it has a chance to run, it * runs only once. - * - if a workitem is scheduled while it runs, it runs again after it - * completes; this ensures that events occurring while other events are - * being processed receive due attention. This behavior also allows a + * - if a workitem is scheduled while it runs, it runs again after it + * completes; this ensures that events occurring while other events are + * being processed receive due attention. This behavior also allows a * workitem to reschedule itself. 
* * Usage notes: @@ -334,7 +358,7 @@ typedef struct { typedef struct { int (*tso_init)(struct sfw_test_instance *tsi); /* intialize test client */ void (*tso_fini)(struct sfw_test_instance *tsi); /* finalize test client */ - int (*tso_prep_rpc)(struct sfw_test_unit *tsu, + int (*tso_prep_rpc)(struct sfw_test_unit *tsu, lnet_process_id_t dest, srpc_client_rpc_t **rpc); /* prep a tests rpc */ void (*tso_done_rpc)(struct sfw_test_unit *tsu, @@ -367,7 +391,7 @@ typedef struct sfw_test_instance { } tsi_u; } sfw_test_instance_t; -/* XXX: trailing (CFS_PAGE_SIZE % sizeof(lnet_process_id_t)) bytes at +/* XXX: trailing (CFS_PAGE_SIZE % sizeof(lnet_process_id_t)) bytes at * the end of pages are not used */ #define SFW_MAX_CONCUR LST_MAX_CONCUR #define SFW_ID_PER_PAGE (CFS_PAGE_SIZE / sizeof(lnet_process_id_t)) @@ -404,7 +428,7 @@ void sfw_add_bulk_page(srpc_bulk_t *bk, cfs_page_t *pg, int i); int sfw_alloc_pages(srpc_server_rpc_t *rpc, int npages, int sink); srpc_client_rpc_t * -srpc_create_client_rpc(lnet_process_id_t peer, int service, +srpc_create_client_rpc(lnet_process_id_t peer, int service, int nbulkiov, int bulklen, void (*rpc_done)(srpc_client_rpc_t *), void (*rpc_fini)(srpc_client_rpc_t *), void *priv); @@ -492,12 +516,12 @@ srpc_init_client_rpc (srpc_client_rpc_t *rpc, lnet_process_id_t peer, return; } -static inline const char * +static inline const char * swi_state2str (int state) { #define STATE2STR(x) case x: return #x switch(state) { - default: + default: LBUG(); STATE2STR(SWI_STATE_NEWBORN); STATE2STR(SWI_STATE_REPLY_SUBMITTED); @@ -506,6 +530,7 @@ swi_state2str (int state) STATE2STR(SWI_STATE_REQUEST_SENT); STATE2STR(SWI_STATE_REPLY_RECEIVED); STATE2STR(SWI_STATE_BULK_STARTED); + STATE2STR(SWI_STATE_BULK_ERRORED); STATE2STR(SWI_STATE_DONE); } #undef STATE2STR diff --git a/lnet/selftest/workitem.c b/lnet/selftest/workitem.c index 19b0a79be1961fbfff77d641ab253f878293a57a..789da8d01e99a192f5fbdf3fd78a44fe10b92e5a 100644 --- a/lnet/selftest/workitem.c +++ 
b/lnet/selftest/workitem.c @@ -259,7 +259,7 @@ swi_check_events (void) q = &swi_data.wi_runq; else break; - + wi = list_entry(q->next, swi_workitem_t, wi_list); list_del_init(&wi->wi_list); diff --git a/lnet/ulnds/socklnd/usocklnd.h b/lnet/ulnds/socklnd/usocklnd.h index f2abf9d61e1442de5dcfdd821be2f4c8fc21b53f..f67a2578fa9800b44004024bef1b5db399290d62 100644 --- a/lnet/ulnds/socklnd/usocklnd.h +++ b/lnet/ulnds/socklnd/usocklnd.h @@ -8,7 +8,9 @@ * Lustre is a trademark of Cluster File Systems, Inc. * */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include <pthread.h> #include <poll.h> #include <lnet/lib-lnet.h> diff --git a/lnet/utils/debug.c b/lnet/utils/debug.c index 2a868dd6e1eca1faef3a0b78a386a300e37a396b..bb5760d28a6c7ae06d45c3df0cf55d3f8b14d91f 100644 --- a/lnet/utils/debug.c +++ b/lnet/utils/debug.c @@ -24,7 +24,9 @@ */ #define __USE_FILE_OFFSET64 -#define _GNU_SOURCE +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif #include <stdio.h> #ifdef HAVE_NETDB_H @@ -45,7 +47,6 @@ #include <sys/types.h> #include <sys/socket.h> -#include <sys/ioctl.h> #include <sys/stat.h> #include <sys/mman.h> #include <sys/utsname.h> @@ -73,7 +74,7 @@ static const char *libcfs_debug_subsystems[] = "pinger", "filter", "", "echo", "ldlm", "lov", "", "", "", "", "", "lmv", - "", "sec", "gss", "", + "", "sec", "gss", "", "mgc", "mgs", "fid", "fld", NULL}; static const char *libcfs_debug_masks[] = {"trace", "inode", "super", "ext2", @@ -373,7 +374,7 @@ static int add_rec(struct dbg_line *line, struct dbg_line ***linevp, int *lenp, *linevp = linev; *lenp = nlen; } - linev[used] = line; + linev[used] = line; return 1; } @@ -407,9 +408,8 @@ static int parse_buffer(FILE *in, FILE *out) if (rc <= 0) break; - if (hdr->ph_mask && - (!(subsystem_mask & hdr->ph_subsys) || - (!(debug_mask & hdr->ph_mask)))) { + if ((hdr->ph_subsys && !(subsystem_mask & hdr->ph_subsys)) || + (hdr->ph_mask && !(debug_mask & hdr->ph_mask))) { dropped++; continue; } @@ -441,10 +441,10 @@ static int 
parse_buffer(FILE *in, FILE *out) line->text = p; if (!add_rec(line, &linev, &linev_len, kept)) { - fprintf(stderr, "malloc failed; printing accumulated " + fprintf(stderr, "malloc failed; printing accumulated " "records and exiting.\n"); break; - } + } kept++; } @@ -483,7 +483,8 @@ int jt_dbg_debug_kernel(int argc, char **argv) if (argc > 1 && raw) strcpy(filename, argv[1]); else - sprintf(filename, "/tmp/lustre-log.%lu.%u",time(NULL),getpid()); + sprintf(filename, "/tmp/lustre-log."CFS_TIME_T".%u", + time(NULL),getpid()); if (stat(filename, &st) == 0 && S_ISREG(st.st_mode)) unlink(filename); diff --git a/lnet/utils/lstclient.c b/lnet/utils/lstclient.c index ec52175845dddbb8674ffca0c59ba6a8580cae9f..c9a70f86a761249744dc1cffe4323ddfcf885178 100644 --- a/lnet/utils/lstclient.c +++ b/lnet/utils/lstclient.c @@ -178,7 +178,7 @@ main(int argc, char **argv) if (server_mode_flag) lnet_server_mode(); - + rc = lnet_selftest_init(); if (rc != 0) { fprintf(stderr, "Can't startup selftest\n"); @@ -187,7 +187,7 @@ main(int argc, char **argv) return -1; } - + rc = lstjn_join_session(ses, grp); if (rc != 0) goto out; diff --git a/lustre-iokit/.cvsignore b/lustre-iokit/.cvsignore new file mode 100644 index 0000000000000000000000000000000000000000..39b55efd225d28cbca0ea1fdb85852ff4e13abec --- /dev/null +++ b/lustre-iokit/.cvsignore @@ -0,0 +1,17 @@ +Kernelenv +Makefile +Makefile.in +aclocal.m4 +autom4te.cache +config.log +config.status +configure +.*.cmd +.depend +COPYING +INSTALL +lustre-iokit.spec +install-sh +missing +mkinstalldirs +lustre-iokit*.tar.gz diff --git a/lustre-iokit/AUTHORS b/lustre-iokit/AUTHORS new file mode 100644 index 0000000000000000000000000000000000000000..f0f587a37178336cd85873229e31b6ad81284cf5 --- /dev/null +++ b/lustre-iokit/AUTHORS @@ -0,0 +1,5 @@ +Phil Schwan +Eric Barton <eeb@clusterfs.com> +Jody McIntyre <scjody@clusterfs.com> +Michael MacDonald <mjmac@clusterfs.com> +Cliff White <cliffw@clusterfs.com> diff --git a/lustre-iokit/ChangeLog 
b/lustre-iokit/ChangeLog new file mode 100644 index 0000000000000000000000000000000000000000..69e48fbdbe62d4954751325902c2bf46f0bbdad1 --- /dev/null +++ b/lustre-iokit/ChangeLog @@ -0,0 +1,22 @@ +2006-10-31 - Borrowing mjmac's most excellent work and setting up autogen/RPM +2007-07-24 - + Bugs committed: + 10851 - minor obdfilter fixes + 11485 - sgpdd_survey uses readcap for block size + 12741 - better error checking for sgpdd_survey +2007-07-26 Jitendra Pawar <jitendra@clusterfs.com> + Fixes for bug: 11171 + - Added common library "libobd" + - Improved obdfilter-survey +2007-08-01 Jitendra Pawar <jitendra@clusterfs.com> + Fixes for bug: 10961 + - Modified README.obdfilter-survey +2007-08-22 Jitendra Pawar <jitendra@clusterfs.com> + Additions for bug: 11171 + - Added automatic obdecho module load support on client and server + - Added flexible OSS setup and cleanup +2007-09-21 Jitendra Pawar <jitendra@clusterfs.com> + Additions for bug: 11171, 10961 + - removed OSTS and ECHO_CLIENTS command line options, instead + used only 'targets' option for all three cases. + - modified README-obdfilter-survey as per changes in 11171.
diff --git a/lustre-iokit/Makefile.am b/lustre-iokit/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..a654ff6aef18b79dea55126cd5b7cdec8db4926f --- /dev/null +++ b/lustre-iokit/Makefile.am @@ -0,0 +1,6 @@ +SUBDIRS = obdfilter-survey sgpdd-survey ost-survey ior-survey stats-collect + +EXTRA_DIST = lustre-iokit.spec + +rpms rpm: dist + rpmbuild -ta $(PACKAGE)-$(VERSION).tar.gz diff --git a/lustre-iokit/NEWS b/lustre-iokit/NEWS new file mode 100644 index 0000000000000000000000000000000000000000..6874d57ead8be478b5c8bfd9ea8819b01357453c --- /dev/null +++ b/lustre-iokit/NEWS @@ -0,0 +1 @@ +2006-10-31 - Moved to autoconf diff --git a/lustre-iokit/README b/lustre-iokit/README new file mode 100644 index 0000000000000000000000000000000000000000..3a643885b2b1459c5b1dc80896eade3330daf0b1 --- /dev/null +++ b/lustre-iokit/README @@ -0,0 +1,26 @@ +This bundle includes four tools: +In order of preference: + +sgpdd-survey: - a test of the 'bare metal' performance, bypassing +as much of the kernel as we can. Does not require Lustre, does +require the sgp_dd package. WILL ERASE ALL DATA ON DEVICE. + +obdfilter-survey: +Shell script - tests performance of isolated OSTS, network +via echo clients, end-to-end test. + +obdsurvey: - a test of Lustre performance with three modes: +Maintained by Scali, included here as an extra. +Requires Python > 2.2 +Requires Lustre + +- local disk test - requires one OST +- network performance test - requires two Lustre machines +- network + disk test - requires Lustre filesystem and client + +ior-survey: +A script to run the IOR benchmark. Version 2.8.6 of IOR is included + +ost-survey: +This is OST performance survey, designed to test the client-to-disk +performance of the individual OSTs in a Lustre filesystem. 
diff --git a/lustre-iokit/autogen.sh b/lustre-iokit/autogen.sh new file mode 100644 index 0000000000000000000000000000000000000000..37f4a1551dcb5f5655d43489c93b824ff54d7489 --- /dev/null +++ b/lustre-iokit/autogen.sh @@ -0,0 +1,21 @@ +#!/bin/sh + +fatal() { + local msg="$1" + + echo "FATAL: $msg" + exit 1 +} + +run_cmd() { + local cmd="$1" + + echo "Running $cmd..." + $cmd || fatal "$cmd failed!" +} + +run_cmd aclocal +run_cmd "automake -a -c" +run_cmd autoconf + +echo "Finished. Ready for ./configure ..." diff --git a/lustre-iokit/configure.ac b/lustre-iokit/configure.ac new file mode 100644 index 0000000000000000000000000000000000000000..49b6fc428cb510f40c649cb4105652961fc3e767 --- /dev/null +++ b/lustre-iokit/configure.ac @@ -0,0 +1,16 @@ +AC_INIT +AM_INIT_AUTOMAKE(lustre-iokit,1.2) +AC_PATH_PROGS(BASH, bash) +AC_PATH_PROGS(PERL, perl) +RELEASE="`date +%Y%m%d%H%M`" +AC_SUBST(RELEASE) + +AC_OUTPUT( +lustre-iokit.spec +Makefile +sgpdd-survey/Makefile +obdfilter-survey/Makefile +ost-survey/Makefile +ior-survey/Makefile +stats-collect/Makefile +) diff --git a/lustre-iokit/ior-survey/.cvsignore b/lustre-iokit/ior-survey/.cvsignore new file mode 100644 index 0000000000000000000000000000000000000000..282522db0342d8750454b3dc162493b5fc709cc8 --- /dev/null +++ b/lustre-iokit/ior-survey/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre-iokit/ior-survey/Makefile.am b/lustre-iokit/ior-survey/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..537162b2171322acc20e04c8376a069cbc64466d --- /dev/null +++ b/lustre-iokit/ior-survey/Makefile.am @@ -0,0 +1,3 @@ +bin_SCRIPTS = ior-survey parse-ior +CLEANFILE = $(bin_SCRIPTS) +EXTRA_DIST = README.ior-survey ior-survey parse-ior diff --git a/lustre-iokit/ior-survey/README.ior-survey b/lustre-iokit/ior-survey/README.ior-survey new file mode 100644 index 0000000000000000000000000000000000000000..8d734da7f29f0fbb22f2e177b8b75e26d9526579 --- /dev/null +++ 
b/lustre-iokit/ior-survey/README.ior-survey @@ -0,0 +1,190 @@ +Introduction : + + The ior_survey script can be used to test the performance of the lustre +file systems. It uses IOR (Interleaved Or Random), a script used for testing +performance of parallel file systems using various interfaces and access +patterns. IOR uses MPI for process synchronization. + +General Description: + + ior_mpiio is a parallel file system test developed by the SIOP (Scalable +I/O Project) at LLNL. This parallel program performs parallel writes and +reads to/from a file using MPI-IO and reports the throughput rates. + + MPI is used for process synchronization. Under the control of compile-time +defined constants (and, to a lesser extent, environment variables), I/O is done +via MPI-IO. The data are written and read using independent parallel transfers +of equal-sized blocks of contiguous bytes that cover the file with no gaps and +that do not overlap each other. The test consists of creating a new file, +writing it with data, then reading the data back. + + The data written are C integers. If the program runs successfully to +completion, it returns 0. If a problem is detected with any I/O routine, the +program exits with a value of IO_ERR. + + If a non-I/O problem is detected, the program exits with a value of +INTERNAL_ERR (this can be caused by a bug in the test program, or a problem in +MPI, or by inconsistencies in the environment variable settings). + +Requirements : + To run the ior_survey script, the following items are required.
+ +1: IOR + + The IOR test should be obtained at + ftp://ftp.llnl.gov/pub/siop/ior/ + +2: pdsh + The tarball can be obtained from + http://sourceforge.net/project/showfiles.php?group_id=33530&package_id=183641 + +3: pdsh-rcmd-ssh module + The rpm for this could be found at + http://sourceforge.net/project/showfiles.php?group_id=33530&package_id=183641 + +4: lam/mpi + The tarball can be obtained from + http://www.lam-mpi.org/7.1/download.php + +5: You need to be a non-root user to execute the script and should have the + super-user privileges. + +6: The user should have login on all the nodes without password on which the + test is going to be run. + + + +To make an entry into the sudoers file : + +1: Become super user (root) + +2: type visudo + +3: make an entry as + username ALL=(ALL) NOPASSWD: ALL //(username is the name of the user) + + +Building IOR : + + Type 'gmake mpiio' from the IOR/ directory. In + IOR/src/C, the file Makefile.config currently has settings for AIX, Linux, + OSF1 (TRU64), and IRIX64 to model on. Note that MPI must be present for + building/running IOR, and that MPI I/O must be available for MPI I/O, HDF5, + and Parallel netCDF builds. As well, HDF5 and Parallel netCDF libraries are + necessary for those builds. All IOR builds include the POSIX interface. 
+ + Copy the IOR binary file in IOR/src/C/ to /usr/local/sbin/ using + + sudo cp IOR/src/C/IOR /usr/local/sbin/ + + + +Installing pdsh and pdsh-rcmd-ssh module : + +1: Download the pdsh tarball + +2: untar it using tar -xzvf (if tar.gz) or tar -xjvf(if tar.bz2) + +3: go to the pdsh directory and type ./bootstrap + +4: configure it using the following command + + ./configure --with-ssh + +5: Build it using "make" + +6: Install it using "sudo make install" + +7: Download the pdsh-rcmd-ssh rpm + +8: Install the rpm using "rpm -ivh pdsh-rcmd-ssh*" + + +Installing lam/mpi : + +1: Download the lam tarball + +2: untar it using tar -xzvf (if tar.gz) or tar -xjvf(if tar.bz2) + +3: go to the lam directory and type ./configure + +4: Build it using "make" + +5: Install it using "sudo make install" + + The lam, IOR, pdsh should be installed on all the nodes on which the + test is going to be run. + +Note: Please make sure that you have installed the same version of lam on all +the nodes on which the test is going to be run. + + + +Running the ior_survey script : + +1: Lustre should be mounted at /mnt/lustre. Do + "touch /mnt/lustre/ior_survey_testfile" + +2: Make a hostfile in which the ip addresses of all the nodes are present on + the node from where the script is going to be executed. + +3: run the lam using "lamboot -v -d hostfile". This will start lamd on all the + nodes. + +4: run the ior_survey script using "./ior_survey" + +Note: + The node names of the clients should be like rhea1, rhea2, rhea3, so on. + The name of the cluster (1st part of the node name) should be set in the + ior_survey script in the cluster name field. + e.g. cluster=rhea //name of the cluster + + The client node numbers should be set as last part of the node name i.e. + numeral part. + e.g. client=(1) //to run test on one node only node1. + client=(1-2) //to run test on two nodes node1, node2. 
+ + Please note that the hostfile should contain the ip addresses of only + those nodes on which the lustre filesystem is mounted i.e. clients are + mounted. + + The details of the test can be found on the node from where the + test was run as /tmp/ior_survey_run_date@start_time_nodename.detail + + The output of the IOR looks like + +host1: access bw(MiB/s) block(KiB) xfer(KiB) open(s) wr/rd(s) close(s) iter +host1: ------ --------- ---------- --------- -------- -------- -------- ---- +host1: write 1.58 2097152 1024.00 0.000873 1299.37 0.000132 0 +host1: +host1: Max Write: 1.58 MiB/sec (1.65 MB/sec) + + where, + host1 : node on which the test is run + access: the test which is run (write, rewrite, read, reread) + bw : bandwidth + block : total size to be written + xfer : block size to transfer (here 1MB) + open : time taken for open + close : time taken for close + wr/rd : time taken for read/write + iteration : iteration no. + Max write : Max_write speed obtained + +Note : MB is defined as 1,000,000 bytes and MiB is 1,048,576 bytes. + + The summary of the test can be found on the node from where the + test was run as /tmp/ior_survey_run_date@start_time_nodename.summary + It contains the tests run and the status of those tests. + + +Instructions for graphing IOR results + + The parse-ior script will plot the results from the .detail file + generated by ior-survey. It writes the write and read results to a + single data file, /tmp/ior_survey_run_date@start_time_nodename.detail.dat, + and the gnuplot instructions to the file + /tmp/ior_survey_run_date@start_time_nodename.detail.scr. 
+ + $ perl parse-ior /tmp/ior_survey_run_date@start_time_nodename.detail + diff --git a/lustre-iokit/ior-survey/ior-survey b/lustre-iokit/ior-survey/ior-survey new file mode 100644 index 0000000000000000000000000000000000000000..a2d67245600ce5563d46d43488c4c9b2ac5422d8 --- /dev/null +++ b/lustre-iokit/ior-survey/ior-survey @@ -0,0 +1,354 @@ +#!/bin/bash + +# the value of these can be set in the environment. +# This test assumes a typical pdsh naming scheme, where +# node names can be expressed as a string +# followed by a number +# cluster name (all node names are this followed by the node number) +cluster=${cluster:-""} +if [ -z "$cluster" ];then echo "cluster not defined"; exit 1; fi + +# client node numbers (individual numbers or inclusive ranges) +clients=${clients:-""} +if [ -z "$clients" ]; then echo "clients not defined"; exit 1; fi + +# numbers of clients to survey +clients_lo=${clients_lo:-1} +clients_hi=${clients_hi:-3} +clients_iterator=${clients_iterator:-"+=1"} + +# numbers of tasks per client to survey (the misspelled task_per_client_lo is still honoured as a fallback) +tasks_per_client_lo=${tasks_per_client_lo:-${task_per_client_lo:-1}} +tasks_per_client_hi=${tasks_per_client_hi:-8} +tasks_per_client_iterator=${tasks_per_client_iterator:-"*=2"} + +# record sizes to survey +rsize_lo=${rsize_lo:-1M} +rsize_hi=${rsize_hi:-1M} +rsize_iterator=${rsize_iterator:-"*=2"} + +## which tests to run (first must be write) +# clear_cache) not really a test; just uncache everything +# *write*) write +# *) read +tests=(write rewrite clear_cache read reread) + +# total # bytes written/read by any client node +min_per_client_size=${min_per_client_size:-"1G"} +min_total_size=${min_total_size:-"2G"} + +# should each task do I/O to its own file? 
+file_per_task=${file_per_task:-1} + +# the binaries +IOR=${IOR:-"/usr/local/sbin/IOR"} +llmount=${llmount:-"llmount"} +# Select mpirun, yod, or pdsh +fanout_cmd=${fanout_cmd:-"pdsh"} +# mpirun still uses pdsh for cleanup +pdsh=${pdsh:-"pdsh"} +pdsh_args="-R ssh -S -b -w " + +# the result file prefix (date/time + hostname makes unique) +rslt_loc=${rslt_loc:-"/tmp"} +rslt=${rslt:-"$rslt_loc/ior_survey_`date +%F@%R`_`uname -n`"} + +# where lustre is mounted on the clients +lustre=${lustre:-"/mnt/lustre"} + +# basename of the test file(s) +testfile=${testfile:-"${lustre}/ior/ior_survey_testfile"} + +#don't spin for MPI completions +export LIBELAN_WAITTYPE=0 + +################################################################################ +# dont change stuff below here unless you know what you're doing... + +# Run a command via $fanout_cmd (pdsh, mpirun or yod): $1 = client node list, $2 = file the output is appended to, rest = command; prints the exit status (255 for an unknown $fanout_cmd). +fanout() { + local clients=$1; shift + local tmpfile=$1; shift + local result + case $fanout_cmd in + 'pdsh') + $fanout_cmd $pdsh_args "$clients" "$@" >> $tmpfile 2>&1 + echo $? + return + ;; + 'mpirun') + # horrible misuse of globals + $fanout_cmd -np $((ntask*nclnt)) "$@" >> $tmpfile 2>&1 + echo $? + return + ;; + 'yod') + # and another + $fanout_cmd -np $((ntask*nclnt)) "$@" >> $tmpfile 2>&1 + echo $? + return + ;; + + *) + echo "255" + return + ;; + esac +} + +dump_cache() { + # we are assuming mpi users will also have pdsh; $1 = client node list, $2 = file the output is appended to + local clients=$1;shift + local tmpfile=$1;shift + clear_cache='for LRU in /proc/fs/lustre/ldlm/namespaces/*/lru_size; + do sudo /bin/bash -c "echo clear > $LRU"; done' + echo "=> $clear_cache" >> $tmpfile + $pdsh $pdsh_args "$clients" "$clear_cache" >> $tmpfile 2>&1 + status=$? 
+ echo "Completion Status: $status" >> $tmpfile + + if ((status)); then + echo "ERROR" + else + echo "OK" + fi +} +count_range() { + echo $1 | awk '{ nvals=split($1, vals, "-");\ + if (nvals == 1) print 1;\ + else if (nvals == 2) printf "%d\n", vals[2] - vals[1] + 1;}' +} + +base_range() { + echo $1 | awk '{ split($1, vals, "-"); print vals[1]; }' +} + +idx2nodenum() { + local n=$1; shift + while ((1)); do + local range=$1; shift + if [ -z "$range" ]; then + return + fi + chunk=`count_range $range` + if ((chunk > $n)); then + base=`base_range $range` + echo $((base + n)) + return + fi + n=$((n-chunk)) + done +} + +n2noderange() { + local n=$1; shift + sep="" + nodes="[" + while ((n > 0)); do + local range=$1; shift + if [ -z "$range" ]; then + return + fi + local base=`base_range $range` + local chunk=`count_range $range` + if ((chunk > $n)); then chunk=$n; fi + local nodes="${nodes}${sep}${base}"; sep="," + if ((chunk > 1)); then nodes="${nodes}-$((base+chunk-1))"; fi + n=$((n-chunk)) + done + echo "${nodes}]" +} + +countnodes() { + local radix=16384 + local n=0 + while ((radix > 0)); do + local nodes=`n2noderange $((n+radix)) $@` + if [ -n "$nodes" ]; then + n=$((n+radix)) + fi + radix=$((radix/2)) + done + echo $n +} + +parse_number() { + local str=$1 + case $str in + *G|*g) n=`echo $str | sed 's/[gG]//'`; echo $((n*1024*1024*1024));; + *M|*m) n=`echo $str | sed 's/[Mm]//'`; echo $((n*1024*1024));; + *K|*k) n=`echo $str | sed 's/[Kk]//'`; echo $((n*1024));; + *) echo $1;; + esac +} + +pp_number() { + local n=$1 + local G=$((1024*1024*1024)) + local M=$((1024*1024)) + local K=$((1024)) + if ((n%G == 0 && n >= $G)); then + echo "$((n/G))G" + elif ((n%M == 0 && n >= $M)); then + echo "$((n/M))M" + elif ((n%K == 0 && n >= $K)); then + echo "$((n/K))K" + else + echo $n + fi +} + +if [ ${#tests[@]} -eq 0 -o "${tests[0]}" != "write" ]; then + echo "First test must be 'write'" 1>&2 + exit 1 +fi + +rsltf="${rslt}.summary" +workf="${rslt}.detail" +echo -n > $rsltf +echo 
-n > $workf + +print_summary () { + if [ "$1" = "-n" ]; then + minusn=$1; shift + else + minusn="" + fi + echo $minusn "$*" >> $rsltf + echo $minusn "$*" +} + +check_mount() { + local lustre=$1; shift + local tmpb=$1; shift + local clients=$1; shift + local tmpfile=${tmpb}_tmp + # check lustre is mounted everywhere it's needed + cmd="grep $lustre /proc/mounts" + $pdsh $pdsh_args "$clients" "$cmd" >> $tmpfile + status=$? + if (($status)); then + print_summary "Lustre NOT mounted on $lustre somewhere" + exit 1 + fi + +} +# convert params to actual numbers +min_per_client_size=`parse_number $min_per_client_size` +min_total_size=`parse_number $min_total_size` + +rsize_lo=`parse_number $rsize_lo` +rsize_hi=`parse_number $rsize_hi` + +# check on actual numbers of client nodes +nclients=`countnodes ${clients[@]}` +if ((clients_hi > $nclients)); then clients_hi=$nclients; fi + +cur_date=`date` +machine=`uname -n` +script_name=`echo $0 | cut -d "/" -f2` + +echo "$cur_date $script_name on $lustre from $machine" >> $workf + +for ((rsize=$rsize_lo; rsize<=$rsize_hi; rsize$rsize_iterator)); do + pp_rsize=`pp_number $rsize` + + for ((nclnt=$clients_lo; nclnt<=$clients_hi; nclnt$clients_iterator)); do + test_clients="${cluster}`n2noderange $nclnt ${clients[@]}`" + echo $test_clients + if [ "$fanout_cmd" = "pdsh" ] || [ "$fanout_cmd" = "mpirun" ];then + check_mount $lustre $workf $test_clients + fi + per_client_size=$((min_total_size/nclnt)) + if ((per_client_size < $min_per_client_size)); then + per_client_size=$min_per_client_size + fi + + for ((ntask=$tasks_per_client_lo; ntask <= $tasks_per_client_hi; \ + ntask$tasks_per_client_iterator)); do + per_task_size=$((per_client_size/ntask)) + if ((per_task_size%rsize != 0)); then + per_task_size=$(((per_task_size/rsize + 1)*rsize)) + fi + total_size=`pp_number $((per_task_size*nclnt*ntask))` + + hdrstr=`printf "Total: %5sB rsize: %4sB clients: %4d tasks: %3d: " \ + $total_size $pp_rsize $nclnt $ntask` + print_summary -n "$hdrstr" + 
+ for ((test_idx=0; test_idx < ${#tests[@]}; test_idx++)); do + test=${tests[$test_idx]} + + print_summary -n "$test " + echo "===========> ${hdrstr} on $test_clients doing $test" >> $workf + tmpf=${workf}_tmp + echo -n > $tmpf + + if [ "$test" = "clear_cache" ]; then + if [ "$fanout_cmd" = "pdsh" ] || [ "$fanout_cmd" = "mpirun" ]; then + result=`dump_cache $test_clients $tmpf` + else + echo "Haven't figured out how to clear cache" >> $tmpf + result="N/A" + fi + else + + cmdline=( + $IOR # the command + -o${testfile} # test file prefix + -b${per_task_size} # bytes per task + -t${rsize} # record size + -e # fsync before close + -q # quit on error + ) + + idx=${#cmdline[@]} + + # keep the test file(s) unless this is the last test + #((test_idx < ${#tests[@]}-1)) && cmdline[$((idx++))]="-k" + cmdline[$((idx++))]="-k" + + # use the existing test file(s) unless this is the first test + ((test_idx > 0)) && cmdline[$((idx++))]="-E" + + # file-per-task + (($file_per_task)) && cmdline[$((idx++))]="-F" + + case "$test" in + *write*) cmdline[$((idx++))]="-w" + awkstr="Max Write";; + *) cmdline[$((idx++))]="-r" + awkstr="Max Read";; + esac + + echo "=> ${cmdline[@]}" >> $tmpf + + status=`fanout $test_clients $tmpf ${cmdline[@]}` + + echo "Completion Status: $status" >> $tmpf + + if (($status)); then + result="ERROR" + else + # pdsh adds an extra field + if [ "$fanout_cmd" = "pdsh" ]; then + result=`awk < $tmpf "/$awkstr/ {print $ 4; found=1; exit}\ + END {if (!found) print \"ERROR\"}"` + else + result=`awk < $tmpf "/$awkstr/ {print $ 3; found=1; exit}\ + END {if (!found) print \"ERROR\"}"` + fi + fi + fi + + cat $tmpf >> $workf + rm $tmpf + + str=`printf "%8s" "$result"` + print_summary -n "$str " + done + print_summary "" + done + done +done + diff --git a/lustre-iokit/ior-survey/parse-ior b/lustre-iokit/ior-survey/parse-ior new file mode 100644 index 0000000000000000000000000000000000000000..e751503b09f48ec5188c74b0e21dac5be4e754aa --- /dev/null +++ 
b/lustre-iokit/ior-survey/parse-ior @@ -0,0 +1,56 @@ +#!/usr/bin/perl -w + +# arg 0 is the filename of the IOR results log to parse + +sub usages_msg(){ + print "Usage: $0 <results_filename>\n"; + print " parses and plots IOR results using gnuplot, and generates a .dat file for\n"; + print " simple graphing in spreadhseets\n"; + print "e.g.> perl parse-ior.pl ior-log\n"; + exit 1; +} + +if ( !$ARGV[0] ) { + usages_msg(); +} +$file = $ARGV[0]; + +# Open log file for reading +open ( PFILE, "$file") or die "Can't open results log file"; +# Open the .dat file for writing the required columns from the log file. +open ( DATAFILE, "> $file.dat" ) or die "Can't open csv file for writting"; +$count = 0; +while ( <PFILE> ) { + chomp; + @line = split( /\s+/ ); # splits line into tokens + if ( $line[0] ) { + # This comparison must be updated if the log file format changes. + if( $line[0] eq "access" && $line[1] eq "bw(MiB/s)" ) { + print DATAFILE "$count $line[1] $line[4] $line[5] $line[6] br(MiB/s) ropen(s) rd(s) rclose(s)\n"; + $count = $count + 1; + } + # Two columns from the output file are skipped since + # they are constant and may not be so useful while graphing results. + if( $line[0] eq "write" ) { + print DATAFILE "$count $line[1] $line[4] $line[5] $line[6] "; + } + if( $line[0] eq "read" ) { + print DATAFILE "$line[1] $line[4] $line[5] $line[6]\n"; + $count = $count + 1; + } + } +} +close PFILE; +close DATAFILE; + +# Open .scr file for writing instructions for gnuplot. +open ( SCRFILE, "> $file.scr" ) or die "Can't open scr file for writting"; +# Only two columns bw(MiB/s) and br(MiB/s) are considered for graphing results. +print SCRFILE "plot \"$file.dat\" using 1:2 axes x1y1 title \"bw(MiB/s)\" with line\n"; +print SCRFILE "replot \"$file.dat\" using 1:6 axes x1y1 title \"br(MiB/s)\" with line\n"; +print SCRFILE "pause -1\n"; +close SCRFILE; +# check whether gnuplot exists? 
+system ("which gnuplot > /dev/null") == 0 or die "gnuplot does not exists, Please install it and try again.\n"; +# invoke gnuplot to display graph. +system ("gnuplot $file.scr"); diff --git a/lustre-iokit/lustre-iokit.spec.in b/lustre-iokit/lustre-iokit.spec.in new file mode 100644 index 0000000000000000000000000000000000000000..f7116840090d402a1c2caf322863c2f021869179 --- /dev/null +++ b/lustre-iokit/lustre-iokit.spec.in @@ -0,0 +1,78 @@ +# lustre-iokit.spec +%define name @PACKAGE@ +%define version @VERSION@ +%define release @RELEASE@ + +Summary: The Lustre IO-Kit is a collection of benchmark tools for a cluster with the Lustre file system. +Name: %{name} +Version: %{version} +Release: %{release} +License: GPL +Group: Applications/System +Source: %{name}-%{version}.tar.gz +URL: http://clusterfs.com/ +BuildRoot: /var/tmp/%{name}-%{version}-root +Provides: %{name} = %{version} +BuildArch: noarch +Requires: python > 2.2, sg3_utils + +%description +This package includes five tools: +sgpdd-survey: +A test of the 'bare metal' performance, bypassing as much of the kernel as we can. Uses the sgp_dd utility. + +obdfilter-survey +This survey can be run in 3 modes to test disk I/O including the filesystem, +network I/O, and disk I/O via the network. The script does sequential I/O +with varying numbers of threads and objects (files) by using lctl::test_brw +to drive the echo_client connected to local or remote obdfilter instances, +or remote obdecho instances. + +ost-survey +This survey tests the client-to-disk performance of individual OSTs, and +ranks then for comparison. + +stats-collect +This script will collect IO stats on a defined set of nodes. + +ior-survey: +A script to run the IOR benchmark. 
The latest version can be downloaded from http://www.llnl.gov/asci/purple/benchmarks/limited/ior/ + +%prep +%setup -qn %{name}-%{version} + +%build +rm -fr $RPM_BUILD_ROOT +./configure --prefix=/usr +make + +%install +make install DESTDIR=$RPM_BUILD_ROOT + +%files +/usr/bin/ior-survey +/usr/bin/parse-ior +/usr/bin/libecho +/usr/bin/obdfilter-survey +/usr/bin/plot-obdfilter +/usr/bin/plot-ost +/usr/bin/ost-survey +/usr/bin/sgpdd-survey +/usr/bin/plot-sgpdd +/usr/bin/lstats.sh +/usr/bin/gather_stats_everywhere.sh +/usr/bin/config.sh +%doc obdfilter-survey/README.obdfilter-survey +%doc ior-survey/README.ior-survey +%doc ost-survey/README.ost-survey +%doc sgpdd-survey/README.sgpdd-survey +%doc stats-collect/README.lstats.sh + + +%changelog +* Tue Jul 24 2007 Cliff White +- Added stats-collect +* Mon Apr 9 2007 Cliff White +- Merged with existing, changed to .in format. +* Thu Oct 4 2006 Kalpak Shah +- Created the spec file. diff --git a/lustre-iokit/obdfilter-survey/.cvsignore b/lustre-iokit/obdfilter-survey/.cvsignore new file mode 100644 index 0000000000000000000000000000000000000000..282522db0342d8750454b3dc162493b5fc709cc8 --- /dev/null +++ b/lustre-iokit/obdfilter-survey/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre-iokit/obdfilter-survey/Makefile.am b/lustre-iokit/obdfilter-survey/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..8b1846468e5427a3ebbe2ba44345a9610c1a8251 --- /dev/null +++ b/lustre-iokit/obdfilter-survey/Makefile.am @@ -0,0 +1,3 @@ +bin_SCRIPTS = obdfilter-survey libecho plot-obdfilter +CLEANFILE = $(bin_SCRIPTS) +EXTRA_DIST = README.obdfilter-survey obdfilter-survey libecho plot-obdfilter diff --git a/lustre-iokit/obdfilter-survey/README.obdfilter-survey b/lustre-iokit/obdfilter-survey/README.obdfilter-survey new file mode 100644 index 0000000000000000000000000000000000000000..e483a6a7e97bf2b596be8c8dd4e34df5617703d7 --- /dev/null +++ b/lustre-iokit/obdfilter-survey/README.obdfilter-survey @@ 
-0,0 +1,198 @@ +Overview +-------- + +This survey script does sequential I/O with varying numbers of threads and +objects (files) by using lctl to drive the echo_client connected +to local or remote obdfilter instances, or remote obdecho instances. + +It can be used to characterise the performance of the following lustre +components. + +1. The Object Storage Targets. + + Here the script directly exercises one or more instances of obdfilter. + They may be running on 1 or more nodes, e.g. when they are all attached + to the same multi-ported disk subsystem. + + You need to tell the script all the names of the obdfilter instances. + These should be up and running already . If some are on different + nodes, you need to specify their hostnames too (e.g. node1:ost1). + --OR-- + You just need to pass parameter case=disk to the script. The script will + automatically detect the local obdfilter instances. + + All the obdfilter instances are driven directly. The script + automatically loads the obdecho module if required and creates one + instance of echo_client for each obdfilter instance. + +2. The Network. + + Here the script drives one or more instances of obdecho server via instances + of echo_client running on 1 or more nodes. + + You just need to pass parameters case=network and + targets="<hostname/ip_of_server>" to the script. The script will do the + required setup for network case. + +3. The Stripe F/S over the Network. + + Here the script drives one or more instances of obdfilter via instances + of echo_client running on 1 or more nodes. + + You need to tell the script all the names of the OSC's, which should be + up and running. + --OR-- + You just need to pass parameter case=netdisk to the script. The script will + use all of the local OSCs. + +Note that the script is _NOT_ scalable to 100s of nodes since it is only +intended to measure individual servers, not the scalability of the system +as a whole. 
+ +Running +------- + +The script must be customised according to the components under test and +where it should keep its working files. Customization variables are +described in the Customization variables section of the script. +Please see the maximum supported value ranges for customization variables +in the script. + +To run against a local disk: +--------------------------- +- Create a Lustre configuration using your normal methods + +1. Automated run: +Setup the Lustre filesystem with required OST's. Make sure that obdecho.ko +module is present. Then invoke the obdfilter-survey script with parameter +case=disk. +e.g. : $ nobjhi=2 thrhi=2 size=1024 case=disk sh obdfilter-survey + +--OR-- + +2. Manual run: +- You do not need to specify an MDS or LOV +- List all OSTs that you wish to test +- On all OSS machines: + Remember, write tests are destructive! This test should be run prior to +startup of your actual Lustre filesystem. If that is the case, you will not +need to reformat to restart Lustre - however, if the test is terminated before +completion, you may have to remove objects from the disk. + +- Determine the obdfilter instance names on all the clients, column 4 +of 'lctl dl'. For example: + +# pdsh -w oss[01-02] lctl dl |grep obdfilter |sort +oss01: 0 UP obdfilter oss01-sdb oss01-sdb_UUID 3 +oss01: 2 UP obdfilter oss01-sdd oss01-sdd_UUID 3 +oss02: 0 UP obdfilter oss02-sdi oss02-sdi_UUID 3 +... + +Here the obdfilter instance names are oss01-sdb, oss01-sdd, oss02-sdi. + +Since you are driving obdfilter instances directly, set the shell array +variable 'targets' to the names of the obdfilter instances. + +Example: + +targets='oss01:oss01-sdb oss01:oss01-sdd oss02:oss02-sdi' \ + ./obdfilter-survey + +To run against a network: +------------------------ +For the second case i.e. obdfilter-survey over network, the following setup +is to be done. +- Install all lustre modules including obdecho. +- Start lctl and check for the device list. 
The device list must be empty. +- It is suggested that there should be passwordless login between the client + and server machines to avoid typing passwords. +1. Automated run: + To run obdfilter-survey against network you just need to pass parameter + case=network and targets="<hostname/ip_of_server>" to the script. + +e.g. $ nobjhi=2 thrhi=2 size=1024 targets="<hostname/ip_of_server>" \ + case=network sh obdfilter-survey + +On server side you can see the stats at : + /proc/fs/lustre/obdecho/<echo_srv>/stats +where, 'echo_srv' is the obdecho server created through script. + +NOTE: In network test only automated run is supported. + +To run against network-disk: +---------------------------- +- Create a Lustre configuration using your normal methods + +1. Automated run: +Setup the lustre with required OST's. Make sure that obdecho.ko module is +present. Then invoke the obdfilter-survey script with parameter case=netdisk. +e.g. : $ nobjhi=2 thrhi=2 size=1024 case=netdisk sh obdfilter-survey + +2. Manual run: +While running manually you need to tell the script all the names of the +echo_client instances, which should already be up and running. +e.g. $ nobjhi=2 thrhi=2 size=1024 targets="<osc_name> ..." \ + sh obdfilter-survey + + +Output files: +------------- + +When the script runs, it creates a number of working files and a pair of +result files. All files start with the prefix given by ${rslt}. + +${rslt}.summary same as stdout +${rslt}.script_* per-host test script files +${rslt}.detail_tmp* per-ost result files +${rslt}.detail collected result files for post-mortem + +The script iterates over the given numbers of threads and objects +performing all the specified tests and checking that all test processes +completed successfully. + +Note that the script may not clean up properly if it is aborted or if it +encounters an unrecoverable error. 
In this case, manual cleanup may be +required, possibly including killing any running instances of 'lctl' (local +or remote), removing echo_client instances created by the script and +unloading obdecho. + + +Script output +------------- + +The summary file and stdout contain lines like... + +ost 8 sz 67108864K rsz 1024 obj 8 thr 8 write 613.54 [ 64.00, 82.00] + +ost 8 is the total number of OSTs under test. +sz 67108864K is the total amount of data read or written (in KB). +rsz 1024 is the record size (size of each echo_client I/O, in KB). +obj 8 is the total number of objects over all OSTs +thr 8 is the total number of threads over all OSTs and objects +write is the test name. If more tests have been specified they + all appear on the same line. +613.54 is the aggregate bandwidth over all OSTs measured by + dividing the total number of MB by the elapsed time. +[64.00, 82.00] are the minimum and maximum instantaneous bandwidths seen on + any individual OST. + +Note that although the numbers of threads and objects are specified per-OST +in the customization section of the script, results are reported aggregated +over all OSTs. + + +Visualising Results +------------------- + +I've found it most useful to import the summary data (it's fixed width) +into gnuplot, Excel (or any graphing package) and graph bandwidth v. +# threads for varying numbers of concurrent regions. This shows how +the OSS performs for a given number of concurrently accessed objects +(i.e. files) with varying numbers of I/Os in flight. + +It is also extremely useful to record average disk I/O sizes during each +test. These numbers help find pathologies in the file system block +allocator and the block device elevator. + +The included plot-obdfilter script is an example of processing the output +files to a .csv format and plotting a graph using gnuplot. 
diff --git a/lustre-iokit/obdfilter-survey/libecho b/lustre-iokit/obdfilter-survey/libecho new file mode 100644 index 0000000000000000000000000000000000000000..860d547891a285b7aa5924a260c1217378284ea2 --- /dev/null +++ b/lustre-iokit/obdfilter-survey/libecho @@ -0,0 +1,424 @@ +#!/bin/bash +#* Copyright (C) 2002 Cluster File Systems, Inc. +#* Author: Jitendra Pawar <jitendra@clusterfs.com> +#* +#* Lustre-iokit is free software; you can redistribute it and/or +#* modify it under the terms of version 2 of the GNU General Public +#* License as published by the Free Software Foundation. +#* +#* Lustre-iokit is distributed in the hope that it will be useful, +#* but WITHOUT ANY WARRANTY; without even the implied warranty of +#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +#* GNU General Public License for more details. +#* +#* You should have received a copy of the GNU General Public License +#* along with Lustre; if not, write to the Free Software +#* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ +# binaries +lsmod="/sbin/lsmod" +modprobe="/sbin/modprobe" +insmod="/sbin/insmod" +rmmod="/sbin/rmmod" + +declare -a ost_names +declare -a client_names +declare -a host_list +declare -a dev_list +declare -a unique_hosts +declare count +declare -a vmstatpids +declare -a do_unload_echo + + +DSH=${DSH:-"ssh"} + +dsh () { + local node="$1" + local user="$2" + shift 2 + local command="$@" + + local here=$(pwd) + + command="cd $here; export PATH=/sbin:/usr/sbin:\$PATH; $command" + + case $DSH in + ssh) + if [ -n "$user" ]; then + user="$user@" + fi + $DSH $user$node "$command" + ;; + rsh) + if [ -n "$user" ]; then + user="-l $user" + fi + $DSH $user $node "$command" + ;; + esac +} + +# how to run commands on other nodes +# You need to make this work on your cluster if you have specified +# non-local obd instances above +remote_shell () { + host=$1 + shift + cmds="$@" + if [ "$host" = "localhost" -o "$host" = `uname -n` ]; then + eval "$cmds" + else + # split $host into $host and $user + local user="" + if [[ $host == *@* ]]; then + user=${host%@*} + host=${host#*@} + fi + dsh $host "$user" "$cmds" + fi +} + +# checks whether obdecho module is loded on given host. +# parameter: 1. hostname +obdecho_loaded() { + local host=$1 + remote_shell $host $lsmod | grep obdecho > /dev/null 2>&1 +} + +# load obdecho.ko or obdecho.o module on host kernel. 
+load_obdecho () { + local index=$1 + local host=${unique_hosts[$index]} + do_unload_echo[$index]=0 + if obdecho_loaded $host; then + return 0 + fi + if [ -z "$lustre_root" ]; then + remote_shell $host $modprobe obdecho + elif [ -f ${lustre_root}/obdecho/obdecho.ko ]; then + remote_shell $host $insmod ${lustre_root}/obdecho/obdecho.ko + else + remote_shell $host $insmod ${lustre_root}/obdecho/obdecho.o + fi + if obdecho_loaded $host; then + do_unload_echo[$index]=1 + else + echo Could not install obdecho on $host + return 1 + fi + return 0 +} + +load_obdechos () { + for ((i = 0; i < ${#unique_hosts[@]}; i++)); do + load_obdecho $i || cleanup 1 + done +} + +# unload obdecho module from host kernel. +unload_obdecho () { + local index=$1 + local host=${unique_hosts[$index]} + if ((${do_unload_echo[$index]})); then + remote_shell $host $rmmod obdecho + do_unload_echo[$index]=0 + fi +} + +# returns the device number which is displayed in "lctl device_list" +# +# parameter: 1. hostname +# 2. type of device ex: echo_client +# 3. name of device ex: ECHO_matrix.linsyssoft.com +get_devno () { + local host=$1 + local type=$2 + local name=$3 + remote_shell $host $lctl device_list | \ + awk "{if (\$2 == \"UP\" && \$3 == \"$type\" && \$4 == \"$name\") {\ + print \$1; exit}}" +} + +get_devnos () { + local i=0 + local host + for ((i = 0; i < $count; i++)); do + ost=${ost_names[$i]} + host=${host_list[$i]} + dev=$(get_devno $host obdfilter $ost) + dev_list[$i]=$dev + if [ -z "$dev" ]; then + echo Cant find device for $ost on $host + return 1 + fi + done + return 0 +} + +# do cleanup for netdisk case. +cleanup_netdisk () { + for osc in $@; do + lctl <<EOF + cfg_device $osc + cleanup + detach +EOF + done +} + +# do cleanup for network case. 
+#
+# parameter: 1. "1" when this run created the OSS device on the server and
+#               therefore has to remove it; "0" or empty otherwise
+#
+# Reads the global server_nid. Uses $lctl (plain "lctl" when unset), the
+# same binary setup_srv_obd and setup_OSS use to create these devices.
+cleanup_network () {
+    local clean_srv_OSS=$1
+    ${lctl:-lctl} <<EOF
+    cfg_device echotmp
+    cleanup
+    detach
+EOF
+    remote_shell "root@$server_nid" "${lctl:-lctl} << EOF
+    cfg_device echo_srv
+    cleanup
+    detach
+EOF"
+    # "0" is a non-empty string, so a bare [ $clean_srv_OSS ] was true for
+    # it as well; compare the value so a pre-existing OSS is left in place.
+    if [ "${clean_srv_OSS:-0}" != "0" ]; then
+        remote_shell "root@$server_nid" "${lctl:-lctl} << EOF
+    cfg_device OSS
+    cleanup
+    detach
+EOF"
+    fi
+}
+
+# do cleanup and exit.
+#
+# parameter: 1. exit status (empty when called from the signal trap)
+#            2. clean_srv_OSS flag, handed to cleanup_network
+#            3... osc device names, handed to cleanup_netdisk
+cleanup () {
+    local exit_status=$1
+    shift
+    # ndevs is still unset when a signal arrives early; treat that as 0
+    for ((i = 0; i < ${ndevs:-0}; i++)); do
+        host=${host_names[$i]}
+        # quoted: an unquoted empty expansion makes "[ -n ]" always true
+        if [ -n "${do_teardown_ec[$i]}" ]; then
+            teardown_ec_devno $host ${client_names[$i]}
+        fi
+    done
+    pidcount=0
+    for host in ${unique_hosts[@]}; do
+        remote_shell $host "killall -q vmstat >/dev/null 2>&1" &
+        pid=$!
+        kill -term ${vmstatpids[$pidcount]} 2>/dev/null
+        kill -kill ${vmstatpids[$pidcount]} 2>/dev/null
+        wait $pid
+        pidcount=$((pidcount+1))
+        # NOTE(review): do_unload_obdecho is subscripted with a host name and
+        # unload_obdecho takes an index into unique_hosts; bash evaluates
+        # such subscripts arithmetically - confirm this does what is meant
+        # when there is more than one host.
+        if ((${do_unload_obdecho[$host]})); then
+            unload_obdecho $host
+        fi
+    done
+    if [ "$case" == "network" ]; then
+        cleanup_network "$1"
+    fi
+    if [ "$case" == "netdisk" ]; then
+        shift
+        cleanup_netdisk "$@"
+    fi
+    if [ $exit_status ]; then
+        if [ $exit_status -ne 0 ]; then
+            echo "program exited with error "
+        else
+            echo "done!"
+        fi
+    else
+        echo "Terminated"
+    fi
+    exit $exit_status
+}
+trap cleanup SIGHUP SIGINT SIGTERM
+
+# gets echoclient device number and attch it to the client UUID
+#
+# parameter: 1. hostname
+#            2. client name, ex:- ns8:ECHO_ns8
+#            3. name of ost instances, ex:- lustre-OST0001
+#
+# On success prints three words on stdout: the device number, the client
+# name, and a third word the caller stores in do_teardown_ec. On failure it
+# prints a diagnostic on stderr and nothing on stdout.
+get_ec_devno () {
+    local host=$1
+    local client_name="$2"
+    local ost_name="$3"
+    if [ -z "$client_name" ]; then
+        if [ -z "$ost_name" ]; then
+            echo "client and ost name both null" 1>&2
+            return
+        fi
+        client_name=${ost_name}_ecc
+    fi
+    ec=`get_devno $host echo_client $client_name`
+    if [ -n "$ec" ]; then
+        # NOTE(review): the third word is non-empty here too, so cleanup
+        # also tears down an echo client that already existed - confirm.
+        echo $ec $client_name $client_name
+        return
+    fi
+    if [ -z "$ost_name" ]; then
+        echo "no echo client and ost_name not set, client: $client_name, host: $host" 1>&2
+        return
+    fi
+    ost=`get_devno $host obdfilter $ost_name`
+    if [ -z "$ost" ]; then
+        echo "OST $ost_name not setup" 1>&2
+        return
+    fi
+    # no echo client yet: attach one on top of the obdfilter device
+    client_name=${ost_name}_ecc
+    remote_shell $host "$lctl <<EOF
+    attach echo_client $client_name ${client_name}_UUID
+    setup $ost_name
+EOF"
+    ec=`get_devno $host echo_client $client_name`
+    if [ -z "$ec" ]; then
+        echo "Can't setup echo-client" 1>&2
+        return
+    fi
+    echo $ec $client_name 1
+}
+
+# Create echo-clients using osc_names and osc_uuid
+# It creates echoclients for all osc listed using #lctl device_list command
+#
+# parameter: 1. osc device name; the echo client is named <osc>_ecc
+ec_using_osc () {
+    local osc_name=$1
+    $lctl <<EOF
+    attach echo_client ${osc_name}_ecc ${osc_name}_ecc_UUID
+    cfg_device ${osc_name}_ecc
+    setup $osc_name
+EOF
+
+}
+
+# create echo client using server nid.
+#
+# parameter: 1. server nid (registered as <nid>@tcp under echo_UUID)
+#            2. name for the new osc device
+#            3. uuid used for the osc and for the echo client <name>_ecc
+ec_using_srv_nid () {
+    local nid=$1 osc=$2 uuid=$3
+    $lctl add_uuid echo_UUID $nid@tcp >/dev/null 2>&1
+    # osc that talks to the echo_srv device on the server
+    $lctl <<EOF
+    attach osc $osc $uuid
+    cfg_device $osc
+    setup echo_srv_UUID echo_UUID
+EOF
+    # echo client stacked on that osc
+    $lctl <<EOF
+    attach echo_client ${osc}_ecc $uuid
+    setup $osc
+EOF
+}
+
+# Attach a local osc named <obdfilter>_osc for an OST on another node.
+#
+# parameter: 1. nid of the OST node (registered as <nid>@tcp)
+#            2. obdfilter name; <name>_UUID is the target uuid
+#            3. number used to build the connection uuid host_<n>_UUID
+setup_osc_for_remote_ost () {
+    local nid=$1 filter=$2
+    local conn=host_$3
+    $lctl add_uuid ${conn}_UUID $nid@tcp >/dev/null 2>&1
+    $lctl <<EOF
+    attach osc ${filter}_osc ${filter}_osc_UUID
+    cfg_device ${filter}_osc
+    setup ${filter}_UUID ${conn}_UUID
+EOF
+}
+
+# setup obdecho on server
+#
+# parameter: 1. server nid (commands are sent to root@<nid>)
+#            2. name of the obdecho device to attach
+setup_srv_obd () {
+    local nid=$1 echo_dev=$2
+    remote_shell "root@$nid" "$lctl << EOF
+    attach obdecho $echo_dev ${echo_dev}_UUID
+    cfg_device $echo_dev
+    setup
+EOF"
+}
+
+# setup OSS on server
+#
+# parameter: 1. server nid (commands are sent to root@<nid>)
+setup_OSS () {
+    local nid=$1
+    remote_shell "root@$nid" "$lctl << EOF
+    attach ost OSS OSS_UUID
+    cfg_device OSS
+    setup
+EOF"
+}
+
+# cleanup and detach the echo-clients that we have created during the test.
+# parameter: 1. hostname
+#            2.
client name, ex:- ns8:ECHO_ns8
+teardown_ec_devno () {
+    local host=$1
+    local client_name=$2
+    remote_shell $host "$lctl <<EOF
+    cfg $client_name
+    cleanup
+    detach
+EOF"
+}
+
+# Print the distinct arguments, one per line, sorted.
+unique () {
+    echo "$@" | xargs -n1 echo | sort -u
+}
+
+# Split "host:name" into its two parts and print "host name".
+# A name without a colon is reported with host "localhost".
+#
+# parameter: 1. name, optionally prefixed with "host:"
+split_hostname () {
+    local name=$1
+    case $name in
+    # parameter expansion cuts at the first colon, as the former sed calls did
+    *:*) host=${name%%:*}
+         name=${name#*:}
+         ;;
+    *)   host=localhost
+         ;;
+    esac
+    echo "$host $name"
+}
+
+# Exit with an error when "lctl dl" still lists a device matching the
+# given pattern.
+#
+# parameter: 1. pattern to look for in the device list
+check_cleanup () {
+    type_obj="$1"
+    osc_names_str=$(${lctl:-lctl} dl | grep "$type_obj")
+    count=0;
+    for name in $osc_names_str; do
+        count=$((count+1))
+    done
+
+    if [ $count != 0 ]; then
+        echo "$type_obj could not be cleanup";
+        # failure must not be reported as success
+        exit 1;
+    fi
+
+}
+
+# Exit with an error when "lctl dl" lists no device matching the given
+# pattern.
+#
+# parameter: 1. pattern to look for in the device list
+check_setup () {
+    type_obj="$1"
+    osc_names_str=$(${lctl:-lctl} dl | grep "$type_obj")
+    count=0;
+    for name in $osc_names_str; do
+        count=$((count+1))
+    done
+
+    if [ $count == 0 ]; then
+        echo "$type_obj could not be setup";
+        # failure must not be reported as success
+        exit 1;
+    fi
+
+}
+
+# added from bugzilla req.
+# When ost_names is empty, take every "UP" obdfilter device of the local
+# "lctl device_list" as target. Fills ost_names[], host_names[] and count;
+# exits with status 1 when no target is known.
+get_targets () {
+    if [ -z "$ost_names" ]; then
+        targets=$($lctl device_list | awk "{if (\$2 == \"UP\" && \
+            \$3 == \"obdfilter\") {print \$4} }")
+    fi
+    if [ -z "$targets" ]; then
+        echo "Can't find any OSTs to test. Please set targets=..."
+        exit 1
+    fi
+    count=0
+    for name in $targets; do
+        ost_names[$count]=$name
+        str=(`split_hostname $name`)
+        host_names[$count]=${str[0]}
+        count=$((count+1))
+    done
+}
+
+# Fill host_list[] and ost_names[] from the first count entries of targets.
+get_hosts () {
+    # targets is a whitespace separated string; split it into words first,
+    # since indexing the string itself only ever yields element 0
+    local -a target_list=(${targets[@]})
+    # split out hostnames from ost names
+    for ((i = 0; i < count; i++)); do
+        str=(`split_hostname ${target_list[$i]}`)
+        host_list[$i]=${str[0]}
+        ost_names[$i]=${str[1]}
+    done
+}
diff --git a/lustre-iokit/obdfilter-survey/obdfilter-survey b/lustre-iokit/obdfilter-survey/obdfilter-survey
new file mode 100755
index 0000000000000000000000000000000000000000..7b83a8e6fcb338d2548f9c3b6bc02dd69dc0e8e8
--- /dev/null
+++ b/lustre-iokit/obdfilter-survey/obdfilter-survey
@@ -0,0 +1,532 @@
+#!/bin/bash
+
+######################################################################
+# customize per survey
+
+# specify obd instances to exercise
+# these can be either...
+# obdfilter instances (set 'ost_names')
+# ...or...
+# echo_client instances (set 'client_names')
+# ... use 'host:name' for obd instances on other nodes.
+# allow these to be passed in via string...
+# OR
+# one can specify only case=disk or case=network or case=netdisk through
+# command line.
+
+# Prerequisite: For "disk" case and "netdisk" case you need to have lustre setup
+#             with one or more ost's. For "network" case you need to have all
+#             modules (those llmount.sh loads) loaded in kernel. And the
+#             'lctl dl' output must be blank.
+
+# How to run test:
+# case 1 (local disk):
+#    $ nobjhi=2 thrhi=2 size=1024 case=disk sh obdfilter-survey
+#    one can also run test with user defined targets as follows,
+#    $ nobjhi=2 thrhi=2 size=1024 targets="lustre-OST0000 lustre-OST0001 ..." sh obdfilter-survey
+# case 2 (network):
+#    $ nobjhi=2 thrhi=2 size=1024 targets="<name/ip_of_server>" case=network sh obdfilter-survey
+#    where, targets is name or ip address of system, which you want to
+#    set as server.
+# case 3 (network and disk):
+#    $ nobjhi=2 thrhi=2 size=1024 case=netdisk sh obdfilter-survey
+#    one can also run test with user defined targets as follows,
+#    $ nobjhi=2 thrhi=2 size=1024 targets="<osc_name> ..." sh obdfilter-survey
+#[ NOTE: It is advised to have automated login (passwordless entry) between server and
+#  client systems on which this test runs.]
+
+# include library
+source libecho
+
+# The following variables can be set in the environment, or on the
+# command line
+# result file prefix (date/time + hostname makes unique)
+# NB ensure path to it exists
+rslt_loc=${rslt_loc:-"/tmp"}
+rslt=${rslt:-"$rslt_loc/obdfilter_survey_`date +%F@%R`_`uname -n`"}
+
+# Set this true to check file contents
+verify=${verify:-0}
+
+# total size (MBytes) per obd instance
+# large enough to avoid cache effects
+# and to make test startup/shutdown overhead insignificant
+size=${size:-16384}
+
+# record size (KBytes) ( 7168 max)
+rszlo=${rszlo:-1024}
+rszhi=${rszhi:-1024}
+
+# number of objects per OST
+nobjlo=${nobjlo:-1}
+#was nobjhi=${nobjhi:-512}
+nobjhi=${nobjhi:-16}
+
+# threads per OST (1024 max)
+thrlo=${thrlo:-1}
+thrhi=${thrhi:-16}
+
+# End of variables
+
+# create a set of objects, check there are 'n' contiguous ones and
+# return the first or 'ERROR'
+# parameter: 1. hostname
+#            2. device number
+#            3. number of object to be created (specified by user)
+#            4. tempfile name
+#
+# The lctl output is kept in the tempfile. Exactly one word is printed on
+# stdout: the first object id, or ERROR when the number of ids differs from
+# the request or the ids are not contiguous.
+create_objects () {
+    local host=$1
+    local devno=$2
+    local nobj=$3
+    local rfile=$4
+    # working state is local so a direct call cannot clobber the caller's
+    # count and friends
+    local first=0 prev=0 count=0 error=0
+    local line obj diff
+    remote_shell $host $lctl --device $devno create $nobj > $rfile 2>&1
+    while read line; do
+        # only the lines that announce an object id are of interest
+        echo "$line" | grep -q 'is object id'
+        if [ $? -ne 0 ]; then
+            continue
+        fi
+        if [ $first -eq 0 ]; then
+            first=$(echo $line | awk '{print $6}')
+            # printf %d normalises the id to decimal
+            first=$(printf "%d" $first)
+            prev=$first
+            count=1
+        else
+            obj=$(echo $line | awk '{print $6}')
+            obj=$(printf "%d" $obj)
+            diff=$((obj - (prev+1)))
+            if [ $diff -ne 0 ]; then
+                error=1
+            fi
+            prev=$obj
+            count=$((count+1))
+        fi
+    done < $rfile
+    if [ $nobj -ne $count ]; then
+        echo "ERROR: $nobj != $count" >&2
+        cat $rfile >&2
+        echo "ERROR"
+    elif [ $error -ne 0 ]; then
+        echo "ERROR: non contiguous objs found" >&2
+        # the caller compares stdout with ERROR; without this line the
+        # failure went unnoticed
+        echo "ERROR"
+    else
+        echo $first
+    fi
+}
+
+# destroys all objects created in create_objects routine
+# parameter: 1. hostname
+#            2. device number
+#            3. start obj id.
+#            4. number of objects
+#            5. tempfile name, receives the lctl output
+destroy_objects () {
+    local host=$1
+    local devno=$2
+    local obj0=$3
+    local nobj=$4
+    local rfile=$5
+    remote_shell $host $lctl --device $devno destroy $obj0 $nobj > $rfile 2>&1
+}
+
+# Summarise one lctl test_brw output file.
+# parameter: 1. file name
+#
+# Prints "n min max": n is the number of ".../second" result lines seen
+# (-1 when the file contains "error" or an unexpected line after a result),
+# min and max are the smallest and largest value of their third field.
+get_stats () {
+    local rfile=$1
+    gawk < $rfile \
+        '/^Selected device [0-9]+$/ {n = 0; next}\
+         /error/ {n = -1; exit}\
+         /^[0-9]+\/[0-9]+ Total: [0-9]+\.[0-9]+\/second$/ {n++; v=strtonum($3); \
+                                                         if (n == 1 || v < min) min = v;\
+                                                         if (n == 1 || v > max) max = v;\
+                                                         next}\
+         {if (n != 0) {n = -1; exit}}\
+         END {printf "%d %f %f\n", n, min, max}'
+}
+
+# Combine the "n min max" lines written by get_stats.
+# parameter: 1. file name
+#
+# Prints the smallest first field, the smallest second field and the
+# largest third field; the first field is 0 when the file is empty.
+get_global_stats () {
+    local rfile=$1
+    awk < $rfile 'BEGIN {n = 0;}\
+                  {n++; if (n == 1) {err = $1; min = $2; max = $3} else\
+                                    {if ($1 < err) err = $1;\
+                                     if ($2 < min) min = $2;\
+                                     if ($3 > max) max = $3}}\
+                  END {if (n == 0) err = 0;\
+                       printf "%d %f %f\n", err, min, max}'
+}
+
+# enable or disable data check.
+# parameter: 1.
read/write
+#
+# Prints the test_brw mode letter: "w" for a test name containing "write",
+# "r" otherwise, followed by "x" unless verify is set.
+testname2type () {
+    # 'x' disables data check
+    if ((verify)); then
+        x=""
+    else
+        x="x"
+    fi
+    case $1 in
+    *write*)  echo "w$x";;
+    *)        echo "r$x";;
+    esac
+}
+
+# Echo the arguments to the terminal and append them to the summary file.
+# A leading -n is handed to echo to suppress the newline.
+print_summary () {
+    if [ "$1" = "-n" ]; then
+        minusn=$1; shift
+    else
+        minusn=""
+    fi
+    echo $minusn "$*" >> $rsltf
+    echo $minusn "$*"
+}
+
+# Customisation variables
+#####################################################################
+# One can change variable values in this section as per requirements
+
+targets=${targets:-""}
+case=${case:-"disk"}
+if [ -n "$targets" ]; then
+    declare -a ost_names
+    declare -a client_names
+    count=0
+    for name in $targets; do
+        if [ $case == "disk" ]; then
+            ost_names[$count]=$name
+        else
+            client_names[$count]=$name
+        fi
+        count=$((count+1))
+    done
+fi
+
+# what tests to run (first must be write)
+tests_str=${tests_str:-""}
+if [ -n "$tests_str" ]; then
+    declare -a tests
+    count=0
+    for name in $tests_str; do
+        tests[$count]=$name
+        count=$((count+1))
+    done
+else
+    #tests=(write rewrite read reread rewrite_again)
+    tests=(write rewrite read)
+fi
+
+# restart from here iff all are defined
+restart_rsz=
+restart_thr=1
+restart_nobj=1
+
+# machine's page size (K)
+if [ -z "$PAGE_SIZE" ]; then
+    # getconf reports bytes. The former python one-liner used the python 2
+    # print statement, which is a syntax error under python 3.
+    page_bytes=`getconf PAGESIZE 2>/dev/null`
+    if [ -n "$page_bytes" ]; then
+        PAGE_SIZE=$((page_bytes / 1024))
+    fi
+fi
+PAGE_SIZE=${PAGE_SIZE:-4}
+
+# max buffer_mem (total_threads * buffer size)
+# (to avoid lctl ENOMEM problems)
+max_buffer_mem=$((1024 * 1024))
+snap=1
+clean_srv_OSS=0
+# Customisation variables ends here.
+#####################################################################
+# leave the rest of this alone unless you know what you're doing...
+
+# check and insert obdecho module
+if ! lsmod | grep obdecho > /dev/null; then
+    modprobe obdecho
+fi
+if [ ${#tests[@]} -eq 0 -o "${tests[0]}" != "write" ]; then
+    echo "tests: ${tests[@]}"
+    echo "First test must be 'write'" 1>&2
+    exit 1
+fi
+
+rsltf="${rslt}.summary"
+workf="${rslt}.detail"
+cmdsf="${rslt}.script"
+vmstatf="${rslt}.vmstat"
+echo -n > $rsltf
+echo -n > $workf
+
+# hide a little trick to unset this from the command line
+if [ "$lustre_root" == " " ]; then
+    unset lustre_root
+fi
+
+if [ -z "$lustre_root" ]; then
+    lctl=lctl
+else
+    lctl=${lustre_root}/utils/lctl
+fi
+
+# split out hostnames from client/ost names
+ndevs=0
+for trgt in $targets; do
+    str=(`split_hostname $trgt`)
+    host_names[$ndevs]=${str[0]}
+    client_names[$ndevs]=${str[1]}
+    ndevs=$((ndevs+1))
+done
+if [ $case == "netdisk" ]; then
+    if [ "$targets" ]; then
+        for ((i = 0; i < $ndevs; i++)); do
+            setup_osc_for_remote_ost ${host_names[$i]} ${client_names[$i]} $i
+            cleanup_oscs="$cleanup_oscs ${client_names[$i]}_osc"
+            host_names[$i]=localhost
+        done
+    fi
+    declare -a osc_names
+    declare -a osc_uuids
+    osc_names_str=$(lctl dl |grep osc | awk "{if (\$2 == \"UP\" && \$3 == \"osc\") {print \$4} }")
+    count=0;
+    for name in $osc_names_str; do
+        osc_names[$count]=$name
+        count=$((count+1))
+    done
+    osc_uuid_str=$(lctl dl |grep osc | awk "{if (\$2 == \"UP\" && \$3 == \"osc\") {print \$5} }")
+    count=0;
+    for uuid in $osc_uuid_str; do
+        osc_uuids[$count]=$uuid
+        count=$((count+1))
+    done
+    # one echo client on top of every osc that is up
+    for (( i = 0; i < $count; i++ ))
+    do
+        ec_using_osc ${osc_names[$i]}
+    done
+    echo_clients=$(lctl dl | grep echo_client | awk "{if (\$2 == \"UP\" && \$3 == \"echo_client\") {print \$4} }")
+    cnt=0;
+    for name in $echo_clients; do
+        client_names[$cnt]=$name
+        host_names[$cnt]=localhost
+        cnt=$((cnt+1))
+    done
+    ndevs=${#client_names[@]}
+fi
+if [ $case == "network" ]; then
+    server_nid=$targets
+    if [ -z "$server_nid" ]; then
+        echo "Specify hostname or ip-address of server"
+        exit 1;
+    fi
+    # check for obdecho module on server
+    if ! dsh $server_nid root "lsmod | grep obdecho > /dev/null"; then
+        dsh $server_nid root "modprobe obdecho"
+    fi
+    # Now do the server setup
+    setup_srv_obd $server_nid "echo_srv"
+    oss_on_srv=`dsh $server_nid root "lctl dl | grep OSS" | awk '{ print $4 }'`
+    # quoted so that several matching devices do not break the test
+    if [ -z "$oss_on_srv" ]; then
+        setup_OSS $server_nid
+        clean_srv_OSS=1
+    fi
+    if ! dsh $server_nid root "lctl dl | grep obdecho > /dev/null 2>&1"; then
+        echo "obdecho not setup on server"
+        exit 1
+    fi
+    if ! dsh $server_nid root "lctl dl | grep ost > /dev/null 2>&1"; then
+        echo "ost not setup on server"
+        exit 1
+    fi
+    # Now start client setup
+    osc_names_str=$(lctl dl)
+    if [ -n "$osc_names_str" ]; then
+        echo "The existing setup must be cleaned";
+        # refusing to run is a failure, not a success
+        exit 1;
+    fi
+    ec_using_srv_nid $server_nid "echotmp" "echotmp_UUID"
+    client_names[0]="echotmp_ecc"
+fi
+if [ -z "$targets" ]; then
+    if [ $case == "disk" ]; then
+        get_targets
+        ndevs=${#ost_names[@]}
+    fi
+fi
+# get vmstat started
+# disable portals debug and get obdecho loaded on all relevant hosts
+unique_hosts=(`unique ${host_names[@]}`)
+pidcount=0
+for host in ${unique_hosts[@]}; do
+    host_vmstatf=${vmstatf}_${host}
+    echo -n > $host_vmstatf
+    remote_shell $host "vmstat 5 >> $host_vmstatf" &
+    pid=$!
+    vmstatpids[$pidcount]=$pid
+    pidcount=$((pidcount+1))
+    # NOTE(review): this array is subscripted with a host name, which bash
+    # evaluates arithmetically - confirm the intent for several hosts.
+    do_unload_obdecho[$host]=0
+    if obdecho_loaded $host; then
+        continue
+    fi
+    # load_obdecho takes the index of the host in unique_hosts, not its name
+    load_obdecho $((pidcount-1))
+    if obdecho_loaded $host; then
+        do_unload_obdecho[$host]=1
+        continue
+    fi
+    echo "Can't load obdecho on $host" 1>&2
+    exit 1
+done
+# get all the echo_client device numbers and names
+for ((i=0; i < $ndevs; i++)); do
+    host=${host_names[$i]}
+    devno=(`get_ec_devno $host "${client_names[$i]}" "${ost_names[$i]}"`)
+    # get_ec_devno prints three words on success and none on failure
+    if ((${#devno[@]} != 3)); then
+        exit 1
+    fi
+    devnos[$i]=${devno[0]}
+    client_names[$i]=${devno[1]}
+    do_teardown_ec[$i]=${devno[2]}
+done
+if (($ndevs <= 0 || ${#host_names[@]} <= 0)); then
+    echo "no devices or hosts specified"
+    cleanup 0 $clean_srv_OSS $cleanup_oscs
+fi
+print_summary "$(date) Obdfilter-survey for case=$case from $(hostname)"
+for ((rsz = $rszlo; rsz <= $rszhi; rsz*=2)); do
+    for ((nobj = $nobjlo; nobj <= $nobjhi; nobj*=2)); do
+        for ((thr = $thrlo; thr <= $thrhi; thr*=2)); do
+            # threads are spread evenly over the objects
+            if ((thr % nobj)); then
+                continue
+            fi
+            # restart?
+            if [ -n "$restart_rsz" -a\
+                 -n "$restart_nobj" -a\
+                 -n "$restart_thr" ]; then
+                if ((rsz < restart_rsz ||\
+                     (rsz == restart_rsz &&\
+                      (nobj < restart_nobj ||\
+                       (nobj == restart_nobj &&\
+                        thr < restart_thr))))); then
+                    continue;
+                fi
+            fi
+            # compute parameters
+            total_thr=$((ndevs*thr))
+            total_nobj=$((ndevs*nobj))
+            pages=$((rsz/PAGE_SIZE))
+            actual_rsz=$((pages*PAGE_SIZE))
+            count=$((size*1024/(actual_rsz*thr)))
+            actual_size=$((actual_rsz*count*thr))
+            total_size=$((actual_size*ndevs))
+            # show computed parameters
+            str=`printf 'ost %2d sz %8dK rsz %4d obj %4d thr %4d ' \
+                 $ndevs $total_size $actual_rsz $total_nobj $total_thr`
+            echo "=======================> $str" >> $workf
+            print_summary -n "$str"
+            if ((total_thr * actual_rsz > max_buffer_mem)); then
+                print_summary "Too much buffer space"
+                continue
+            fi
+            # create the objects
+            tmpf="${workf}_tmp"
+            for ((idx = 0; idx < $ndevs; idx++)); do
+                host=${host_names[$idx]}
+                devno=${devnos[$idx]}
+                client_name="${host}:${client_names[$idx]}"
+                echo "=============> Create $nobj on $client_name" >> $workf
+                first_obj=`create_objects $host $devno $nobj $tmpf`
+                cat $tmpf >> $workf
+                rm $tmpf
+                # an empty answer is a failure as well, and unquoted it
+                # would turn the test into a syntax error
+                if [ -z "$first_obj" -o "$first_obj" = "ERROR" ]; then
+                    print_summary "created object #s on $client_name not contiguous"
+                    exit 1
+                fi
+                first_objs[$idx]=$first_obj
+            done
+            # run tests
+            for test in ${tests[@]}; do
+                declare -a pidarray
+                for host in ${unique_hosts[@]}; do
+                    echo "starting run for test: $test rsz: $rsz threads: $thr objects: $nobj" >> ${vmstatf}_${host}
+                done
+                print_summary -n "$test "
+                # create per-host script files
+                for host in ${unique_hosts[@]}; do
+                    echo -n > ${cmdsf}_${host}
+                done
+                for ((idx = 0; idx < $ndevs; idx++)); do
+                    host=${host_names[$idx]}
+                    devno=${devnos[$idx]}
+                    tmpfi="${tmpf}_$idx"
+                    first_obj=${first_objs[$idx]}
+                    thr_per_obj=$((${thr}/${nobj}))
+                    echo >> ${cmdsf}_${host} \
+                        "$lctl > $tmpfi 2>&1 \\
+                         --threads $thr -$snap $devno \\
+                         test_brw $count `testname2type $test` q $pages ${thr_per_obj}t${first_obj} &"
+                done
+                pidcount=0
+                for host in ${unique_hosts[@]}; do
+                    echo "wait" >> ${cmdsf}_${host}
+                    pidarray[$pidcount]=0
+                    pidcount=$((pidcount+1))
+                done
+                # timed run of all the per-host script files
+                t0=`date +%s.%N`
+                pidcount=0
+                for host in ${unique_hosts[@]}; do
+                    remote_shell $host bash < ${cmdsf}_${host} &
+                    pidarray[$pidcount]=$!
+                    pidcount=$((pidcount+1))
+                done
+                pidcount=0
+                for host in ${unique_hosts[@]}; do
+                    wait ${pidarray[$pidcount]}
+                    pidcount=$((pidcount+1))
+                done
+                #wait
+                t1=`date +%s.%N`
+                # clean up per-host script files
+                for host in ${unique_hosts[@]}; do
+                    rm ${cmdsf}_${host}
+                done
+                # compute bandwidth from total data / elapsed time
+                str=`awk "BEGIN {printf \"%7.2f \",\
+                     $total_size / (( $t1 - $t0 ) * 1024)}"`
+                print_summary -n "$str"
+                # collect/check individual OST stats
+                echo -n > $tmpf
+                for ((idx = 0; idx < $ndevs; idx++)); do
+                    client_name="${host_names[$idx]}:${client_names[$idx]}"
+                    tmpfi="${tmpf}_$idx"
+                    echo "=============> $test $client_name" >> $workf
+                    host="${host_names[$idx]}"
+                    remote_shell $host cat $tmpfi >> $workf
+                    # NOTE(review): the file is read through remote_shell
+                    # above but parsed and removed locally here - confirm
+                    # this is meant for hosts other than localhost.
+                    get_stats $tmpfi >> $tmpf
+                    rm $tmpfi
+                done
+                # compute/display global min/max stats
+                echo "=============> $test global" >> $workf
+                cat $tmpf >> $workf
+                stats=(`get_global_stats $tmpf`)
+                rm $tmpf
+                if ((stats[0] <= 0)); then
+                    if ((stats[0] < 0)); then
+                        str=`printf "%17s " ERROR`
+                    else
+                        str=`printf "%17s " SHORT`
+                    fi
+                else
+                    str=`awk "BEGIN {printf \"[%7.2f,%7.2f] \",\
+                         (${stats[1]} * $actual_rsz)/1024,\
+                         (${stats[2]} * $actual_rsz)/1024; exit}"`
+                fi
+                print_summary -n "$str"
+            done
+            print_summary ""
+            # destroy objects we created
+            for ((idx = 0; idx < $ndevs; idx++)); do
+                host=${host_names[$idx]}
+                devno=${devnos[$idx]}
+                client_name="${host}:${client_names[$idx]}"
+                first_obj=${first_objs[$idx]}
+                echo "=============> Destroy $nobj on $client_name" >> $workf
+                destroy_objects $host $devno $first_obj $nobj $tmpf
+                cat $tmpf >> $workf
+                rm $tmpf
+            done
+        done
+    done
+done
+cleanup 0 $clean_srv_OSS $cleanup_oscs
+exit 0
diff --git a/lustre-iokit/obdfilter-survey/plot-obdfilter b/lustre-iokit/obdfilter-survey/plot-obdfilter
new file mode 100644
index 0000000000000000000000000000000000000000..aaf51b095282cfc5d45d7ce7c3d76571e7c5392f
--- /dev/null
+++ b/lustre-iokit/obdfilter-survey/plot-obdfilter
@@ -0,0 +1,258 @@
+#!/usr/bin/perl -w
+#* Copyright (C) 2002 Cluster File Systems, Inc.
+#* Author: Jitendra Pawar <jitendra@clusterfs.com>
+#*
+#* Lustre is free software; you can redistribute it and/or
+#* modify it under the terms of version 2 of the GNU General Public
+#* License as published by the Free Software Foundation.
+#*
+#* Lustre is distributed in the hope that it will be useful,
+#* but WITHOUT ANY WARRANTY; without even the implied warranty of
+#* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+#* GNU General Public License for more details.
+#*
+#* You should have received a copy of the GNU General Public License
+#* along with Lustre; if not, write to the Free Software
+#* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+#
+# Report generation for plot-obdfilter
+# ====================================
+# The plot-obdfilter script is used to generate csv file and
+# instructions files for gnuplot from the output of obdfilter-survey script.
+#
+# The plot-obdfilter also creates .scr file that contains instructions
+# for gnuplot to plot the graph. After generating .dat and .scr files this
+# script invokes gnuplot to display graph.
+#
+# Syntax:
+# $ obdfilter-survey > log_filename
+# $ plot-obdfilter <log_filename>
+# [Note: 1. Please use the .summary file generated by obdfilter-survey as log_file.
+#           It is generally available at /tmp/obdfilter_survey_<date_time_system>.summary
+#        2. This script may need modifications whenever there will be
+#           modifications in output format of obdfilter-survey script.
+#        3. Gnuplot version 4.0 or above is required.]
+
+my @GraphTitle;
+# Print the command line help and leave with exit status 1.
+sub usage() {
+    print
+        "Usage: $0 <log_filename> [--st=<subtitle>] [--y0=<Y-axis start point>]\n",
+        " The $0 parses and plots graphs for output of obdfilter-survey using gnuplot.\n",
+        " It generates <log_filename>-<Rsize><rd/wr>.dat and\n",
+        " <log_filename>-<Rsize>-<rd/wr/rrd/rwr/rwa>.scr files.\n",
+        " Those will be used for graphing the results\n",
+        "OPTIONS:\n",
+        " --st: SubTitle for the graph\n",
+        " --y0: Start point of Y-axis, Default value automatically taken based on Y-axis values ranges\n",
+        " log_file: use the .summary file generated by obdfilter-survey as log_file.\n",
+        " It is generally available at /tmp/obdfilter_survey_<date_time_system>.summary\n",
+        "e.g. # $0 obdfilter-log --st=\"Sub-Title\" --y0=50\n";
+    exit 1;
+}
+
+# gnuplot has to be on the PATH; stop right away when "which" cannot find it.
+if (system ("which gnuplot > /dev/null") != 0) {
+    die "gnuplot does not exist, please install it and try again.\n";
+}
+
+#Subroutine to write .scr file that further used by gnuplot to plot the graph.
+# Arguments: 1. short operation tag used in the file names (rd, wr, ...)
+#            2. label used in the graph title and legend
+# Reads the globals $file, $rsz, $subtitle, $opt_y0, $first_thread and
+# $thread; sets $graphgen once a graph has been produced.
+sub write_scr_file() {
+    my $op = $_[0];
+    my $rwlabel = $_[1];
+    my $base = "$file-$rsz-$op";
+    print "generating plot $base.png\n";
+    # three-argument open: the name is taken literally, whatever it contains
+    open ( SCRFILE, ">", "$base.scr" ) or die "Can't open scr file for writing";
+
+    if ($subtitle) {
+        print SCRFILE "set title \"@GraphTitle\\n$rwlabel, Rsize = $rsz KBytes, $subtitle\"\n";
+    } else {
+        print SCRFILE "set title \"@GraphTitle\\n$rwlabel, Rsize = $rsz KBytes\"\n";
+    }
+    print SCRFILE "set xlabel \"Threads\"\n";
+    print SCRFILE "set ylabel \"Speeds(MB/s)\"\n";
+    print SCRFILE "set logscale x\n";
+    print SCRFILE "set grid\n";
+    print SCRFILE "set terminal png\n";
+    print SCRFILE "set output \"/dev/null\"\n";
+    # -9999 is the "not given on the command line" marker for --y0
+    if ($opt_y0 != -9999) {
+        print SCRFILE "set yrange [ $opt_y0: ]\n";
+    }
+    my $plot = "plot";
+    $i = 2;
+    $xrange = 1;
+    # generate instructions for gnuplot, with adjusting X-axes ranges
+    for ($j = $first_thread; $j <= $thread ; $j = $j + $j) {
+        # every value goes through a conversion so that a '%' in the file
+        # name cannot be mistaken for part of the format
+        printf SCRFILE "%s \"%s.dat\" using 1:%s axes x%dy1 title \"%s-obj%s\" with line\n",
+               $plot, $base, $i, $xrange, $rwlabel, $j;
+        $i++;
+        $plot = "replot";
+    }
+    print SCRFILE "set output \"$base.png\"\n";
+    print SCRFILE "replot\n";
+    close SCRFILE;
+    $graphgen = 1;
+    # invoke gnuplot to display graph.
+    # list form: no shell is involved, so blanks in the name are harmless
+    system ("gnuplot", "$base.scr") == 0 or die "ERROR: while ploting graph";
+    unlink ("$base.scr");
+}
+
+#Subroutine to write .dat file that further used by gnuplot to plot the graph.
+# Argument: 1. short operation tag (rd, wr, rwr, rrd or rwa)
+# Writes one header row plus one row per value of the outer loop; a cell
+# without a (true) value is written as "-".
+sub write_dat_file() {
+    my $op = $_[0];
+    # result hash filled by the log parser for each operation tag
+    my %results = (
+        "rd"  => \%ard,
+        "wr"  => \%awr,
+        "rwr" => \%arwr,
+        "rrd" => \%arrd,
+        "rwa" => \%arwa,
+    );
+    my $data = $results{$op};
+    print "writing data $file-$rsz-$op.dat\n";
+    # Open .csv/.dat file for writing required columns from log file.
+    open ( DATAFILE, ">", "$file-$rsz-$op.dat" ) or die "Can't open csv file for writing";
+    printf DATAFILE "%-6s", "thrd";
+    for ($j = $first_thread; $j <= $thread ; $j = $j + $j) {
+        printf DATAFILE "%-10s", "$op-obj$j";
+    }
+    for ( $i = $first_obj; $i <= $obj; $i = $i + $i ) {
+        printf DATAFILE "\n%-6s", $i;
+        for ($j = $first_thread; $j <= $thread ; $j = $j + $j) {
+            # an unknown tag produced no cells before either
+            next unless $data;
+            if ( $data->{$i}{$j} ) {
+                printf DATAFILE "%-10s", $data->{$i}{$j};
+            } else {
+                printf DATAFILE "%-10s", "-";
+            }
+        }
+    }
+    close DATAFILE;
+}
+
+#Subroutine to call .scr and .dat file write routines.
+# For every entry of @operations, write the .dat file and then the .scr file
+# (which also runs gnuplot) under the matching short tag. Entries that are
+# not one of the five known names are skipped.
+sub write_files() {
+    for ($cnt = 0; $cnt < @operations; $cnt = $cnt + 1) {
+        # switch-case can be used instead if else
+        if($operations[$cnt] eq "read") {
+            &write_dat_file("rd");
+            &write_scr_file("rd", "read");
+        } elsif ($operations[$cnt] eq "write") {
+            &write_dat_file("wr");
+            &write_scr_file("wr", "write");
+        } elsif ($operations[$cnt] eq "reread") {
+            &write_dat_file("rrd");
+            &write_scr_file("rrd", "reread");
+        } elsif ($operations[$cnt] eq "rewrite") {
+            &write_dat_file("rwr");
+            &write_scr_file("rwr", "rewrite");
+        } elsif ($operations[$cnt] eq "rewrite_again") {
+            &write_dat_file("rwa");
+            &write_scr_file("rwa", "rewrite_again");
+        }
+    }
+}
+
+# Main program: the first argument is the log file; it is read from
+# $ARGV[0] before GetOptions runs.
+if ( !$ARGV[0] ) {
+    usage();
+}
+$file = $ARGV[0];
+$obj = 0;
+$thread = 0;
+$first_obj = 1;
+$first_thread = 1;
+$count = 0;
+$rsz = 0;
+$subtitle = "";
+# -9999 marks "no --y0 given"
+$opt_y0 = -9999;
+$cnt = 0;
+@operations = ();
+$graphgen = 0;
+# Command line parameter parsing
+use Getopt::Long;
+GetOptions ('help' => \$opt_help, 'st=s' => \$subtitle, 'y0=i' => \$opt_y0) or usage();
+if ($opt_help) {
+    usage();
+}
+open ( PFILE, "$file") or die "Can't open results";
+LABEL: while ( <PFILE> ) {
+    chomp;
+    @line = split( /\s+/ );
+    # the first line of the log becomes the graph title
+    if ($count == 0) {
+        @GraphTitle = @line;
+        $count++;
+        next LABEL;
+    }
+    $linelen = @line;
+    if ($linelen > 26 || $linelen < 11) {
+        print "invalid file format at line $count\n";
+        exit 1;
+    }
+    # NOTE(review): the field positions used below (5, 7, 9, then a test
+    # name every 5 fields from 10 with its value one field later) presumably
+    # follow the obdfilter-survey summary line - confirm against that script.
+    # first data line: remember the record size and collect the test names
+    if (!$rsz && $line[5]) {
+        $cnt = 0;
+        $rsz = $line[5];
+        $first_obj = $line[7];
+        $first_thread = $line[9];
+        for ($i = 10; $i <= $linelen; $i = $i + 5) {
+            if ($line[$i]) {
+                $operations[$cnt] = $line[$i];
+                $cnt++;
+            }
+        }
+    }
+    # record size changed: flush the graphs collected so far and start over
+    if ($rsz != $line[5]) {
+        &write_files();
+        $rsz = $line[5];
+        $first_obj = $line[7];
+        $first_thread = $line[9];
+        @operations = ();
+        $cnt = 0;
+        for ($i = 10; $i <= $linelen; $i = $i + 5) {
+            if ($line[$i]) {
+                $operations[$cnt] = $line[$i];
+                $cnt++;
+            }
+        }
+        $obj = 0;
+        $thread = 0;
+    }
+    # store the value of each test, keyed by field 9 and then field 7
+    for ($i = 0; $i < @operations; $i++) {
+        # switch-case can be used instead if else
+        if($operations[$i] eq "read") {
+            $ard{$line[9]}{$line[7]} = $line[$i * 5 + 11];
+        } elsif ($operations[$i] eq "write") {
+            $awr{$line[9]}{$line[7]} = $line[$i * 5 + 11];
+        } elsif ($operations[$i] eq "reread") {
+            $arrd{$line[9]}{$line[7]} = $line[$i * 5 + 11];
+        } elsif ($operations[$i] eq "rewrite") {
+            $arwr{$line[9]}{$line[7]} = $line[$i * 5 + 11];
+        } elsif ($operations[$i] eq "rewrite_again") {
+            $arwa{$line[9]}{$line[7]} = $line[$i * 5 + 11];
+        }
+    }
+    # $obj tracks the largest field 9 seen, $thread the largest field 7
+    if ( $obj < $line[9] ) {
+        $obj = $line[9];
+    }
+    if ( $thread < $line[7] ) {
+        $thread = $line[7];
+    }
+    $count++;
+}
+close PFILE;
+# graphs for the last record size
+if ($count > 1 && $rsz) {
+    &write_files();
+}
+if (!$graphgen) {
+    print "Invalid log file format\n";
+}
diff --git a/lustre-iokit/ost-survey/.cvsignore b/lustre-iokit/ost-survey/.cvsignore
new file mode 100644
index 0000000000000000000000000000000000000000..282522db0342d8750454b3dc162493b5fc709cc8
--- /dev/null
+++ b/lustre-iokit/ost-survey/.cvsignore
@@ -0,0 +1,2 @@
+Makefile
+Makefile.in
diff --git a/lustre-iokit/ost-survey/Makefile.am b/lustre-iokit/ost-survey/Makefile.am
new file mode 100644
index 0000000000000000000000000000000000000000..5d2c14a07c06cfee6b06f03c80196428ee281f9b
--- /dev/null
+++ b/lustre-iokit/ost-survey/Makefile.am
@@ -0,0 +1,3 @@
+bin_SCRIPTS = ost-survey plot-ost
+CLEANFILE = $(bin_SCRIPTS)
+EXTRA_DIST = README.ost-survey ost-survey plot-ost
diff --git a/lustre-iokit/ost-survey/README.ost-survey b/lustre-iokit/ost-survey/README.ost-survey
new file mode 100644
index 0000000000000000000000000000000000000000..29d82b21c532d673cb9944c18616971c8e42a63c
--- /dev/null
+++ b/lustre-iokit/ost-survey/README.ost-survey
@@ -0,0 +1,26 @@
+ost-survey (OST performance survey)
+===================================
+ This script is designed to test the client-to-disk performance
+of the individual OSTs in a Lustre filesystem.
The network transfer
+time from the client is included; to get a better idea of the isolated
+disk performance, run this script on a client running on the OST.
+
+Syntax:
+ $ ost-survey [-h] [-s <size>] <lustre_path>
+ where -s : size in MB
+ -h : help
+ <lustre_path> : mount point of lustre client
+
+Assumptions
+ - Lustre filesystem is up and running
+ - Script is being run on a client
+
+
+plot-ost (OST survey graph)
+===========================
+ The plot-ost script can be used to plot the results from the
+ost-survey script using gnuplot.
+
+Syntax: $ ost-survey /mnt/lustre > ost_log
+ $ plot-ost ost_log
+
diff --git a/lustre-iokit/ost-survey/ost-survey b/lustre-iokit/ost-survey/ost-survey
new file mode 100755
index 0000000000000000000000000000000000000000..fc93117363c1242e2efa64a3a97c59ee0e6a36ce
--- /dev/null
+++ b/lustre-iokit/ost-survey/ost-survey
@@ -0,0 +1,271 @@
+#!/usr/bin/perl
+# This script is to be run on a client machine and will test all the
+# OSTs to determine which is the fastest and slowest
+# The current test method is as follows:
+# -Create a directory for each OST
+# -Use 'lfs setstripe' to set the Lustre striping such that IO goes to
+# only one OST
+# -Use 'dd' to write and read a file of a specified size
+# -Compute the average, and Standard deviation
+# -Find the slowest OST for read and write
+# -Find the Fastest OST for read and write
+
+# GLOBALS
+$pname = $0; # to hold program name
+$OSTS = 0; # Number of OSTS we will loop over
+$BSIZE = 1024 * 1024; # Size of i/o block
+$MNT = "/mnt/lustre"; # Location of Lustre file system
+$FSIZE = 30; # Number of i/o blocks
+
+# Usage
+sub usage () {
+    print "Usage: $pname [-s <size>] [-h] <Lustre_Path>\n";
+    print "[OPTIONS]\n";
+    print " -s: size of test file in MB (default $FSIZE MB)\n";
+    print " -h: To display this help\n";
+    print "example : $pname /mnt/lustre\n";
+    exit 1;
+}
+
+# ost_count subroutine sets global variable $OSTS with Number of OST's
+# Also fills 1 for active OST indexes in
ACTIVEOST_INX array.
+# The numbers come from the activeobd, numobd and target_obd files below
+# /proc/fs/lustre/lov/*-clilov-*. When the two counts differ, $OSTS is set
+# to the total so that the callers loop over every index.
+sub ost_count () {
+    # numobd gives number of ost's and activeobd gives number of active ost's
+    # glob is called in list context: in scalar context it is an iterator
+    # that yields the next match (or undef) on a later call
+    my ($tempfile) = glob ("/proc/fs/lustre/lov/*-clilov-*/activeobd");
+    open(PTR, $tempfile) || die "Cannot open $tempfile: $!\n";
+    $OSTS = <PTR>;
+    close PTR;
+    # $OSTS still ends with the newline read from the file
+    print "Number of Active OST devices : $OSTS";
+    ($tempfile) = glob ("/proc/fs/lustre/lov/*-clilov-*/numobd");
+    open(PTR, $tempfile) || die "Cannot open $tempfile: $!\n";
+    $numost = <PTR>;
+    close PTR;
+    if ( $numost != $OSTS ) {
+        printf "Number of non active ots(s): %d\n", ( $numost - $OSTS );
+        $OSTS = $numost;
+    }
+    ($tempfile) = glob ("/proc/fs/lustre/lov/*-clilov-*/target_obd");
+    open(PTR, $tempfile) || die "Cannot open $tempfile: $!\n";
+    my $count = 0;
+    while (<PTR>) {
+        # chomp only removes a line ending, chop would eat any last character
+        chomp;
+        my ($ost_num, $ost_name, $ost_status) = split(/\s+/, $_);
+        if ( $ost_status eq "ACTIVE" ) {
+            $ACTIVEOST_INX[$count] = 1;
+        }
+        $count++;
+    }
+    close PTR;
+}
+
+# Remember the current max_cached_mb value in $CACHESZ and write 0 to the
+# file; cache_on writes the remembered value back.
+sub cache_off () {
+    ($CACHEFILE) = glob ("/proc/fs/lustre/llite/*/max_cached_mb");
+    # report the file that was actually tried ($tempfile is not set here)
+    open(PTR, $CACHEFILE) || die "Cannot open $CACHEFILE: $!\n";
+    $CACHESZ = 0 + <PTR>;
+    close PTR;
+    system("echo 0 >> $CACHEFILE");
+}
+
+# Write the value saved by cache_off back to the max_cached_mb file.
+sub cache_on () {
+    system("echo $CACHESZ >> $CACHEFILE");
+}
+
+# make_dummy subroutine creates a dummy file that will be used for read operation.
+# Arguments: 1. number of blocks of $BSIZE bytes
+#            2. path of the file to create
+sub make_dummy () {
+    my $SIZE = $_[0];
+    my $tempfile = $_[1];
+    system ("dd of=$tempfile if=/dev/zero count=$SIZE bs=$BSIZE 2> /dev/null");
+}
+
+# run_test subroutine actually writes and reads data to/from dummy file
+# and compute corresponding time taken for read and write operation and
+# byte transfer for the both operations.
+# This subroutine also fills the corresponding global arrays with above information.
+sub run_test () { # args: block count, OST index, "write" or "read", test file path + my $SIZE = $_[0]; + my $INX=$_[1]; + my $ACTION=$_[2]; + my $tempfile = $_[3]; + + if ( !(-f $tempfile) && $ACTION eq "read" ) { + &make_dummy($SIZE, $tempfile); # nothing to read yet, so create the file first + } + system("sync"); + my ($ts0, $tu0) = gettimeofday(); + $tu0 = $ts0 + ($tu0 / 1000000); # start time as fractional seconds + if ( $ACTION eq "write" ) { + system("dd of=$tempfile if=/dev/zero count=$SIZE bs=$BSIZE 2> /dev/null"); + } elsif ( $ACTION eq "read" ) { + system("dd if=$tempfile of=/dev/null count=$SIZE bs=$BSIZE 2> /dev/null"); + } else { + print "Action is neither read nor write\n"; + exit 1; + } + system("sync"); + my ($ts1, $tu1) = gettimeofday(); + $tu1 = $ts1 + ($tu1/1000000); # end time as fractional seconds + my $tdelta = $tu1 - $tu0; # elapsed seconds, including the trailing sync + my $delta = ($SIZE * $BSIZE / ( $tu1 - $tu0 )) / (1024 * 1024); # throughput in MB/s + if ( $ACTION eq "write" ) { + $wTime[$INX] = $tdelta; + $wMBs[$INX] = $delta; + } else { + $rTime[$INX] = $tdelta; + $rMBs[$INX] = $delta; + } +} + +# calculate subroutine computes the following things and displays them. +# - Finds worst and best OST for both read and write operations. 
+# - Compute average of read and write rate from all active OSTs +# - Compute standard deviation for read and write from all active OSTs +sub calculate () { + my ($op, @MBs) = @_; # operation label, then per-OST MB/s indexed by OST number + my $active = 0; # number of active OSTs that contribute to the statistics + my $count = 0; + my $total = 0; + my $avg = 0; + my $sd = 0; + my $best_OST = 0; + my $worst_OST = 0; + my $max_mb = 0; + my $min_mb = 999999999; + while ($count < $OSTS ) { + if ( $ACTIVEOST_INX[$count] ) { + $active++; + $total = $total + $MBs[$count]; + if ($max_mb < $MBs[$count] ) { + $max_mb = $MBs[$count]; + $best_OST = $count; + } + if ($min_mb > $MBs[$count] ) { + $min_mb = $MBs[$count]; + $worst_OST = $count; + } + } + $count++; + } + $avg = $active ? $total/$active : 0; # divide by the OSTs actually summed, not by $OSTS + $total = 0; + $count = 0; + while ($count < $OSTS ) { + if ( $ACTIVEOST_INX[$count] ) { + $total = $total + ($MBs[$count] - $avg) * ($MBs[$count] - $avg); + } + $count++; + } + $sd = $active ? sqrt($total/$active) : 0; # population standard deviation over active OSTs + printf "Worst %s OST indx: %d speed: %f\n", $op, $worst_OST, $min_mb; + printf "Best %s OST indx: %d speed: %f\n", $op, $best_OST, $max_mb; + printf "%s Average: %f +/- %f MB/s\n", $op, $avg, $sd; +} + +# output_all_data subroutine displays speed and time information +# for all OST's for both read and write operations. 
+sub output_all_data () { # prints one table row per OST index below $OSTS + my $count = 0; + print "Ost# Read(MB/s) Write(MB/s) Read-time Write-time\n"; + print "----------------------------------------------------\n"; + while ( $count < $OSTS ) { + if ( $ACTIVEOST_INX[$count] ) { + printf "%d %.3f %.3f %.3f %.3f\n",$count, + $rMBs[$count], $wMBs[$count], $rTime[$count], $wTime[$count]; + } else { + printf "%d Inactive ost\n",$count; + } + $count = $count + 1; + } +} + +@rTime = (); +@wTime = (); +@rMBs = (); +@wMBs = (); +@ACTIVEOST_INX; + +# Locals +my $filename = ""; +my $dirpath = ""; +my $flag = 0; + +# Command line parameter parsing +use Getopt::Std; +getopts('s:h') or usage(); +usage() if $opt_h; +$FSIZE = $opt_s if $opt_s; + +my $i = 0; +foreach (@ARGV) { + $MNT = $_; + $i++; + if ($i > 1) { + print "ERROR: extra argument $_\n"; + usage(); + } +} +#Check for Time::HiRes module +my $CheckTimeHiRes = "require Time::HiRes"; +eval ($CheckTimeHiRes) or die "You need to install the perl-Time-HiRes package to use this script\n"; +my $LoadTimeHiRes = "use Time::HiRes qw(gettimeofday)"; +eval ($LoadTimeHiRes); + +use POSIX qw(strftime); +my $time_v = time(); +my $hostname = `lctl list_nids | head -1` or die "You need to install lctl to use this script\n"; +chop($hostname); +print "$pname: ", strftime("%D", localtime($time_v)); +print " OST speed survey on $MNT from $hostname\n"; + +# get OST count +ost_count (); + +$dirpath = "$MNT/ost_survey_tmp"; +eval { mkpath($dirpath) }; +if ($@) { + print "Couldn't create $dirpath: $@"; + exit 1; +} +# turn off local cache only after setup succeeded, so the exit above cannot leave it disabled +cache_off (); + +use File::Path; +$CNT = 0; +while ($CNT < $OSTS) { + $filename = "$dirpath/file$CNT"; + if ( $ACTIVEOST_INX[$CNT] ) { + # set stripe for OST number $CNT + system ("lfs setstripe $filename 0 $CNT 1"); + # Perform write for OST number $CNT + &run_test($FSIZE,$CNT,"write",$filename); + $flag++; + } + $CNT = $CNT + 1; +} +$CNT = 0; +while ($CNT < $OSTS) { + $filename = "$dirpath/file$CNT"; + if ( $ACTIVEOST_INX[$CNT] ) { + # Perform read for 
OST number $CNT + &run_test($FSIZE,$CNT,"read",$filename); + $flag++; + } + $CNT = $CNT + 1; +} + +# if read or write performed on any OST then display information. +if ( $flag ) { + if ( $flag > 1 ) { + &calculate("Read",@rMBs); + &calculate("Write",@wMBs); + } + output_all_data (); +} else { + print "There is no active OST's found\n"; +} + +cache_on (); + +eval { rmtree($dirpath) }; +if ($@) { + print "Warning: Couldn't remove $dirpath: $@"; +} diff --git a/lustre-iokit/ost-survey/plot-ost b/lustre-iokit/ost-survey/plot-ost new file mode 100755 index 0000000000000000000000000000000000000000..a16a2a161a2357221fee80b48ac83f6b808be3d0 --- /dev/null +++ b/lustre-iokit/ost-survey/plot-ost @@ -0,0 +1,80 @@ +#!/usr/bin/perl -w +# Report generation for ost-survey.pl +# =================================== +# The plot-ost.pl script is used to generate csv file and +# instructions files for gnuplot from the output of ost-survey.pl script. +# +# The plot-ost.pl also creates .scr file that contains instructions +# for gnuplot to plot the graph. After generating .dat and .scr files this +# script invokes gnuplot to display graph. +# +# Syntax: +# $ plot-ost.pl <log_filename> +# Note: 1. This script may need modifications whenever there will be +# modifications in output format of ost-survey.pl script. +# 2. Gnuplot version 4.0 or above is required. + +# arg 0 is filename +sub usages_msg(){ + print "Usage: $0 <log_filename> \n"; + print " $0 produces graphs from the output of ost-survey.pl\n"; + print " using gnuplot.\n"; + print "e.g.# perl ost-survey /mnt/lustre > ost-log; perl $0 ost-log\n"; + exit 1; +} + +my $count = 0; # count for number of rows in csv(.dat) file. +my @line; # To store recently read line from log file +my $flag = 0; +my @GraphTitle; +if ( !$ARGV[0] ) { + usages_msg(); +} + +$file = $ARGV[0]; +# Open log file for reading +open ( PFILE, "$file") or die "Can't open results log file"; +# Open .csv file for writting required columns from log file. 
+open ( DATAFILE, "> $file.dat" ) or die "Can't open csv file for writting"; +LABLE:while ( <PFILE> ) { + chomp; + @line = split( /\s+/ ); # splits line into tokens + # This comparison may be changed if there will be changes log file. + if ( $line[0] eq "Ost#" ) { + print DATAFILE "$line[0] $line[1] $line[2]\n"; + $flag = 1; + <PFILE>; # skip the "---------" line from result file. + last LABLE; + } + if ($line[2] eq "OST" && $line[3] eq "speed") { + @GraphTitle = @line; + @GraphTitle = split( /:/ ); + } +} +if ( !$flag) { + print "Invalid logfile format\n"; + exit 1; +} +while ( <PFILE> ) { + chomp; + @line = split( /\s+/ ); # splits line into tokens + if ( $line[1] ne "Inactive" ) { + print DATAFILE "$count $line[1] $line[2]\n"; + } + $count = $count + 1; +} +close PFILE; +close DATAFILE; +# Open .scr file for writting instructions for gnuplot. +open ( SCRFILE, "> $file.scr" ) or die "Can't open scr file for writting"; +# generate instructions for gnuplot. decide axes depends on ranges in @columnvalues +print SCRFILE "set title \"$GraphTitle[1]\"\n"; +print SCRFILE "set xlabel \"OST index\"\n"; +print SCRFILE "set ylabel \"MB/s\"\n"; +print SCRFILE "set boxwidth 0.2\n"; +print SCRFILE "plot \"$file.dat\" using 1:2 axes x1y1 title \"Read(MB/s)\" with boxes fs solid 0.7\n"; +print SCRFILE "replot \"$file.dat\" using (\$1 + 0.2):3 axes x1y1 title \"Write(MB/s)\" with boxes fs solid 0.7\n"; +print SCRFILE "pause -1\n"; +close SCRFILE; +# invoke gnuplot to display graph. 
+system ("gnuplot $file.scr") == 0 or die "ERROR: while ploting graph.\nMake sure that gnuplot is working properly"; diff --git a/lustre-iokit/sgpdd-survey/.cvsignore b/lustre-iokit/sgpdd-survey/.cvsignore new file mode 100644 index 0000000000000000000000000000000000000000..282522db0342d8750454b3dc162493b5fc709cc8 --- /dev/null +++ b/lustre-iokit/sgpdd-survey/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre-iokit/sgpdd-survey/Makefile.am b/lustre-iokit/sgpdd-survey/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..44e4cd8f00c3074d920c3b12f84cc9f02eab0aa0 --- /dev/null +++ b/lustre-iokit/sgpdd-survey/Makefile.am @@ -0,0 +1,3 @@ +bin_SCRIPTS = plot-sgpdd sgpdd-survey +CLEANFILE = $(bin_SCRIPTS) +EXTRA_DIST = README.sgpdd-survey plot-sgpdd sgpdd-survey diff --git a/lustre-iokit/sgpdd-survey/README.sgpdd-survey b/lustre-iokit/sgpdd-survey/README.sgpdd-survey new file mode 100644 index 0000000000000000000000000000000000000000..c1f3337bbd09e31bdfafd32084ae3020ece2ec5d --- /dev/null +++ b/lustre-iokit/sgpdd-survey/README.sgpdd-survey @@ -0,0 +1,86 @@ +WARNING: Running sgp_dd will ERASE the contents of the disk devices. + This is NOT to be run on any OST where you care about any data + or you are not expecting to reformat the filesystem afterward. + +Requirements +------------ + +. sg3_utils (for sgp_dd) + SCSI device + Or, if using non-scsi disk + raw device support + sg3_utils + +Overview +-------- + +This survey may be used to characterise the performance of a SCSI device. +It simulates an OST serving multiple stripe files. The data gathered by it +can help set expectations for the performance of a lustre OST exporting the +device. + +The script uses sgp_dd to do raw sequential disk I/O. It runs with +variable numbers of sgp_dd threads to show how performance varies with +different request queue depths. 
+ +The script spawns variable numbers of sgp_dd instances, each reading or +writing a separate area of the disk to show how performance varies with the +number of concurrent stripe files. + +The device(s) used must meet one of two tests: +SCSI device: + Must appear in the output of 'sg_map' + (make sure the kernel module "sg" is loaded) +Raw device: + Must appear in the output of 'raw -qa' + + If you need to create raw devices in order to use this tool, note that + raw device 0 can not be used due to a bug in certain versions of the + "raw" utility (including that shipped with RHEL4U4.) + +You may not mix raw and SCSI devices in the test specification. + +Running +------- + +The script must be customised according to the particular device under test +and where it should keep its working files. Customisation variables are +described clearly at the start of the script. + +e.g.: scsidevs=/dev/sda size=128 crghi=16 thrhi=32 ./sgpdd-survey + +When the script runs, it creates a number of working files and a pair of +result files. All files start with the prefix given by ${rslt}. + +${rslt}_<date/time>.summary same as stdout +${rslt}_<date/time>_* tmp files +${rslt}_<date/time>.detail collected tmp files for post-mortem + +The summary file and stdout contain lines like... + +total_size 8388608K rsz 1024 thr 1 crg 1 180.45 MB/s 1 x 180.50 = 180.50 MB/s + +The number immediately before the first MB/s is the bandwidth computed by +measuring total data and elapsed time. The other numbers are a check on +the bandwidths reported by the individual sgp_dd instances. + +If there are so many threads that sgp_dd is unlikely to be able to allocate +I/O buffers, "ENOMEM" is printed. + +If not all the sgp_dd instances successfully reported a bandwidth number +"failed" is printed. + +Visualising Results +------------------- + +I've found it most useful to import the summary data (it's fixed width) +into Excel (or any graphing package) and graph bandwidth v. 
# threads for +varying numbers of concurrent regions. This shows how the device performs +with varying queue depth. If the series (varying numbers of concurrent +regions) all seem to land on top of each other, it shows the device is +phased by seeks at the given record size. + +The included script "plot-sgpdd" will process output files and create +.dat (similar to csv) files for spreadsheet import. It also plots the +results directly using gnuplot and creates .png files. + diff --git a/lustre-iokit/sgpdd-survey/plot-sgpdd b/lustre-iokit/sgpdd-survey/plot-sgpdd new file mode 100755 index 0000000000000000000000000000000000000000..fcd4bda736dfab013a99d2086be1c7b1cb706783 --- /dev/null +++ b/lustre-iokit/sgpdd-survey/plot-sgpdd @@ -0,0 +1,289 @@ +#!/usr/bin/perl -w +# Report generation for plot-sgpdd +# ================================ +# The plot-sgpdd script is used to generate csv file and +# instructions files for gnuplot from the output of sgpdd-survey script. +# +# The plot-sgpdd also creates .scr file that contains instructions +# for gnuplot to plot the graph. After generating .dat and .scr files this +# script invokes gnuplot to display graph. +# +# Syntax: +# $ sgpdd-survey > log_filename +# $ plot-sgpdd <log_filename> +# [Note: 1. This script may need modifications whenever there will be +# modifications in output format of sgpdd-survey script. +# 2. Gnuplot version 4.0 or above is required.] + +sub usage() +{ + print STDERR "Usage: $0 [options] <log_filename>\n"; + print STDERR " $0 parses and plots graphs from the output of sgpdd-survey\n"; + print STDERR " It generates text data files (.dat) and graphs (.png) using gnuplot.\n"; + print STDERR "options:\n"; + print STDERR " --rt: Subtitle for read graphs\n"; + print STDERR " --wt: Subtitle for write graphs\n"; + print STDERR " --y: Y-axis scale\n"; + print STDERR "e.g. # $0 --rt=\"no prefetch\" --wt=\"WB disabled\" --y=500 sgpdd.summary\n"; + exit 1; +} + +# check whether gnuplot exists? 
+system ("which gnuplot > /dev/null") == 0 or die "gnuplot does not exist, please install it and try again.\n"; + +# check whether gnuplot supports png +$pngsupport = "ldd `which gnuplot` | grep -q libpng"; +system ("$pngsupport") == 0 or die "gnuplot installed does not support png. + Please install gnuplot to support png and try again.\n"; + +my @GraphTitle; + +#Subroutine to write .scr file that further used by gnuplot to plot the graph. +sub write_scr_file() { + my $op = $_[0]; + print "generating plot $file-$rsz-$op.png\n"; + open ( SCRFILE, "> $file-$rsz-$op.scr" ) or die "Can't open scr file for writing"; + if ($op eq "rd") { + $rwlabel = "Read"; + } + if ($op eq "wr") { + $rwlabel = "Write"; + } + + if ($opt_rdtitle || $opt_wrtitle) { + if ($op eq "rd") { + print SCRFILE "set title \"@GraphTitle\\n$rwlabel, Rsize = $rsz KBytes, $opt_rdtitle\"\n"; + } + if ($op eq "wr") { + print SCRFILE "set title \"@GraphTitle\\n$rwlabel, Rsize = $rsz KBytes, $opt_wrtitle\"\n"; + } + } else { + print SCRFILE "set title \"@GraphTitle\\n$rwlabel, Rsize = $rsz KBytes\"\n"; + } + print SCRFILE "set xlabel \"Threads\"\n"; + print SCRFILE "set ylabel \"Speeds(MB/s)\"\n"; + print SCRFILE "set logscale x\n"; + print SCRFILE "set grid\n"; + print SCRFILE "set terminal png\n"; + print SCRFILE "set output \"/dev/null\"\n"; + if ($opt_y != 0) { + print SCRFILE "set yrange [ 0:$opt_y ]\n"; + } else { + print SCRFILE "set yrange [ 0: ]\n"; + } + + my $plot = "plot"; + $i = 2; + my @numrgs = split " ", $regions; + $xrange = 1; + # generate instructions for gnuplot, with adjusting X-axes ranges + + foreach my $j (sort numerically split " ", $threads) { + if ($op eq "wr") { + $using = ( $i < $#numrgs ) ? $i : $#numrgs; + printf SCRFILE "$plot \"$file-$rsz-$op.dat\" using 1:$using axes x%dy1 title \"write-obj$j\" with line\n", $xrange; + } + if ($op eq "rd") { + $using = ( $i < $#numrgs ) ? 
$i : $#numrgs; + printf SCRFILE "$plot \"$file-$rsz-$op.dat\" using 1:$using axes x%dy1 title \"read-obj$j\" with line\n", $xrange; + } + $i++; + $plot = "replot"; + } + print SCRFILE "set output \"$file-$rsz-$op.png\"\n"; + print SCRFILE "replot\n"; + close SCRFILE; + # invoke gnuplot to display graph. + system ("gnuplot $file-$rsz-$op.scr") == 0 or die "ERROR: while ploting graph"; + system ("rm $file-$rsz-$op.scr"); +} + +sub check_data_file () { + my $file=shift; + my @values; + my @tmp; + + open ( FILE, "< $file" ) or die "Can't open $file for reading"; + while ( <FILE> ) { + @tmp = split; + push @values, [ @tmp ]; + } + close FILE; + + for ( $j = 0; $j <= $#tmp; $j++) { + my $sum=0; + for ($i = 2; $i <= $#values ; $i ++) { + $values [$i][$j] =~ "-" or $sum = $sum + $values [$i][$j]; + } + die "File: $file : $j column contains no data.\n" unless $sum != 0; + } +} + +sub numerically { $a <=> $b; } + +#Subroutine to write .dat file that further used by gnuplot to plot the graph. +sub write_dat_file() { + my $op = $_[0]; + print "writing data $file-$rsz-$op.dat\n"; + # Open .csv/.dat file for writing required columns from log file. 
+ my $datafile = "$file-$rsz-$op.dat"; + open ( DATAFILE, "> $datafile" ) or die "Can't open csv $datafile for writing"; + printf DATAFILE "%-6s", "0"; + + foreach my $j (sort numerically split " ", $regions) { + printf DATAFILE "%-8s", "$op$j"; + } + + # threads, line [7], strings + foreach my $i (sort numerically split " ", $threads) { + printf DATAFILE "\n%-6s", $i; + + # regions, line [5], column + foreach my $j (sort numerically split " ", $regions) { + if (($op eq "rd" && $rdwr) || ($op eq "wr" && $wrrd) || ($readop) || ($writeop)) { + if ( $out{$i}{$j} ) { + printf DATAFILE "%-8s", $out{$i}{$j}; + } else { + printf DATAFILE "%-8s", "-"; + } + } else { + if (($j <= 1 && $out{$i}{$j - 1})) { + printf DATAFILE "%-8s", $out{$i}{$j - 1}; + }elsif ($out{$i}{$j + 1} && $j > 1) { + printf DATAFILE "%-8s", $out{$i}{$j + 1}; + } else { + printf DATAFILE "%-8s", "-"; + } + } + } + } + close DATAFILE; + &check_data_file ( $datafile ); +} + +if ( !$ARGV[0] ) { + usage(); +} +$regions = ""; +$threads = ""; +$count = 0; +$wrrd = 0; +$rdwr = 0; +$writeop = 0; +$readop = 0; +$rsz = 0; +$opt_rdtitle = ""; +$opt_wrtitle = ""; +$opt_y = 0; +# Command line parameter parsing +use Getopt::Long; +GetOptions ('help' => \$opt_help, 'rt=s' => \$opt_rdtitle, 'wt=s' => \$opt_wrtitle, 'y=i' => \$opt_y) or usage(); +if ($opt_help) { + usage(); +} +$file = $ARGV[0]; + +open ( PFILE, "$file") or die "Can't open $file"; +LABEL: while ( <PFILE> ) { + chomp; + @line = split( /\s+/ ); + if ($line[27] && $count != 0) { + print "invalid file format\n"; + exit 1; + } + if ($count == 0) { + @GraphTitle = @line; + $count++; + next LABEL; + } + if ($line[8]) { + if ($line[8] eq "ENOMEM") { + next LABEL; + } + } + if (!$rsz && $line[3]) { + $rsz = $line[3]; + } + if ($rsz != $line[3]) { + if($readop) { + &write_dat_file("rd"); + &write_scr_file("rd"); + } + if($writeop) { + &write_dat_file("wr"); + &write_scr_file("wr"); + } + if ($wrrd || $rdwr) { + &write_dat_file("rd"); + &write_scr_file("rd"); + 
&write_dat_file("wr"); + &write_scr_file("wr"); + } + $rsz = $line[3]; + $regions = ""; + $threads = ""; + } + #print "rg$line[5] th$line[7] w$line[9] r$line[$rindex]\n"; + $rindex = 18; + if ($line[18]) { + if ($line[10] eq "failed") { + $rindex = 12; + } + if ($line[8] eq "write" && $line[17] eq "read") { + $wrrd = 1; + } + if ($line[8] eq "read" && $line[17] eq "write") { + $rdwr = 1; + } + } else { + if ($line[8] eq "write" && $line[9]) { + $writeop = 1; + } + if ($line[8] eq "read" && $line[9]) { + $readop = 1; + } + + } + if ($wrrd || $rdwr) { + $out{$line[7]}{$line[5]} = $line[9]; + if ($line[$rindex+1]) { + if (!($line[$rindex+1] eq "failed")) { + goto LABEL2; + } + } else { +LABEL2: if ($line[5] <= 1 ) { + $out{$line[7]}{$line[5] - 1} = $line[$rindex]; + } else { + $out{$line[7]}{$line[5] + 1} = $line[$rindex]; + } + } + } + if ($writeop) { + $out{$line[7]}{$line[5]} = $line[9]; + } + if ($readop) { + $out{$line[7]}{$line[5]} = $line[9]; + } + $regions .= " $line[5]" unless $regions =~ $line[5]; + $threads .= " $line[7]" unless $threads =~ $line[7]; + $count++; +} +close PFILE; +if ($count > 1 && $rsz) { + if($readop) { + &write_dat_file("rd"); + &write_scr_file("rd"); + } + if($writeop) { + &write_dat_file("wr"); + &write_scr_file("wr"); + } + if ($wrrd || $rdwr) { + &write_dat_file("rd"); + &write_scr_file("rd"); + &write_dat_file("wr"); + &write_scr_file("wr"); + } +} else { + print "Invalid log file format\n"; +} diff --git a/lustre-iokit/sgpdd-survey/sgpdd-survey b/lustre-iokit/sgpdd-survey/sgpdd-survey new file mode 100755 index 0000000000000000000000000000000000000000..db40392f5a136bcc4768a73e74de870d468b079e --- /dev/null +++ b/lustre-iokit/sgpdd-survey/sgpdd-survey @@ -0,0 +1,212 @@ +#!/bin/bash + +###################################################################### +# customize per survey + +# CHOOSE EITHER scsidevs or rawdevs +# the SCSI devices to measure - WARNING: will be erased. 
+# The raw devices to use +# rawdevs=${rawdevs:-"/dev/raw/raw1"} +# scsidevs=`ls /dev/sd[a-z] /dev/sd[a-z][a-z]` # all devices, if you use udev + +# result file prefix. date/time+hostname makes unique +# NB ensure the path exists if it includes subdirs +rslt_loc=${rslt_loc:-"/tmp"} +rslt=${rslt:-"$rslt_loc/sgpdd_survey_`date +%F@%R`_`uname -n`"} + +# what to do (read or write) +actions=${actions:-"write read"} + +# total size per device (MBytes) +# NB bigger than device cache is good +size=${size:-8192} + +# record size (KBytes) +rszlo=${rszlo:-1024} +rszhi=${rszhi:-1024} + +# Concurrent regions per device +crglo=${crglo:-1} +crghi=${crghi:-256} + +# threads to share between concurrent regions per device +# multiple threads per region simulates a deeper request queue +# NB survey skips over #thr < #regions and #thr/#regions > SG_MAX_QUEUE +thrlo=${thrlo:-1} +thrhi=${thrhi:-4096} + +##################################################################### +# leave the rest of this alone unless you know what you're doing... + +# and max # threads one instance will spawn +SG_MAX_QUEUE=16 + +# is the sg module loaded? +sg_is_loaded=$(grep -q "^sg " /proc/modules && echo true || echo false) + +# did we load it? +sg_was_loaded=false + +# map given device names into SG device names +i=0 +devs=() +if [ "$scsidevs" ]; then + # we will test for a LUN, the test for a partition + # if the partition number is > 9 this will fail + + # make sure sg kernel module is loaded + if ! 
$sg_is_loaded; then + echo "loading the sg kernel module" + modprobe sg && sg_was_loaded=true + sg_is_loaded=true + fi + + for d in $scsidevs; do + devs[$i]=`sg_map | awk "{if (\\\$2 == \"$d\") print \\\$1}"` + if [ -z "${devs[i]}" ]; then + echo "Can't find SG device for $d, testing for partition" + pt=`echo $d | sed 's/[0-9]$//'` + # Try again + devs[$i]=`sg_map | awk "{if (\\\$2 == \"$pt\") print \\\$1}"` + if [ -z "${devs[i]}" ]; then + echo -e "Can't find SG device $pt.\nDo you have the sg module configured for your kernel?" + exit 1 + fi + fi + i=$((i+1)) + done +elif [ "$rawdevs" ]; then + for r in $rawdevs; do + RES=`raw -q $r` + if [ $? -eq 0 ];then + devs[$i]=$r + i=$((i+1)) + else + echo "Raw device $r not set up" + exit 1 + fi + done +else + echo "Must specify scsidevs or rawdevs" + exit 1 +fi + +ndevs=${#devs[@]} + +# determine block size. This should also work for raw devices +# If it fails, set to 512 +bs=$((`sg_readcap -b ${devs[0]} | awk '{print $2}'`)) +if [ $bs == 0 ];then + echo "sg_readcap failed, setting block size to 512" + bs=512 +fi +rsltf=${rslt}.summary +workf=${rslt}.detail +echo -n > $rsltf +echo -n > $workf + +print_summary () { + if [ "$1" = "-n" ]; then + minusn=$1; shift + else + minusn="" + fi + echo $minusn "$*" >> $rsltf + echo $minusn "$*" +} + +print_summary "$(date) sgpdd-survey on $rawdevs$scsidevs from $(hostname)" + +for ((rsz=$rszlo;rsz<=$rszhi;rsz*=2)); do + for ((crg=$crglo;crg<=$crghi;crg*=2)); do + for ((thr=$thrlo;thr<=$thrhi;thr*=2)); do + if ((thr < crg || thr/crg > SG_MAX_QUEUE)); then + continue + fi + # compute parameters + bpt=$((rsz*1024/bs)) + blocks=$((size*((1024*1024)/bs)/crg)) + count=$blocks + # show computed parameters + actual_rsz=$((bpt*bs/1024)) + actual_size=$((bs*count*crg/1024)) + str=`printf 'total_size %8dK rsz %4d crg %5d thr %5d ' \ + $((actual_size*ndevs)) $actual_rsz $((crg*ndevs)) $((thr*ndevs))` + echo "==============> $str" >> $workf + print_summary -n "$str" + freemem=`awk < /proc/meminfo 
'/^MemTotal:/ {printf "%d\n", $2}'` + if (((actual_rsz*thr/crg + 64)*crg*ndevs > freemem)); then + print_summary "ENOMEM" + continue + fi + # run tests + for action in $actions; do + print_summary -n "$action " + echo "=====> $action" >> $workf + tmpf=${workf}_tmp + # start test + t0=`date +%s.%N` + for ((i=0;i<ndevs;i++)); do + dev=${devs[i]} + devsize=$((bs*`sg_readcap -b ${dev} | awk '{print $1}'`/1024)) + if [ $devsize -lt $actual_size ]; then + _dev=$(sg_map | grep $dev | awk '{ print $2; }') + echo -e "device $_dev not big enough: $devsize <" \ + "$actual_size.\nConsider reducing \$size" + exit 1 + fi + if [ $action = read ]; then + inf="if=$dev" + outf="of=/dev/null" + skip=skip + else + inf="if=/dev/zero" + outf="of=$dev" + skip=seek + fi + for ((j=0;j<crg;j++)); do + sgp_dd 2> ${tmpf}_${i}_${j} \ + $inf $outf ${skip}=$((1024+j*blocks)) \ + thr=$((thr/crg)) count=$count bs=$bs bpt=$bpt time=1& + done + done + wait + t1=`date +%s.%N` + # collect/check individual stats + echo > $tmpf + ok=0 + for ((i=0;i<ndevs;i++)); do + for ((j=0;j<crg;j++)); do + rtmp=${tmpf}_${i}_${j} + if grep 'error' $rtmp > /dev/null 2>&1; then + echo "Error found in $rtmp" + elif grep 'time to transfer data' $rtmp > /dev/null 2>&1; then + ok=$((ok + 1)) + fi + cat ${rtmp} >> $tmpf + cat ${rtmp} >> $workf + rm ${rtmp} + done + done + if ((ok != ndevs*crg)); then + print_summary -n "$((ndevs*crg - ok)) failed " + else + # compute MB/sec from elapsed + bw=`awk "BEGIN {printf \"%7.2f MB/s\", $actual_size * $ndevs / (( $t1 - $t0 ) * 1024); exit}"` + # compute MB/sec from nregions*slowest + check=`awk < $tmpf \ + '/time to transfer data/ {mb=$8/1.048576; if (n == 0 || mb < min) min = mb; n++}\ + END {printf "%5d x %6.2f = %7.2f MB/s", n, min, min * n}'` + print_summary -n "$bw $check " + fi + rm $tmpf + done + print_summary "" + done + done +done + +if $sg_was_loaded; then + echo "unloading sg module" + rmmod sg +fi diff --git a/lustre-iokit/stats-collect/.cvsignore 
b/lustre-iokit/stats-collect/.cvsignore new file mode 100644 index 0000000000000000000000000000000000000000..282522db0342d8750454b3dc162493b5fc709cc8 --- /dev/null +++ b/lustre-iokit/stats-collect/.cvsignore @@ -0,0 +1,2 @@ +Makefile +Makefile.in diff --git a/lustre-iokit/stats-collect/Makefile.am b/lustre-iokit/stats-collect/Makefile.am new file mode 100644 index 0000000000000000000000000000000000000000..3cb3f611ac9d78c0e3f12ad76b4b4e6a87830e85 --- /dev/null +++ b/lustre-iokit/stats-collect/Makefile.am @@ -0,0 +1,3 @@ +bin_SCRIPTS = config.sh gather_stats_everywhere.sh lstats.sh +CLEANFILE = $(bin_SCRIPTS) +EXTRA_DIST = README.lstats.sh config.sh gather_stats_everywhere.sh lstats.sh diff --git a/lustre-iokit/stats-collect/README b/lustre-iokit/stats-collect/README new file mode 100644 index 0000000000000000000000000000000000000000..60270dd89b842a8c6d1b5a05e959281c32094f09 --- /dev/null +++ b/lustre-iokit/stats-collect/README @@ -0,0 +1,70 @@ +Overview +-------- +These scripts will be used to collect application profiling info from +lustre clients and servers. It will be run on a single (control) +node, and collect all the profile info and create a tarball on the +control node. + +lstat.sh : script for single node, will be run on each profile node. +gather_stats_everywhere.sh : collect stats script. +config.sh : customized configuration description + +Requirements +------- +1) Lustre is installed and setup on your cluster. +2) ssh/scp to these nodes works without requiring a password. 
+ +Configuration +------ +Configuration is very simple for this script - all of the profiling +config VARs are in config.sh + +XXX_INTERVAL: the profiling interval +where value of interval means: + 0 - gather stats at start and stop only + N - gather stats every N seconds +if XXX_INTERVAL isn't specified, XXX stats won't be collected +XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO, JBD, CLIENT + + +Running +-------- +The gather_stats_everywhere.sh should be run in three phases: + + a)sh gather_stats_everywhere.sh config.sh start + It will start stats collection on each node specified in config.sh + + b)sh gather_stats_everywhere.sh config.sh stop <log_name.tgz> + It will stop collecting stats on each node. If <log_name.tgz> is provided, + it will create a profile tarball /tmp/<log_name.tgz> + + c)sh gather_stats_everywhere.sh config.sh analyse log_tarball.tgz csv + It will analyse the log_tarball and create a csv tarball for this + profiling tarball. + + +Example +------- +When you want to collect your profile info, you should + 1) start the collect profile daemon on each node. + sh gather_stats_everywhere.sh config.sh start + + 2) run your test. + + 3) stop the collect profile daemon on each node, clean up the tmp + file and create a profiling tarball. + sh gather_stats_everywhere.sh config.sh stop log_tarball.tgz + + 4) create a csv file according to the profile. + sh gather_stats_everywhere.sh config.sh analyse log_tarball.tgz csv + + +TBD +------ +Add liblustre profiling support and add more options for analyse. + + + + + + diff --git a/lustre-iokit/stats-collect/README.lstats.sh b/lustre-iokit/stats-collect/README.lstats.sh new file mode 100644 index 0000000000000000000000000000000000000000..28e4d12d8a965686c617aba24b9eaa1d507a4cb2 --- /dev/null +++ b/lustre-iokit/stats-collect/README.lstats.sh @@ -0,0 +1,75 @@ +Overview +-------- +These scripts will be used to collect profile info of lustre client and server. 
+They will be run on a single (control) node, and collect all the profile info and +create a tarball on the control node. + +lstats.sh : The stat script for a single node, which will be run on each profile + node. +gather_stats_everywhere.sh : collect stats script. +config.sh : the config for gather_stats_everywhere.sh. + +Requirements +------- +1) Lustre is installed and set up on your profiling cluster. +2) ssh/scp to these node names works without requiring a password. + +Configuration +------ +Configuration is very simple for this script; all of the profiling config VARs are +in config.sh + +XXX_INTERVAL: the profiling interval +where value of interval means: + 0 - gather stats at start and stop only + N - gather stats every N seconds +if some XXX_INTERVAL isn't specified, related stats won't be collected +XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO, JBD, CLIENT + +As for ior-collect-stat.sh, you can modify the various IOR and MPI +parameters inside ior-collect-stat.sh + +Running +-------- +1) The gather_stats_everywhere.sh will be run in three modes + + a)sh gather_stats_everywhere.sh config.sh start + It will start collecting stats on each node provided in config.sh + + b)sh gather_stats_everywhere.sh config.sh stop <log_name> + It will stop collecting stats on each node. If <log_name> is provided, + it will create a profile tarball /tmp/<log_name>.tar.gz. + + c)sh gather_stats_everywhere.sh config.sh analyse log_tarball.tar.gz csv + It will analyse the log_tarball and create a csv tarball for this + profiling tarball. + +2) The ior-collect-stat.sh will be run as + sh ior-collect-stat.sh start <profile> + It will create an ior result csv file. If <profile> is provided, + the detailed profile info tarball will be created under /tmp. + +Example +------- +When you want to collect your profile info, you should + 1)sh gather_stats_everywhere.sh config.sh start + #start the collect profile daemon on each node. + + 2)run your test. 
+ + 3)sh gather_stats_everywhere.sh config.sh stop log_tarball + #stop the collect profile daemon on each node, cleanup + the tmp file and create a profiling tarball. + + 4)sh gather_stats_everywhere.sh config.sh analyse log_tarball.tar.gz csv + #create a csv file according to the profile. + +TBD +------ +Add liblustre profiling support and add more options for analyse. + + + + + + diff --git a/lustre-iokit/stats-collect/config.sh b/lustre-iokit/stats-collect/config.sh new file mode 100755 index 0000000000000000000000000000000000000000..b346dcd6801dca5e21bbb897009b64ea76dfa07f --- /dev/null +++ b/lustre-iokit/stats-collect/config.sh @@ -0,0 +1,56 @@ +# +# system configuration. Set these variables to point to the locations +# of various system utilities. +# +AWK=/usr/bin/awk +XARGS=/usr/bin/xargs + +# +# control debug output. set PRINT_INFO_MSGS=1 to see additional messages +# set PRINT_DEBUG_MSGS=1 to see debug messages +# +PRINT_INFO_MSGS=0 +PRINT_DEBUG_MSGS=0 + +# +# TARGETS: set this variable to the list of nodes you want to +# gather stats from +# +# Example: +# +PERCH_MDS_LIST="nid00135" +PERCH_OST_LIST="nid00128 nid00131 nid00136 nid00139 nid00008 nid00011 nid00012" + +MDS_LIST=${PERCH_MDS_LIST} +OST_LIST=${PERCH_OST_LIST} +export TARGETS="${MDS_LIST} ${OST_LIST}" + +#script var +#case $TARGET in +# oss*) +# VMSTAT_INTERVAL=0 +# SERVICE_INTERVAL=2 +# SDIO_INTERVAL=0 +# ;; +# client*) ALEX_SCRIPT_CLIENT_VAR1="hello!" 
+# ;; +#esac + +#FIXME: diff these parameters according to client/MDS/OSS +VMSTAT_INTERVAL=${VMSTAT_INTERVAL:-1} +SERVICE_INTERVAL=${SERVICE_INTERVAL:-0} +SDIO_INTERVAL=${SDIO_INTERVAL:-0} +BRW_INTERVAL=${BRW_INTERVAL:-0} +MBALLOC_INTERVAL=${MBALLOC_INTERVAL:-0} +IO_INTERVAL=${IO_INTERVAL:-1} +JBD_INTERVAL=${JBD_INTERVAL:-1} + +#some environment var +TMP=${TMP:-"/tmp"} +SCRIPT=${SCRIPT:-"lstats.sh"} +#Remote ssh script +DSH=${DSH:-ssh} +DCP=${DCP:-scp} +USER="" +TAR=${TAR:-tar -zcvf} + diff --git a/lustre-iokit/stats-collect/gather_stats_everywhere.sh b/lustre-iokit/stats-collect/gather_stats_everywhere.sh new file mode 100755 index 0000000000000000000000000000000000000000..72791095ad5312429c3993b4000828eccfc13e5c --- /dev/null +++ b/lustre-iokit/stats-collect/gather_stats_everywhere.sh @@ -0,0 +1,565 @@ +#!/bin/sh + +######################################################################### +# gather_stats_everywhere: +# script on a selection of nodes and collect all the results into a single +# tar ball +# +# Copyright (c) 2007 - Cluster File Systems, Inc. +######################################################################### + +error() { + echo "ERROR: $0: $@" +} + +warning() { + echo "WARNING: $@" +} + +info () { + if [ ${PRINT_INFO_MSGS} -gt 0 ] + then + echo "INFO: $@" + fi +} + +debug () { + if [ ${PRINT_DEBUG_MSGS} -gt 0 ] + then + echo "DEBUG: $@" + fi +} + +usage() { + printf $"Usage: gather_stats_everywhere [-help] config_file [start|stop|cleanup] <log_name>\n" + if [ x$1 = x-h ] + then + printf $" +The distribution script will run on a single node. It is parameterised +with a set of target node names. It may assume ssh/scp to these node +names works without requiring a password. It will run in 2 modes... + +gather_stats_everywhere config_file start + +...will copy the script to /tmp everywhere described in +config_file running on all the target hosts. And... 
+ +gather_stats_everywhere config_file stop log_name + +...will stop script running on all the hosts it started on and collect +all the individual stats files into a single compressed tarball if the log_name is +provided. + +The config file is just a list of shell variable assignments that can be +customised. + +Serveral variables must be set in the config file + +Targets: the nodes where run the script. +" + exit 0 + else + exit 1 + fi +} + +options=`getopt -o h --long help:: -- "$@"` + +if [ $? -ne 0 ] +then + usage +fi + +eval set -- "$options" + +while true +do + case "$1" in + -h) + usage -h ;; + --help) + usage -h ;; + --) + shift + break ;; + esac +done + +if [ $# != 2 -a $# != 3 ] ; then + usage +fi + +CONFIG=$1 +OPTION=$2 +shift +shift + +GLOBAL_TIMESTAMP="" + +if [ ! -r $CONFIG ]; then + error "Config_file: $CONFIG does not exist " + exit 1 +fi + +. $CONFIG + +if [ -z "$SCRIPT" ]; then + error "SCRIPT in ${CONFIG} is empty" + exit 1 +fi + +if [ -z "$TARGETS" ]; then + error "TARGETS in ${CONFIG} is empty" + exit 1 +fi + +#check nodes accessiable +Check_nodes_available() { + local NODES_NOT_AVAILABLE="" + + debug "Entering Check_nodes_available()" + + for TARGET in $TARGETS; do + if ! ping -c 1 -w 3 $TARGET > /dev/null; then + NODES_NOT_AVAILABLE=$NODES_NOT_AVAILABLE$TARGET + fi + done + + if [ -z "$NODES_NOT_AVAILABLE" ]; then + debug "Check_nodes_available() returning 0 (success - all nodes available)" + return 0 + fi + + error "Check_nodes_available: these nodes are not available (did not respond to pings): ${NODES_NOT_AVAILABLE}" + debug "Check_nodes_available() returning with errors" + + return 1 +} + +if ! 
Check_nodes_available; then + error "not all the nodes are available" + exit 1 +fi + +# +# returns 1 if copies of lstats are found running on any of the $TARGETS nodes +# +Nodes_are_not_clean() { + local DIRTY_NODES="" + + debug "Entering Nodes_are_not_clean()" + + # check whether there are running threads on the targets + for TARGET in $TARGETS; do + ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"` + if [ -n "$ps_str" ]; then + DIRTY_NODES="${DIRTY_NODES} ${TARGET}" + fi + done + + if [ -n "$DIRTY_NODES" ]; then + debug "Nodes_are_not_clean() returning 1" + return 1 + fi + + debug "Nodes_are_not_clean() returning 0" + return 0 +} + +Clean_nodes() { + + debug "Entering Clean_nodes()" + + # + # if debugging is enabled, show lists of lstats processes + # still running on the target nodes before the clean operation + # + if [ ${PRINT_DEBUG_MSGS} -gt 0 ] + then + for TARGET in $TARGETS; do + debug "List of processes which need to be cleaned up on ${TARGET}:" + $DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}" + debug "List of pids which need to be cleaned up on ${TARGET}:" + $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }'" + done + fi + + # + # do the actual cleanup + # kill any old lstats processes still running on the target nodes + # + for TARGET in $TARGETS; do + + ps_str=`$DSH $TARGET "ps aux | grep -v grep | grep ${SCRIPT}-${TARGET}"` + if [ -n "$ps_str" ]; then + debug "cleaning node ${TARGET}" + $DSH $TARGET "ps aux | grep ${SCRIPT}-${TARGET} | grep -v grep | ${AWK} '{ print \$2 }' | ${XARGS} kill" + fi + done + + debug "Leaving Clean_nodes()" + return 0 +} + +copy_target_script() { + local target=$1 + + debug "Entering copy_target_script()" + + #copy alex's run scripts to the target + copy_cmd="$DCP $SCRIPT ${USER}${target}:$TMP/${SCRIPT}-${target}" + ${copy_cmd} 1>/dev/null 2>&1 + if [ ${PIPESTATUS[0]} != 0 ]; then + echo "copy command failed: ${copy_cmd}" 2>&1 + debug "Leaving 
copy_target_script() (error return)" + return 1 + fi + + echo "$SCRIPT copied to ${USER}${target} (into $TMP)" + debug "Leaving copy_target_script() (normal return)" + return 0 +} + +start_target_script() { + local target=$1 + + debug "Entering start_target_script()" + + if ! copy_target_script $target; then + echo "copy_target_script $target failed." 2>&1 + debug "Leaving start_target_script() (error return)" + return 1 + fi + + #run the script on the target + $DSH ${USER}${target} "VMSTAT_INTERVAL=${VMSTAT_INTERVAL} \ + SDIO_INTERVAL=${SDIO_INTERVAL} \ + SERVICE_INTERVAL=${SERVICE_INTERVAL} \ + BRW_INTERVAL=${BRW_INTERVAL} \ + JBD_INTERVAL=${JBD_INTERVAL} \ + IO_INTERVAL=${IO_INTERVAL} \ + MBALLOC_INTERVAL=${MBALLOC_INTERVAL} \ + sh ${TMP}/${SCRIPT}-${target} start \ + 1> /dev/null 2>/dev/null </dev/null" + + if [ ${PIPESTATUS[0]} != 0 ]; then + echo "Start the ${SCRIPT} on ${target} failed" + debug "Leaving start_target_script() (error return)" + return 1 + fi + + echo "Start the ${SCRIPT} on ${target} success" + debug "Leaving start_target_script() (normal return)" + return 0 +} + +stop_target_script() { + local target=$1 + + debug "Entering stop_target_script()" + + #stop the target script first + $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} stop" 1>/dev/null 2>&1 + if [ ${PIPESTATUS[0]} != 0 ]; then + echo "stop the collecting stats script on ${target} failed" + debug "Leaving stop_target_script() (error return)" + return 1 + else + echo "stop the collecting stats script on ${target} success" + fi + + #remove those tmp file + $DSH ${USER}${target} "rm -rf $TMP/${SCRIPT}-${target}" 1>/dev/null 2>&1 + echo "cleanup ${target} tmp file after stop " + + debug "Leaving stop_target_script() (normal return)" + return 0 +} + +# +# create a unique timestamp-based name which we can use for +# naming files on all the $TARGET nodes. 
+# +# By creating one timestamp here on the master node, we avoid +# the problem of clock skew on the $TARGET nodes causing them +# to use different filenames than we expect (if their clocks are +# different from the clock on this node) +# +generate_timestamp() { + if [ "X${GLOBAL_TIMESTAMP}" = "X" ] + then + export GLOBAL_TIMESTAMP=`date +%F-%H.%M.%S` + debug "Global Timestamp Created: ${GLOBAL_TIMESTAMP}" + fi +} + +fetch_target_log() { + generate_timestamp + local target=$1 + local date=${GLOBAL_TIMESTAMP} + local target_log_name="stats-${target}-${date}" + + echo "Getting log: ${target_log_name}.tar.gz from ${target}" + $DSH ${USER}${target} "sh ${TMP}/${SCRIPT}-${target} fetch " \ + > $TMP/${target_log_name}.tar.gz + echo "Got log: ${target_log_name}.tar.gz from ${target}" + + echo "Moving $TMP/${target_log_name}.tar.gz to $TMP/$log_name" + mv $TMP/${target_log_name}.tar.gz $TMP/$log_name +} + +fetch_log() { + generate_timestamp + local log_name=${GLOBAL_TIMESTAMP} + local stat_tar_name=$1 + local -a pids_array + local -a clients_array + + debug "Entering fetch_log()" + + if ! mkdir -p $TMP/$log_name ; then + error "can not mkdir $log_name" + exit 1 + fi + + #retrive the log_tarball from remote nodes background + local n=0 + for TARGET in $TARGETS; do + (fetch_target_log ${TARGET}) & + pids_array[$n]=$! 
+ clients_array[$n]=$TARGET + + debug "fetch_log: spawned fetch_target_log process for ${TARGET} pid ${pids_array[$n]}" + let n=$n+1 + done + + local num_pids=$n + + #Waiting log fetch finished + for ((n=0; $n < $num_pids; n++)); do + debug "fetch_log(): waiting for pid ${pids_array[$n]}" + wait ${pids_array[$n]} + + # + # TODO: add check of exit status from wait() + # + done + + #compress the log tarball + cmd="$TAR ${stat_tar_name} $TMP/${log_name}" + echo "Creating compressed tar file ${stat_tar_name} from log files in $TMP/${log_name}" + ${cmd} 1>/dev/null 2>&1 + if [ ${PIPESTATUS[0]} == 0 ]; then + echo "removing temporary directory $TMP/${log_name}" + rm -rf $TMP/${log_name} + else + echo "Compressed logfiles are in $TMP/${stat_tar_name}" + fi + + debug "Leaving fetch_log()" +} + +stop_targets_script() { + local -a pids_array + local -a clients_array + local n=0 + + debug "Entering stop_targets_script()" + + for TARGET in $TARGETS; do + (stop_target_script ${TARGET}) & + pids_array[$n]=$! + clients_array[$n]=$TARGET + let n=$n+1 + done + local num_pids=$n + + #Waiting log fetch finished + for ((n=0; $n < $num_pids; n++)); do + if ! wait ${pids_array[$n]}; then + echo "${clients_array[$n]}: can not stop stats collect" + fi + done + + debug "Leaving stop_targets_script()" + +} + +gather_start() { + local -a pids_array + local -a clients_array + local n=0 + + debug "Entering gather_start()" + + #check whether the collect scripts already start in some targets + + Nodes_are_not_clean + ret=$? + + if [ $ret -gt 0 ] + then + warning "$SCRIPT already running in some targets, attempting cleanup..." + + Clean_nodes + + Nodes_are_not_clean + ret=$? + + if [ $ret -gt 0 ] + then + error "$SCRIPT automatic cleanup attempt failed." + error "$SCRIPT Please make sure lstats is no longer running on target nodes and try again." + debug "Error return from gather_start()" + return 1 + fi + fi + + for TARGET in $TARGETS; do + (start_target_script ${TARGET}) & + pids_array[$n]=$! 
+ clients_array[$n]=$TARGET + let n=$n+1 + done + + local num_pids=$n + + local RC=0 + #Waiting log fetch finished + for ((n=0; $n < $num_pids; n++)); do + if ! wait ${pids_array[$n]}; then + echo "${clients_array[$n]}: can not start stats collect" + let RC=$RC+1 + fi + done + + if [ $RC != 0 ]; then + stop_targets_script + fi + + debug "Leaving gather_start()" +} + +gather_stop() { + log=$1 + + debug "Entering gather_stop()" + + if [ -n "$log" ]; then + fetch_log $log + fi + + stop_targets_script + + debug "Leaving gather_stop()" +} + +get_end_line_num() +{ + local log_name=$1 + + ln=`grep -n snapshot_time ${log_name} | awk -F":" '{ln=$1;} END{print ln;}'` + total_ln=`wc ${log_name} | awk '{print $1}'` + + local endlen=$((${total_ln} - ${ln})) + echo $endlen +} + +get_csv() +{ + local logdir=$1 + local statf=$2 + + local statf_name=`basename ${statf}` + type_name=`echo ${statf_name} | awk -F "." '{print $3}'` + stat_name=`head -n 1 ${statf} | awk '{print $4}'` + stat_type=`head -n 1 ${statf} | awk '{print $1}'` + + #currently, it can only analyse client application log + if [ "$stat_type" != "client" ]; then + error "can not analyse ${statf} ......." 
+ exit 1 + fi + + #create the header + echo "${node_name}_${type_name}, ${stat_name}" \ + >> $logdir/analyse_${type_name}.csv + + #get total stats collection + end_len=`get_end_line_num ${statf}` + if [ $end_len != 1 -a $end_len != 0 ]; then + if [ "$type_name" != "osc-rpc_stats" ]; then + tail -n $end_len ${statf} | awk '{print $1 "," $2}' \ + >> $logdir/analyse_${type_name}.csv + else + tail -n $end_len ${statf} | \ + awk '/^[[:digit:]]/{print $1","$2","$6} \ + /^page/{print "page per rpc,read,write"} \ + /^rpcs/{print "rpcs,read,write"} \ + /^offset/{print "offset, read,write"}' \ + >> $logdir/analyse_${type_name}.csv + fi + fi +} + +gather_analyse() +{ + local log_tarball=$1 + local option=$2 + + debug "Entering gather_analyze()" + + #validating option + if [ -z "$log_tarball" -o -r "$option" ]; then + usage; + fi + + if [ ! -r $log_tarball ]; then + error " not exist $log_tarball " + return 1 + fi + + shift + + local date=`date +%F-%H-%M` + local logdir="analyse-${date}" + + mkdir -p ${TMP}/${logdir} + mkdir -p ${TMP}/${logdir}/tmp + + $UNTAR $log_tarball -C ${TMP}/${logdir}/tmp 1>/dev/null 2>&1 + for log_file in `find $TMP/$logdir/tmp`; do + if test -f $log_file; then + #get the node name + local file_name=`basename ${log_file}` + node_name=`echo ${file_name} | awk -F "-" '{print $2}'` + echo "analysing the sublog ...$log_file" + mkdir -p ${TMP}/${logdir}/${node_name} + mkdir -p ${TMP}/${logdir}/${node_name}/tmp + + $UNTAR $log_file -C ${TMP}/${logdir}/${node_name}/tmp 1>/dev/null 2>&1 + for statf in `find ${TMP}/${logdir}/${node_name}/tmp`; do + if test -f $statf ; then + if [ "$option" == "csv" ]; then + get_csv "$TMP/$logdir/${node_name}" "$statf" + fi + fi + done + rm -rf ${TMP}/${logdir}/${node_name}/tmp + fi + done + + rm -rf ${TMP}/${logdir}/tmp + $TAR ${TMP}/${logdir}.tar.gz ${TMP}/${logdir} 1>/dev/null 2>&1 + + echo "create analysed tarball ${TMP}/${logdir}.tar.gz" + + debug "Leaving gather_analyze()" +} + +case $OPTION in + start) gather_start ;; + 
stop) gather_stop $@;; + analyse) gather_analyse $@;; + *) error "Unknown option ${OPTION}" ; exit 1 +esac diff --git a/lustre-iokit/stats-collect/lstats.sh b/lustre-iokit/stats-collect/lstats.sh new file mode 100755 index 0000000000000000000000000000000000000000..217144d54bba0e607fa8195d0783cc8bad5e2858 --- /dev/null +++ b/lustre-iokit/stats-collect/lstats.sh @@ -0,0 +1,703 @@ +#!/bin/sh + +# +# very short example: +# +# to start collection: +# VMSTAT_INTERVAL=0 SERVICE_INTERVAL=2 SDIO_INTERVAL=0 lstats.sh start +# +# where value of interval means: +# 0 - gather stats at start and stop only +# N - gather stats every N seconds +# if some XXX_INTERVAL isn't specified, related stats won't be collected +# XXX can be: VMSTAT, SERVICE, BRW, SDIO, MBALLOC, IO, JBD +# +# to stop collection: +# lstats.sh stop +# +# to fetch collected stats: +# lstats.sh fetch >file +# in file you'll get a tarbal containing directory with stats +# directory's name consists of hostname and date, +# like: stats-bzzz-2007-05-13-22.52.31 +# + +# +# TODO +# - close all file descriptors, otherwise sshd can't finish session +# - for sd_iostats convert partition to whole device +# + +# configuration variables +TMP=${TMP:-/tmp} +PREFIX=${PREFIX:-${TMP}/lstats.} +PIDFILE=${PREFIX}pid +STATPIDS=${PREFIX}pids +OUTPREFIX=${OUTPREFIX:-${PREFIX}out.} +STIMEPREFIX=${STIMEPREFIX:-${PREFIX}time.} + + +function ls_grab_control() +{ + OCOMM=`ps -p $$ -o comm=` + if [ "$OCOMM" == "" ]; then + echo "Can't fetch process name" + exit + fi + + # check for running master first + PID=`cat $PIDFILE 2>/dev/null` +#echo "check master $PID" + if [ "x$PID" != "x" ]; then + COMM=`ps -p $PID -o comm=` + if [ "$COMM" == "$OCOMM" ]; then + echo "Master is already running by $PID" + return 1 + fi + fi + + # XXX: race -- two process can do this at same time, use rename instead + echo $$ >${PIDFILE}.$$ + mv ${PIDFILE}.$$ ${PIDFILE} + a=`cat ${PIDFILE}` + if [ "$$" != "$a" ]; then + echo "Some one $a won the race" + return 1 + fi 
+ + HAS_CONTROL="yes" +#echo "We've got control" + + return 0 + +} + +function ls_release_control() +{ +#echo "Release control" + + rm -f $PIDFILE +} + +trap ls_atexit EXIT +function ls_atexit() +{ + if [ "$HAS_CONTROL" != "" ]; then + ls_release_control + fi +} + + +function usr1signal() +{ + stop_collector=1 +} + +function idle_collector() +{ + while [ "$stop_collector" != "1" ]; do + sleep 100; + done +} + +# +# args: +# - type +# - collector function +# - collector arguments +function run_collector() +{ + local pid + local stime + local ctype=$1 + local cfunc=$2 + shift + shift + + read pid NN </proc/self/stat + stime=`ps -p $pid -o bsdstart=` + echo -n "$pid " >>$STATPIDS + echo -n "$stime" >>${STIMEPREFIX}${pid} + + trap "usr1signal" SIGUSR1 + +# echo "$pid: new collector $ctype $cfunc" + $cfunc $@ </dev/null >&${OUTPREFIX}${ctype}.${pid} + +} + +# +# vmstat collector +# +# VMSTAT_INTERVAL: +# - 0 - collect at start and stop only +# - N - collect each N seconds +function vmstat_collector() +{ + echo "vmstat " `date` + + if let "VMSTAT_INTERVAL==0"; then + date + vmstat + idle_collector + date + vmstat + elif let "VMSTAT_INTERVAL>0"; then + vmstat $VMSTAT_INTERVAL + else + echo "Invalid VMSTAT_INTERVAL=$VMSTAT_INTERVAL" + idle_collector + fi +} + +function vmstat_start() +{ + if [ "$VMSTAT_INTERVAL" == "" ]; then + return; + fi + + run_collector "vmstat" vmstat_collector & +} + +# +# brw_stats collector +# +# BRW_INVERVAL: +# - 0 - collect at start and stop only +# - N - collect each N seconds +# +function brw_collector() +{ + local filter=$1 + + echo "brw_* for $filter " `date` + + # clear old stats + for i in /proc/fs/lustre/obdfilter/${filter}/brw_*; do + echo 0 >$i + done + + if let "BRW_INTERVAL==0"; then + cat /proc/fs/lustre/obdfilter/${filter}/brw_* + idle_collector + cat /proc/fs/lustre/obdfilter/${filter}/brw_* + elif let "BRW_INTERVAL>0"; then + while [ "$stop_collector" != "1" ]; do + cat /proc/fs/lustre/obdfilter/${filter}/brw_* + sleep 
$BRW_INTERVAL + done + else + echo "Invalid BRW_INTERVAL=$BRW_INTERVAL" + idle_collector + fi +} + +function brw_start() +{ + if [ "$BRW_INTERVAL" == "" ]; then + return; + fi + + # find all obdfilters + for i in /proc/fs/lustre/obdfilter/*; do + filter=`basename $i` + if [ "$filter" == "num_refs" ]; then + continue; + fi + run_collector "brw" brw_collector $filter & + done +} + +# +# service_stats collector +# +# SERVICE_INVERVAL: +# - 0 - collect at start and stop only +# - N - collect each N seconds +# +function service_collector() +{ + local file=$1 + local target=$2 + local srv=$3 + + echo "service stats for ${target}/${srv} " `date` + + # clear old stats + echo 0 >$file + + if let "SERVICE_INTERVAL==0"; then + grep -v "^[^ ]*[^0-9]*0 samples" $file + idle_collector + grep -v "^[^ ]*[^0-9]*0 samples" $file + elif let "SERVICE_INTERVAL>0"; then + while [ "$stop_collector" != "1" ]; do + grep -v "^[^ ]*[^0-9]*0 samples" $file + sleep $SERVICE_INTERVAL + done + else + echo "Invalid SERVICE_INTERVAL=$SERVICE_INTERVAL" + idle_collector + fi +} + +function service_start() +{ + if [ "$SERVICE_INTERVAL" == "" ]; then + return; + fi + + # find all OSTs and MDTs + for i in /proc/fs/lustre/ost/* /proc/fs/lustre/mdt/*; do + target=`basename $i` + if [ "$target" == "num_refs" ]; then + continue; + fi + for j in ${i}/*; do + srv=`basename $j` + if [ "$srv" == "uuid" ]; then + continue; + fi + run_collector "service-${srv}" service_collector \ + ${j}/stats $target $srv & + done + done + + # find all LDLM services + for i in /proc/fs/lustre/ldlm/services/*; do + srv=`basename $i` + run_collector "service" service_collector ${i}/stats "ldlm" $srv & + done + +} + +# +# client_stats collector +# +# CLIENT_INTERVAL: +# - 0 - collect at start and stop only +# - N - collect each N seconds +# +function client_collector() +{ + local file=$1 + local target=$2 + local srv=$3 + + echo "client stats for ${target}/${srv} " `date` + + # clear old stats + echo 0 >$file + + if let 
"CLIENT_INTERVAL==0"; then + grep -v "^[^ ]*[^0-9]*0 samples" $file + idle_collector + grep -v "^[^ ]*[^0-9]*0 samples" $file + elif let "CLIENT_INTERVAL>0"; then + while [ "$stop_collector" != "1" ]; do + grep -v "^[^ ]*[^0-9]*0 samples" $file + sleep $CLIENT_INTERVAL + done + else + echo "Invalid CLIENT_INTERVAL=$CLIENT_INTERVAL" + idle_collector + fi +} + +function client_start() +{ + if [ "$CLIENT_INTERVAL" == "" ]; then + return; + fi + + # find all osc + for i in /proc/fs/lustre/osc/* ; do + target=`basename $i` + if [ "$target" == "num_refs" ]; then + continue; + fi + for j in ${i}/*; do + stats=`basename $j` + if [ "$stats" == "stats" -o "$stats" == "rpc_stats" ]; then + run_collector "osc-${stats}" client_collector \ + ${j} $target $stats & + fi + done + done + # find all llite stats + for i in /proc/fs/lustre/llite/* ; do + target=`basename $i` + for j in ${i}/*; do + stats=`basename $j` + if [ "$stats" == "stats" -o "$stats" == "vfs_ops_stats" ]; then + run_collector "llite-${stats}" client_collector \ + ${j} $target ${stats} & + fi + done + done +} + +# +# sdio_stats collector +# +# SDIO_INVERVAL: +# - 0 - collect at start and stop only +# - N - collect each N seconds +# +function sdio_collector() +{ + local obd=$1 + local uuid=`cat $obd/uuid` + local tmp=`cat $obd/mntdev` + local disk=`basename $tmp` + local file="/proc/scsi/sd_iostats/${disk}" + + echo "sd_iostats for ${uuid}/${disk} " `date` + + # clear old stats + echo 0 >$file + + if let "SDIO_INTERVAL==0"; then + cat $file + idle_collector + cat $file + elif let "SDIO_INTERVAL>0"; then + while [ "$stop_collector" != "1" ]; do + cat $file + sleep $SDIO_INTERVAL + done + else + echo "Invalid SDIO_INTERVAL=$SDIO_INTERVAL" + idle_collector + fi +} + +function sdio_start() +{ + if [ "$SDIO_INTERVAL" == "" ]; then + return; + fi + + # find all obdfilters and MDSs + for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do + obd=`basename $i` + if [ "$obd" == "num_refs" ]; then + continue; + fi + if 
[ ! -f ${i}/mntdev ]; then + continue; + fi + tmp=`cat ${i}/mntdev` + disk=`basename $tmp` + if [ ! -f /proc/scsi/sd_iostats/${disk} ]; then + continue; + fi + run_collector "sdio" sdio_collector ${i} & + done +} + +# +# mballoc_stats collector +# +# MBALLOC_INVERVAL: +# - 0 - collect at start and stop only +# - N - isn't implemented yet, works as with 0 +# +function mballoc_collector() +{ + local obd=$1 + local uuid=`cat $obd/uuid` + local tmp=`cat $obd/mntdev` + local disk=`basename $tmp` + local file="/proc/fs/ldiskfs*/${disk}/mb_history" + + echo "mballoc history for ${uuid}/${disk} " `date` + + # log allocations only + for i in $file; do + echo 3 >$i + done + + if let "MBALLOC_INTERVAL==0"; then + idle_collector + cat $file + elif let "MBALLOC_INTERVAL>0"; then + idle_collector + cat $file + else + echo "Invalid MBALLOC_INTERVAL=$MBALLOC_INTERVAL" + idle_collector + fi +} + +function mballoc_start() +{ + if [ "$MBALLOC_INTERVAL" == "" ]; then + return; + fi + + # find all obdfilters and MDSs + for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do + obd=`basename $i` + if [ "$obd" == "num_refs" ]; then + continue; + fi + if [ ! -f ${i}/mntdev ]; then + continue; + fi + tmp=`cat ${i}/mntdev` + disk=`basename $tmp` + if [ ! 
-f /proc/fs/ldiskfs*/${disk}/mb_history ]; then + continue; + fi + run_collector "mballoc" mballoc_collector ${i} & + done +} + +# +# io_stats collector +# +# IO_INVERVAL: +# - 0 - collect at start and stop only +# - N - collect each N seconds +# +function io_collector() +{ + local obd=$1 + local uuid=`cat $obd/uuid` + local tmp=`cat $obd/mntdev` + local disk=`basename $tmp` + local file="/sys/block/${disk}/stat" + + echo "iostats for ${uuid}/${disk} " `date` + + if let "IO_INTERVAL==0"; then + cat $file + idle_collector + cat $file + elif let "IO_INTERVAL>0"; then + while [ "$stop_collector" != "1" ]; do + cat $file + sleep $IO_INTERVAL + done + else + echo "Invalid IO_INTERVAL=$IO_INTERVAL" + idle_collector + fi +} + +function io_start() +{ + if [ "$IO_INTERVAL" == "" ]; then + return; + fi + + # find all obdfilters and MDSs + for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do + obd=`basename $i` + if [ "$obd" == "num_refs" ]; then + continue; + fi + if [ ! -f ${i}/mntdev ]; then + continue; + fi + tmp=`cat ${i}/mntdev` + disk=`basename $tmp` + if [ ! 
+ -f /sys/block/${disk}/stat ]; then + continue; + fi + run_collector "io" io_collector ${i} & + done +} + +# +# jbd_stats collector +# +# JBD_INTERVAL: +# - 0 - collect at start and stop only +# - N - isn't implemented yet, works as with 0 +# +function jbd_collector() +{ + local obd=$1 + local uuid=`cat $obd/uuid` + local tmp=`cat $obd/mntdev` + local disk=`basename $tmp` + local file="/proc/fs/jbd/${disk}/history" + + echo "jbd history for ${uuid}/${disk} " `date` + + if let "JBD_INTERVAL==0"; then + idle_collector + cat $file + elif let "JBD_INTERVAL>0"; then + idle_collector + cat $file + else + echo "Invalid JBD_INTERVAL=$JBD_INTERVAL" + idle_collector + fi +} + +function jbd_start() +{ + if [ "$JBD_INTERVAL" == "" ]; then + return; + fi + + # find all obdfilters and MDSs + for i in /proc/fs/lustre/obdfilter/* /proc/fs/lustre/mds/*; do + obd=`basename $i` + if [ "$obd" == "num_refs" ]; then + continue; + fi + if [ ! -f ${i}/mntdev ]; then + continue; + fi + tmp=`cat ${i}/mntdev` + disk=`basename $tmp` + if [ ! -f /proc/fs/jbd/${disk}/history ]; then + continue; + fi + run_collector "jbd" jbd_collector ${i} & + done +} + +# +# start entry point: takes control, exits if a recorded collector is still alive, purges stale state, then launches every collector +# +function ls_start() +{ + if ! ls_grab_control; then + exit + fi + + PID=`cat $STATPIDS 2>/dev/null` + if [ "x$PID" != "x" ]; then + for i in $PID; do + i=`echo $i | sed 's/^[^:]*://'` + TO=`cat ${STIMEPREFIX}$i` + TN=`ps -p $i -o bsdstart=` + if [ "$TO" != "" -a "$TO" == "$TN" ]; then + echo "Some slave is already running by $i" + exit + fi + done + fi + + # remove stale state of earlier runs; STIMEPREFIX is a filename prefix, so it needs the glob to match the per-pid ${STIMEPREFIX}<pid> files + rm -rf ${STATPIDS}* ${OUTPREFIX}* ${STIMEPREFIX}* + + vmstat_start + brw_start + service_start + sdio_start + mballoc_start + io_start + jbd_start + client_start +} + +# +# stop entry point +# +# should stop collection, gather all collected data +# +function ls_stop() +{ + if ! 
ls_grab_control; then + exit + fi + + PID=`cat $STATPIDS 2>/dev/null` + if [ "x$PID" != "x" ]; then + pids2wait="" + for i in $PID; do + i=`echo $i | sed 's/^[^:]*://'` + TO=`cat ${STIMEPREFIX}$i 2>/dev/null` + TN=`ps -p $i -o bsdstart=` + if [ "$TO" == "" -o "$TO" != "$TN" ]; then + echo "No collector with $i found" + continue + fi + /bin/kill -s USR1 -- -${i} + pids2wait="$pids2wait $i" + done +#echo "XXX: wait collectors $pids2wait" + for i in $pids2wait; do + TO=`cat ${STIMEPREFIX}$i 2>/dev/null` + TN=`ps -p $i -o bsdstart=` + while [ "$TO" != "" -a "$TO" == "$TN" ]; do + sleep 1 + TN=`ps -p $i -o bsdstart=` + done + done + fi + rm -f $STATPIDS ${STIMEPREFIX}* +} + +# +# fetch entry point +# +# creates tarball of all collected stats +# current version is silly - just finds all *out* files in $TMP +ls_fetch() +{ + if [ "X${GLOBAL_TIMESTAMP}" = "X" ] + then + local date=`date +%F-%H.%M.%S` + else + local date=${GLOBAL_TIMESTAMP} + fi + + local hostname=`hostname -s` + local name="stats-$hostname-$date" + + stats=${OUTPREFIX}* + if ! 
mkdir ${TMP}/${name}; then + echo "Can't create ${TMP}/${name}" + exit + fi + + let found=0 + for i in ${OUTPREFIX}*; do + mv $i ${TMP}/${name}/ + let "found++" + done + + if let "found > 0"; then + (cd ${TMP}; tar -zcf "./${name}.tar.gz" "./${name}") + cat ${TMP}/${name}.tar.gz + else + echo "No stats found" + fi + rm -rf ${TMP}/${name}* + +} + +# +# abort entry point +# +# should kill all running collections +# +function ls_abort() +{ + echo "Abort isn't implemented yet" +} + +######### +# main +######### + +# required to put all background processes into different process groups +# so that we can manage whole groups sending them a single signal +set -m + +case $1 in + start) ls_start ;; + stop) ls_stop ;; + fetch) ls_fetch ;; + abort) ls_abort ;; + *) echo "Unknown command" +esac + diff --git a/lustre.spec.in b/lustre.spec.in new file mode 100644 index 0000000000000000000000000000000000000000..87f7d33d6fd5dd68e87fc9ee2ba7764cffa07211 --- /dev/null +++ b/lustre.spec.in @@ -0,0 +1,247 @@ +# lustre.spec +%define version @VERSION@ +%define kversion @LINUXRELEASE@ + +Summary: Lustre File System +Name: lustre +Version: %{version} +Release: @RELEASE@ +License: GPL +Group: Utilities/System +Source: lustre-%{version}.tar.gz +URL: http://clusterfs.com/ +BuildRoot: %{_tmppath}/lustre-%{version}-root +Obsoletes: lustre-lite, lustre-lite-utils, lustre-ldap nfs-utils-lustre +Provides: lustre-lite = %{version}, lustre-lite-utils = %{version} +# GSS requires this: BuildRequires: pkgconfig, libgssapi-devel >= 0.10 + +%description +Userspace tools and files for the Lustre file system. + +%package modules +Summary: Kernel Lustre modules for Linux %{kversion} +Requires: modutils >= 2.4.10 +Group: Development/Kernel + +%description modules +Lustre file system, server and network drivers for Linux %{kversion}. 
+ +%package source +Summary: Object-Based Disk storage driver source +Group: Development/Kernel + +%description source +Lustre sources for further development + +# Since the RPMs we ship are to be used on both SLES and RHEL, we +# can't include any dependency information (since the package names +# are different on the two platforms). +# +# Instead, we can build these empty meta-packages that only include +# dependency information. These let people get the correct +# dependencies for their platform and lets them use tools like yum and +# red carpet to install the correct files. +# +# Unfortunately I have not seen this come up on the lists much, so I +# have disabled them (by commenting out their empty files section +# below) until it's clear that they resolve more confusion than they +# add. + +%package deps-sles +Summary: Lustre dependencies meta-package for SLES +Group: Utilities/System +Provides: lustre-deps = %{version} +Requires: lustre = %{version}, sles-release +Conflicts: lustre-deps-rhel + +%description deps-sles +This package has RPM dependencies appropriate for SLES systems. + +%package deps-rhel +Summary: Lustre dependencies meta-package for RHEL +Group: Utilities/System +Provides: lustre-deps = %{version} +Requires: lustre = %{version}, redhat-release +Conflicts: lustre-deps-sles + +%description deps-rhel +This package has RPM dependencies appropriate for RHEL, RHL, and FC +systems. + +%package tests +Summary: Lustre testing framework +Group: Development/Kernel +Provides: lustre-tests = %{version} +Requires: lustre = %{version} + +%description tests +This package contains a set of test binaries and scripts that are intended +to be used by the Lustre testing framework. 
+ +%prep +%setup -qn lustre-%{version} + +%build +# if RPM_BUILD_NCPUS unset, set it +if [ -z "$RPM_BUILD_NCPUS" ] ; then + RPM_BUILD_NCPUS=$(egrep -c "^cpu[0-9]+" /proc/stat 2>/dev/null || echo 0 :) + if [ $RPM_BUILD_NCPUS -eq 0 ] ; then + RPM_BUILD_NCPUS=1 + fi + if [ $RPM_BUILD_NCPUS -gt 8 ] ; then + RPM_BUILD_NCPUS=8 + fi +fi + +rm -rf $RPM_BUILD_ROOT + +# Set an explicit path to our Linux tree, if we can. +cd $RPM_BUILD_DIR/lustre-%{version} +./configure @ac_configure_args@ %{?configure_flags:configure_flags} \ + --sysconfdir=%{_sysconfdir} \ + --mandir=%{_mandir} \ + --libdir=%{_libdir} +make -j $RPM_BUILD_NCPUS -s + +%install +make install DESTDIR=$RPM_BUILD_ROOT +# hack to avoid changing the libsysio code for "make install" +rm -f $RPM_BUILD_ROOT%{_libdir}/libsysio.a +# Remove ldiskfs module(s) - they are packaged by the ldiskfs .spec. +rm -rf $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/fs/lustre-ldiskfs + +# hack to include the llog_test module in lustre-tests +llog_base=$RPM_BUILD_DIR/lustre-%{version}/lustre/obdclass/llog_test +if [ -e ${llog_base}.ko ]; then + cp ${llog_base}.ko $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/fs/lustre +elif [ -e ${llog_base}.o ]; then + cp ${llog_base}.o $RPM_BUILD_ROOT/lib/modules/%{kversion}/kernel/fs/lustre +fi + +# Create the pristine source directory. 
+cd $RPM_BUILD_DIR/lustre-%{version} +mkdir -p $RPM_BUILD_ROOT/usr/src +rm -f lustre-source +ln -s $RPM_BUILD_ROOT/usr/src lustre-source +make distdir distdir=lustre-source/lustre-%{version} +chmod -R go-w lustre-source/lustre-%{version} + +cat >lustre.files <<EOF +%attr(-, root, root) /sbin/mount.lustre +%attr(-, root, root) /usr/sbin/* +%attr(-, root, root) /usr/bin/* + +%attr(-, root, root) /usr/share/lustre/* + +%attr(-, root, root) %{_libdir}/libptlctl.a +%attr(-, root, root) %{_libdir}/liblustreapi.a +%attr(-, root, root) /usr/include/lustre + +%attr(-, root, root) %{_mandir}/man?/* + +%attr(-, root, root) %{_libdir}/lustre/lc_common +EOF + +if [ -f $RPM_BUILD_ROOT%{_libdir}/libcfsutil.a ] ; then + echo '%attr(-, root, root) %{_libdir}/libcfsutil.a' >>lustre.files +fi + +if [ -f $RPM_BUILD_ROOT%{_libdir}/liblustre.so ] ; then + echo '%attr(-, root, root) %{_libdir}/liblustre.a' >>lustre.files + echo '%attr(-, root, root) %{_libdir}/liblustre.so' >>lustre.files +fi + +if [ -f $RPM_BUILD_DIR/lustre-%{version}/lustre/utils/libiam.c ] ; then + echo '%attr(-, root, root) %{_libdir}/libiam.a' >>lustre.files +fi + +if [ -d $RPM_BUILD_ROOT%{_libdir}/lustre/snmp ] ; then + echo '%attr(-, root, root) %{_libdir}/lustre/snmp' >>lustre.files + echo '%attr(-, root, root) %{_datadir}/lustre/snmp/mibs' >>lustre.files +fi + +# Have universal lustre headers +if [ -f $RPM_BUILD_DIR/lustre-%{version}/lustre/include/lustre/lustre_idl.h ] ; then + echo '%attr(-, root, root) /usr/include/linux/lustre_types.h' >>lustre.files + echo '%attr(-, root, root) /usr/include/linux/lustre_user.h' >>lustre.files +else + echo '%attr(-, root, root) /usr/include/linux/lustre_idl.h' >>lustre.files +fi + +echo '%attr(-, root, root) %{_libdir}/lustre/tests/*' >lustre-tests.files +echo '%attr(-, root, root) /lib/modules/%{kversion}/kernel/fs/lustre/llog_test.*' >>lustre-tests.files +modules_excludes="llog_test" +if [ -d $RPM_BUILD_ROOT%{_libdir}/lustre/liblustre/tests ] ; then + echo '%attr(-, root, 
root) %{_libdir}/lustre/liblustre/tests/*' >>lustre-tests.files +fi + +pushd $RPM_BUILD_ROOT >/dev/null +find lib/modules/%{kversion}/kernel -type f | awk "!/($modules_excludes)/ {print \"/\"\$0}" >>$RPM_BUILD_DIR/lustre-%{version}/lustre-modules.files +popd >/dev/null + +%files -f lustre.files + +%files modules -f lustre-modules.files +%attr(-, root, root) %doc COPYING + +%files source +%attr(-, root, root) /usr/src/lustre-%{version} + +# uncomment these lines to enable deps packages +# %files deps-sles +# %files deps-rhel + +%files tests -f lustre-tests.files + +%post modules +if [ -f /boot/System.map-%{kversion} ]; then + depmod -ae -F /boot/System.map-%{kversion} %{kversion} || exit 0 +else + depmod -ae %{kversion} || exit 0 +fi + +# for update from < v1.4.6 + +for f in /etc/modules.conf /etc/modprobe.conf /etc/modprobe.conf.local ; +do + if [ -f $f ]; then + if grep 'lustre llite' $f >/dev/null 2>/dev/null ; then + [ ! -f $f.rpmsave ] && cp $f $f.rpmsave + TMPFILE=`mktemp $f.XXXXXX` && \ + rm -f $TMPFILE && touch $TMPFILE && \ + grep -v 'lustre llite' $f >> $TMPFILE && \ + mv $TMPFILE $f + fi + if egrep "^[^#]*(add below|install) ptlrpc" $f ; then + [ ! 
-f $f.rpmsave ] && cp $f $f.rpmsave + TMPFILE=`mktemp $f.XXXXXX` && \ + rm -f $TMPFILE && touch $TMPFILE && \ + sed -e "s/^[^#]*\(add below\|install\) ptlrpc.*/#&/" $f >> $TMPFILE && \ + mv $TMPFILE $f + fi + fi +done + +%postun modules +if [ -f /boot/System.map-%{kversion} ]; then + depmod -ae -F /boot/System.map-%{kversion} %{kversion} || exit 0 +else + depmod -ae %{kversion} || exit 0 +fi + +%post tests +if [ -f /boot/System.map-%{kversion} ]; then + depmod -ae -F /boot/System.map-%{kversion} %{kversion} || exit 0 +else + depmod -ae %{kversion} || exit 0 +fi + +%postun tests +if [ -f /boot/System.map-%{kversion} ]; then + depmod -ae -F /boot/System.map-%{kversion} %{kversion} || exit 0 +else + depmod -ae %{kversion} || exit 0 +fi + +%clean +rm -rf $RPM_BUILD_ROOT diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 1b78eb1393728baac1c9aa2e82ac4904ba12573c..89ad1b0701ce35fc97b76eee33f6126df87b8729 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,31 +1,497 @@ -04-26-2008 Sun Microsystems, Inc. +tbd Sun Microsystems, Inc. + * version 1.8.0 + * Support for kernels: + 2.6.5-7.311 (SLES 9), + 2.6.9-67.0.20.EL (RHEL 4), + 2.6.16.54-0.2.5 (SLES 10), + 2.6.18-53.1.21.el5 (RHEL 5), + 2.6.22.14 vanilla (kernel.org) + * Client support for unpatched kernels: + (see http://wiki.lustre.org/index.php?title=Patchless_Client) + 2.6.16 - 2.6.22 vanilla (kernel.org) + * Client support for unpatched kernels: + we do not recommend using patchless RHEL4 clients with kernels + prior to 2.6.9-55EL (RHEL4U5). + * Recommended e2fsprogs version: 1.40.11-sun1 + * Note that reiserfs quotas are disabled on SLES 10 in this kernel. + * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a + removed cwd "./" (refer to Bugzilla 14399). + * A new quota file format has been introduced in 1.6.5. + The format conversion from prior releases is handled transparently, + but releases older than 1.4.12/1.6.5 don't understand this new + format. 
The automatic format conversion can be avoided by running + the following command on the MDS: + 'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'. + For more information, please refer to bugzilla 13904. + * A new quota file format was introduced in 1.8.0. + The format conversion from prior releases is handled transparently, + but releases older than 1.6.6/1.8.0 don't understand this new + format. The automatic format conversion can be avoided by running + the following commands on the MDS and OSS servers (for + pre 1.4.12-1.6.5 quota files): + 'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV', + 'tunefs.lustre --param="ost.quota_type=ug1" $MDTDEV' + or (for 1.4.12/1.6.5 quota files) + 'tunefs.lustre --param="mdt.quota_type=ug2" $MDTDEV', + 'tunefs.lustre --param="ost.quota_type=ug2" $MDTDEV' + For more information, please refer to bugzilla 13904. + * Output of lfs quota has been made less detailed by default, + old (verbose) output can be obtained by using -v option. + +Severity : normal +Bugzilla : 16318 +Frequency : rare, on PPC clients +Description: don't swab ost objects in response about directory, because + this not exist. +Details : bug similar bug 14856, but in different function. + +Severity : enhancement +Bugzilla : 15754 +Description: lfs quota tool enhancement +Details : added units specifiers support for setquota, default to + current uid/gid for quota report, short quota stats by + default, nonpositional parameters for setquota, added + llapi_quotactl manual page. + +Severity : enhancement +Bugzilla : 15625 +Description: *optional* service tags registration +Details : if the "service tags" package is installed on a Lustre node + When the filesystem is mounted, a local-node service tag will + be created. See http://inventory.sun.com/ for more information + about the Service Tags asset management system. + +Severity : enhancement +Bugzilla : 16189 +Description: Update to RHEL4 kernel-2.6.9-67.0.20. 
+ +Severity : normal +Frequency : occasional +Bugzilla : 15210 +Description: add refcount for osc callbacks, so avoid panic on shutdown + +Severity : normal +Frequency : testing only +Bugzilla : 12653 +Description: sanity test 65a fails if stripecount of -1 is set +Details : handle -1 striping on filesystem in ll_dirstripe_verify + +Severity : normal +Frequency : only in unusual configurations +Bugzilla : 16014 +Description: Kernel panic with find ost index. +Details : lov_obd have panic if some OST's have sparse indexes. + +Severity : enhancement +Bugzilla : 15865 +Description: Update to RHEL5 kernel-2.6.18-53.1.21.el5. + +Severity : major +Frequency : rarely, if filesystem is mounted with -o flock +Bugzilla : 15924 +Description: do not process already freed flock +Details : flock can possibly be freed by another thread before it reaches + to ldlm_flock_completion_ast. + +Severity : normal +Frequency : rarely, if filesystem is mounted with -o flock +Bugzilla : 14480 +Description: LBUG during stress test +Details : Need properly lock accesses the flock deadlock detection list. + +Severity : minor +Frequency : rarely, if binaries are being run from Lustre +Bugzilla : 15837 +Description: oops in page fault handler +Details : kernel page fault handler can return two special 'pages' in + error case, don't try dereference NOPAGE_SIGBUS and NOPAGE_OOM. + +Severity : minor +Frequency : rarely, during shutdown +Bugzilla : 15716 +Description: timeout with invalidate import. +Details : ptlrpcd_check call obd_zombie_impexp_cull and wait request which + should be handled by ptlrpcd. This produce long age waiting and + -ETIMEOUT ptlrpc_invalidate_import and as result LASSERT. + +Severity : normal +Frequency : rarely +Bugzilla : 14742 +Frequency : rare +Description: ASSERTION(CheckWriteback(page,cmd)) failed +Details : badly clear PG_Writeback bit in ll_ap_completion can produce false + positive assertion.
+ +Severity : normal +Frequency : only with broken builds/installations +Bugzilla : 15779 +Description: no LBUG if lquota.ko and fsfilt_ldiskfs.ko are different versions +Details : just return an error to a user, put a console error message + +Severity : enhancement +Bugzilla : 15741 +Description: Update to RHEL5 kernel-2.6.18-53.1.19.el5. + +Severity : enhancement +Bugzilla : 15742 +Description: Update to RHEL4 kernel-2.6.9-67.0.15. + +Severity : enhancement +Bugzilla : 14134 +Description: enable MGS and MDT services start separately +Details : add a 'nomgs' option in mount.lustre to enable start a MDT with + a co-located MGS without starting the MGS, which is a complement + to 'nosvc' mount option. + +Severity : normal +Frequency : always, on big-endian systems +Bugzilla : 14856 +Description: cleanup in ptlrpc code, related to PPC platform +Details : store magic in native order avoid panic's in recovery on PPC + node and forbid from this error in future. Also fix possibility + of twice swab data. Fix get lov striping to userspace. + +Severity : normal +Frequency : rarely, if replay get lost on server +Bugzilla : 15756 +Description: server incorrectly drop resent replays lead to recovery failure. +Details : do not drop replay according to msg flags, instead we check the + per-export recovery request queue for duplication of transno. + +Severity : normal +Frequency : after recovery +Bugzilla : 14835 +Description: precreate too many objects after del orphan. +Details : del orphan st in oscc last_id == next_id and this trigger growing + count of precreated objects. Set flag LOW to skip increase count + of precreated objects. + +Severity : normal +Frequency : rare, on clear nid stats +Bugzilla : 15139 +Description: ASSERTION(client_stat->nid_exp_ref_count == 0) +Details : when clean nid stats sometimes try destroy live entry, + and this produce panic in free.
+ +Severity : major +Frequency : occasionally since 1.6.4 +Bugzilla : 15575 +Description: Stack overflow during MDS log replay +Details : ease stack pressure by using a thread dealing llog_process. + +Severity : minor +Frequency : very rare +Bugzilla : 13380 +Description: MDT cannot be unmounted, reporting "Mount still busy" +Details : Mountpoint references were being leaked during open reply + reconstruction after an MDS restart. Drop mountpoint reference + in reconstruct_open() and free dentry reference also. + +Severity : normal +Frequency : rare +Bugzilla : 15443 +Description: wait until IO finished before start new when do lock cancel. +Details : VM protocol want old IO finished before start new, in this case + need wait until PG_writeback is cleared until check dirty flag + and call writepages in lock cancel callback. + +Severity : normal +Frequency : rare +Bugzilla : 12888 +Description: mds_mfd_close() ASSERTION(rc == 0) +Details : In mds_mfd_close(), we need protect inode's writecount change + within its orphan write semaphore to prevent possible races. + +Severity : minor +Frequency : rare, on shutdown ost +Bugzilla : 14645 +Description: don't hit live lock with umount ost. +Details : shrink_dcache_parent can be in long loop with destroy dentries, + use shrink_dcache_sb instead. + +Severity : minor +Frequency : only when echo_client is used +Bugzilla : 14949 +Description: don't panic with use echo_client +Details : echo client pass NULL as client nid pointer and this produce NULL + pointer dereference. + +Severity : normal +Frequency : Always on 32-bit PowerPC systems +Bugzilla : 15278 +Description: fix build on PPC32 +Details : compile code with -m64 flag produce wrong object file for PPC32. + +Severity : normal +Frequency : rare +Bugzilla : 15574 +Description: MDS LBUG: ASSERTION(!IS_ERR(dchild)) +Details : In reconstruct_* functions, LASSERTs on both the data supplied + by a client, and the data on disk are dangerous and incorrect. 
+ Change them with client eviction. + +Severity : enhancement +Bugzilla : 15346 +Description: skiplist implementation simplification +Details : skiplists are used to group compatible locks on granted list + that was implemented as tracking first and last lock of each lock + group the patch changes that to using doubly linked lists + +Severity : normal +Bugzilla : 15933 +Description: delete compatibility for 32bit qdata +Details : as planned, when lustre is beyond b1_8, lquota won't support 32bit + qunit. That means servers of b1_4 and servers of b1_8 can't be + used together if users want to use quota. + +Severity : normal +Frequency : only with administrator action +Bugzilla : 14693 +Description: mount failure if config log has invalid conf_param setting +Details : If administrator specified an incorrect configuration parameter + with "lctl conf_param" this would cause an error during future + client mounts. Instead, ignore the bad configuration parameter. + +Severity : normal +Frequency : blocks per group < blocksize*8 and uninit_groups is enabled +Bugzilla : 15932 +Description: ldiskfs error: XXX blocks in bitmap, YYY in gd +Details : If blocks per group is less than blocksize*8, set rest of the + bitmap to 1. + +Severity : major +Frequency : Application do stride read on lustre +Bugzilla : 16172 +Description: The read performance will drop a lot if the application does + stride read. +Details : Because the stride_start_offset are missing in stride read-ahead, + it will cause clients read a lot of unused pages in read-ahead, + then the read-performance drops. + +Severity : normal +Bugzilla : 15953 +Description: more ldlm soft lockups +Details : In ldlm_resource_add_lock(), call to ldlm_resource_dump() + starve other threads from the resource lock for a long time in + case of long waiting queue, so change the debug level from + D_OTHER to the less frequently used D_INFO. 
+ +Severity : enhancement +Bugzilla : 13128 +Description: add -gid, -group, -uid, -user options to lfs find + +Severity : enhancement +Bugzilla : 15284 +Description: ll_recover_lost_found_objs - recover objects in lost+found +Details : OST corruption and subsequent e2fsck can leave objects in the + lost+found directory. Using the "ll_recover_lost_found_objs" + tool, these objects can be retrieved and data can be salvaged + by using the object ID saved in the fid EA on each object. + +Severity : minor +Frequency : rare +Bugzilla : 15758 +Description: this bug _only_ happens when inode quota limitation is very low + (less than 12), so that inode quota unit is 1 at initialization. +Details : if remaining quota equates 1, it is a sign to demonstrate that quota + is effective now. So least quota qunit should be 2. + +Severity : normal +Bugzilla : 15950 +Description: Hung threads in invalidate_inode_pages2_range +Details : The direct IO path doesn't call check_rpcs to submit a new RPC once + one is completed. As a result, some RPCs are stuck in the queue + and are never sent. + +Severity : normal +Bugzilla : 15684 +Description: Procfs and llog threads access destroyed import sometimes. +Details : Sync the import destroyed process with procfs and llog threads by + the import refcount and semaphore. + +Severity : major +Bugzilla : 15674 +Description: mds fails to respond, threads stuck in ldlm_completion_ast +Details : Sort source/child resource pair after updating child resource. + +Severity : major +Frequency : rare +Bugzilla : 16226 +Description: kernel BUG at ldiskfs2_ext_new_extent_cb +Details : If insertion of an extent fails, then discard the inode + preallocation and free data blocks else it can lead to duplicate + blocks. + +Severity : normal +Bugzilla : 16199 +Description: don't always update ctime in ext3_xattr_set_handle() +Details : Current xattr code updates the inode ctime in ext3_xattr_set_handle.
+ In some cases the ctime should not be updated, for example for + 2.0->1.8 compatibility it is necessary to delete an xattr and it + should not update the ctime. + +------------------------------------------------------------------------------- + + +2008-05-26 Sun Microsystems, Inc. * version 1.6.5 * Support for kernels: - 2.6.5-7.311 (SLES 9), - 2.6.9-67.0.7.EL (RHEL 4), - 2.6.16.54-0.2.5 (SLES 10), - 2.6.18-53.1.14.el5 (RHEL 5), - 2.6.18.8 vanilla (kernel.org) - 2.6.22.14 vanilla (kernel.org) + 2.6.5-7.311 (SLES 9), + 2.6.9-67.0.7.EL (RHEL 4), + 2.6.16.54-0.2.5 (SLES 10), + 2.6.18-53.1.14.el5 (RHEL 5), + 2.6.22.14 vanilla (kernel.org) * Client support for unpatched kernels: - (see http://wiki.lustre.org/index.php?title=Patchless_Client) - 2.6.16 - 2.6.22 vanilla (kernel.org) + (see http://wiki.lustre.org/index.php?title=Patchless_Client) + 2.6.16 - 2.6.22 vanilla (kernel.org) * Due to problems with nested symlinks and FMODE_EXEC (bug 12652), - we do not recommend using patchless RHEL4 clients with kernels - prior to 2.6.9-55EL (RHEL4U5). + we do not recommend using patchless RHEL4 clients with kernels + prior to 2.6.9-55EL (RHEL4U5). * Recommended e2fsprogs version: 1.40.7-sun1 * Note that reiserfs quotas are disabled on SLES 10 in this kernel. * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a - removed cwd "./" (refer to Bugzilla 14399). + removed cwd "./" (refer to Bugzilla 14399). * A new quota file format has been introduced in 1.6.5. - The format conversion from prior releases is handled transparently, + The format conversion from prior releases is handled transparently, but releases older than 1.4.12/1.6.5 will not understand this new format. The automatic format conversion can be avoided by running - the following command on the MDS: - 'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'. + the following command on the MDS before upgrading: + 'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'. 
For more information, please refer to bugzilla 13904. +Severity : major +Bugzilla : 14443 +Description: quota performance fix +Details : quota data is written in journalled mode instead of ordered to + increase performance + +Severity : normal +Bugzilla : 13915 +Description: lfs support for human-readable quota grace time strings +Details : lfs setquota -t and lfs quota -t represent quota grace times + in "XXwXXdXXhXXmXXs" format instead of large values in seconds + +Severity : normal +Frequency : always with o2ib 1.3 and sles10 +Bugzilla : 15870 +Description: fix build with SLES10 and o2ib v3. +Details : sles10 uses different name for Module.symvers file but configure + assume this file has same name on RHEL/SLES/vanilla kernels. + +Severity : critical +Frequency : very rare, if additional xattrs are used on kernels >= 2.6.12 +Bugzilla : 15777 +Description: MDS may lose file striping (and hence file data) in some cases +Details : If there are additional extended attributes stored on the MDS, + in particular ACLs, SELinux, or user attributes (if user_xattr + is specified for the client mount options) then there is a risk + of attribute loss. Additionally, the Lustre file striping + needs to be larger than default (e.g. striped over all OSTs), + and an additional attribute must be stored initially in the + inode and then increase in size enough to be moved to the + external attribute block (e.g. ACL growing in size) for file + data to be lost. + +Severity : enhancement +Bugzilla : 12191 +Description: add message levels for liblustreapi + +Severity : normal +Frequency : rare +Bugzilla : 13380 +Description: MDT cannot be unmounted, reporting "Mount still busy" +Details : Mountpoint references were being leaked during open reply + reconstruction after an MDS restart. Drop mountpoint reference + in reconstruct_open() and free dentry reference also.
+ +Severity : minor +Frequency : rare +Bugzilla : 13380 +Description: fix for occasional failure case of -ENOSPC in recovery-small tests +Details : Move the 'good_osts' check before the 'total_bavail' check. This + will result in an -EAGAIN and in the exit call path we call + alloc_rr() which will with increasing aggressiveness attempt to + acquire precreated objects on the minimum number of required OSCs. + +Severity : major +Bugzilla : 14326 +Description: Use old size assignment to avoid deadlock +Details : This reverts the changes in bugs 2369 and bug 14138 that introduced + the scheduling while holding a spinlock. We do not need locking + for size in ll_update_inode() because size is only updated from + the MDS for directories or files without objects, so there is no + other place to do the update, and concurrent access to such inodes + are protected by the inode lock. + +Severity : normal +Bugzilla : 14655 +Description: Use __u64 instead of int for valid bits + +Severity : normal +Bugzilla : 14746 +Description: resolve "_IOWR redefined" build error on SLES10 + +Severity : normal +Bugzilla : 14763 +Description: dump the memory debugging after all modules are unloaded to + suppress false negative in conf_sanity test 39 + +Severity : normal +Bugzilla : 14872 +Description: the recovery timer never expires +Details : for new client connect request, the recovery timer should not be + reset, otherwise recovery timer will never expire, if the old + client never come. Only old client connect and first connection + req should trigger recovery timer reset. + Severity : normal +Bugzilla : 15521 +Description: the min numbers of lproc stats are wrong +Details : adding a new constant LC_MIN_INIT and use it for initialization + of lc_min.
+ +Severity : normal +Frequency : always with interactive lfs +Bugzilla : 15212 +Description: Reinitialize optind to 0 so that interactive lfs works in all cases + +Severity : normal +Frequency : with multiple concurrent readdir processes in same directory +Bugzilla : 15406, 15169, 15175 +Description: misc fixes for directory readahead. +Details : prevent previous statahead async RPC callback from processing the + current "statahead_info", race condition between async RPC callback + add dentry into dentry hash table and "ls" thread revalidate such + dentry, statahead hit/miss control for hidden items, and so on. + +Severity : enhancement +Bugzilla : 15316 +Description: build kernel-ib packages for OFED 1.3 in our release cycle + +Severity : normal +Bugzilla : 15036 +Description: incore types cleaning in quota code (with respect to 64-bit limits) +Details : several u32 variables declarations are replaced with u64 declarations + +Severity : minor +Frequency : always +Bugzilla : 13969 +Description: fix SLES kernel versioning +Details : the kernel version for our SLES 10 kernel did not include a "-" + before the "smp" at the end. while this was not a problem in + general, it did mean that software trying to use the kernel + version to try to detect a vendor specific kernel would fail. + this was most evident by the OFED build scripts. + +Severity : normal +Frequency : rare +Bugzilla : 14803 +Description: Don't update lov_desc members until making sure they are valid +Details : When updating lov_desc members via proc fs, need fix their + validities before doing the real update. + +Severity : normal +Frequency : very rare Bugzilla : 15069 Description: don't put request into delay list while invalidate in flight. Details : ptlrpc_delay_request sometimes put in delay list while invalidate @@ -40,50 +506,53 @@ Severity : enhancement Bugzilla : 15240 Description: Update kernel to RHEL4 2.6.9-67.0.7.
-Severity : minor +Severity : normal +Frequency : always Bugzilla : 14856 -Frequency : on ppc only +Frequency : on PPC only Description: not convert ost objects for directory because it's not exist. Details : ll_dir_getstripe assume dirrectory has ost objects but this wrong. -Severity : minor +Severity : enhancement Bugzilla : 15517 Description: Fix warnings with compile liblustre at sles10/rhel5 which have __u64 as usingied long long type. Severity : minor -Bugzilla : 15210 Frequency : rare, on shutdown +Bugzilla : 15210 Description: race process ast vs remove callback -Details : removing callback before disconnect import open race with processing - callback. +Details : removing callback before disconnect import open race with + processing callback. Severity : enhancement Bugzilla : 15416 Description: Update kernel to SLES9 2.6.5-7.311. -Severity : normal +Severity : enhancement Bugzilla : 12652 -Description: Add FMODE_EXEC to SLES10 SP1 series. +Description: Files open for execute are not marked busy on SLES10 +Details : Add FMODE_EXEC to SLES10 SP1 server kernel series. Severity : enhancement Bugzilla : 13397 Description: Add server support for vanilla-2.6.22.14. Severity : normal +Frequency : occasional Bugzilla : 13375 -Descriptoin: make lov_create() will not stuck in obd_statfs_rqset() +Description: Avoid lov_create() getting stuck in obd_statfs_rqset() Details : If an OST is down the MDS will hang indefinitely in obd_statfs_rqset() waiting for the statfs data. While for MDS QOS usage of statfs, it should not stuck in waiting. 
-Severity : normal +Severity : enhancement Bugzilla : 3055 Description: Disable adaptive timeouts by default Severity : major -Bugzilla : 15027 Frequency : on network error +Bugzilla : 15027 Description: panic with double free request if network error Details : mdc_finish_enqueue is finish request if any network error occuring, but it's true only for synchronus enqueue, for async enqueue @@ -91,8 +560,8 @@ Details : mdc_finish_enqueue is finish request if any network error occuring, himself. Severity : normal -Bugzilla : 14533 Frequency : rare, on recovery +Bugzilla : 14533 Description: read procfs can produce deadlock in some situation Details : Holding lprocfs lock with send rpc can produce block for destroy obd objects and this also block reconnect with -EALREADY. @@ -109,6 +578,7 @@ Description: mdc_set_open_replay_data LBUG Details : Set replay data for requests that are eligible for replay. Severity : normal +Frequency : common Bugzilla : 14321 Description: lustre_mgs: operation 101 on unconnected MGS Details : When MGC is disconnected from MGS long enough, MGS will evict the @@ -142,25 +612,28 @@ Bugzilla : 14793 Description: Update RHEL4 kernel to 2.6.9-67.0.4. Severity : minor -Frequency : rare on shutdown ost +Frequency : rare on shutdown OST Bugzilla : 13196 Description: Don't allow skipping OSTs if index has been specified. Details : Don't allow skipping OSTs if index has been specified, make locking in internal create lots better. Severity : normal +Frequency : rare Bugzilla : 14421 Description: ASSERTION(!PageDirty(page)) failed Details : Wrong check could lead to an assertion failure under specific load patterns. Severity : normal +Frequency : rare Bugzilla : 12228 Description: LBUG in ptlrpc_check_set() bad phase ebc0de00 Details : access to bitfield in structure is always rounded to long and this produce problem with not atomic change any bit. Severity : normal +Frequency : always Bugzilla : 13647 Description: Lustre make rpms failed. 
Details : Remove ldiskfs spec file to avoids rpmbuild be confused when @@ -171,7 +644,7 @@ Bugzilla : 14498 Description: Update to SLES9 SP4 kernel-2.6.5-7.308. Severity : normal -Frequency : rare on shutdown ost +Frequency : rare on shutdown OST Bugzilla : 14608 Description: If llog cancel was not send before clean_exports phase, this can produce deadlock in llog code. @@ -181,6 +654,7 @@ Details : If llog thread has last reference to obd and call class_import_put called from llog_commit_thread. Severity : normal +Frequency : only if OST index is skipped Bugzilla : 14607 Description: NULL lov_tgts causing MDS oops Details : more safe checks for NULL lov_tgts for avoid oops. @@ -194,6 +668,7 @@ Bugzilla : 14368 Description: Update to RHEL5 latest kernel-2.6.18-53.1.4.el5. Severity : normal +Frequency : always Bugzilla : 14136 Description: make mgs_setparam() handle fsname containing dash Details : fsname containing a dash does not work with lctl conf_param @@ -203,8 +678,8 @@ Bugzilla : 14288 Description: Update to RHEL4 Update-6 kernel-2.6.9-67.EL. Severity : normal -Bugzilla : 12702 Frequency : rare, in recovery and (or) destroy lovobjid file. +Bugzilla : 12702 Description: rewrite lov objid code. Details : Cleanup for lov objid code, remove scability problems and wrong locking. Fix sending last_id into ost. @@ -219,8 +694,8 @@ Description: Update to RHEL5 Update-1 kernel 2.6.18-53.el5. Details : Use d_move_locked instead of __d_move. Severity : major -Bugzilla : 14260 Frequency : rare, at shutdown +Bugzilla : 14260 Description: access already free / zero obd_namespace. Details : if client_disconnect_export was called without force flag set, and exist connect request in flight, this can produce access to @@ -228,13 +703,13 @@ Details : if client_disconnect_export was called without force flag set, store ocd flags in obd_namespace. Severity : minor -Bugzilla : 14418 Frequency : only at startup +Bugzilla : 14418 Description: not alloc memory with spinlock held. 
Details : allocation memory with GFP_KERNEL can produce sleep deadlock, if any spinlock held. -Severity : major +Severity : normal Frequency : always Bugzilla : 14270 Description: lfs find does not continue on file error @@ -292,10 +767,10 @@ Bugzilla : 13497 Description: LASSERT_{REQ,REP}SWAB macros are buggy Details : If SWAB_PARANOIA is disabled, the LASSERT_REQSWAB and LASSERT_REPSWAB macros become no-ops, which is incorrect. Drop - these macros and replace them with their difinitions instead. + these macros and replace them with their definitions instead. Severity : normal -Frequency : rarely +Frequency : rare Bugzilla : 13888 Description: interrupt oig_wait produce painc on resend. Details : brw_redo_request can be used for resend requests from ptlrpcd and @@ -346,14 +821,14 @@ Details : When CRAY_XT3 is defined, the fsgid supplied by the client is whereas the supplied fsgid can be trusted if it is in the list of supplementary groups returned by the group upcall. -Severity : normal +Severity : enhancement Bugzilla : 12749 Description: Root Squash Functionality Details : Implementation of NFS-like root squash capability. Specifically, don't allow someone with root access on a client node to be able to manipulate files owned by root on a server node. -Severity : normal +Severity : enhancement Bugzilla : 10718 Description: Slow trucate/writes to huge files at high offsets. Details : Directly associate cached pages to lock that protect those pages, @@ -361,6 +836,7 @@ Details : Directly associate cached pages to lock that protect those pages, once lock callback is received. Severity : normal +Frequency : common Bugzilla : 14379 Description: Too many locks accumulating on client during NFS usage Details : mds_open improperly used accmode to find out access mode to a @@ -383,6 +859,7 @@ Description: Allow masking D_WARNING, D_ERROR messages from console Details : Console messages can now be disabled via lnet.printk. 
Severity : normal +Frequency : always Bugzilla : 14614 Description: User code with malformed file open parameter crashes client node Details : Before packing join_file req, all the related reference should be @@ -562,10 +1039,46 @@ Severity : normal Bugzilla : 15198 Description: LDLM soft lockups - improvement Details : It is be possible to send the lock handle along with each read - or write request because the client is already doing a lock match + or write request because the client is already doing a lock match itself so there isn't any reason the OST should have to re-do that search. +Severity : normal +Frequency : rare +Bugzilla : 14036 +Description: lfs quota fails with deactivated OSTS +Details : With this patch, three improvements are included: + 1. detete the softlimit in mds and osts when use "lfs quota". + 2. display the inaccurate data in the output of "lfs quota". + 3. try to get quota info when "lfs quota" is executed. + +Severity : normal +Frequency : rare +Bugzilla : 15776 +Description: Extent locks not granted with no conflicts sometimes. +Details : When race occurs in glimpse handler and nothing is returned, + we do not reprocess the queue after lock cancel, and that leads + to a stall until next activity on a resource + +Severity : normal +Frequency : failover with quotaon +Bugzilla : 14840 +Description: during mds failovers with quota on, OSTs got into deadlock state + and causing dumpstack. +Details : for every quota slave, at any time, there is only one quota req + is sent to quota master for every uid/gid. Before that quota req + returns, all the thread relative to the same uid/gid will wait. + So if the quota req is lost because mds failovers or any other + reasons, this bug will be hit. Now, dqacq_interpret() will handle + quota reqs who time out. + +Severity : enhancement +Frequency : always +Bugzilla : 14783 +Description: when quota slave checks if quota is enough, there is an unnecessary + wait. 
+Details : place this wait on necessary place instead of always waiting. + -------------------------------------------------------------------------------- 2007-12-07 Cluster File Systems, Inc. <info@clusterfs.com> @@ -2112,15 +2625,6 @@ Description: Improve multi-block allocation algorithm to avoid fragmentation Details : The mballoc3 code (ldiskfs2 only) adds new mechanisms to improve allocation locality and avoid filesystem fragmentation. -Severity : normal -Frequency : rare -Bugzilla : 14036 -Description: lfs quota fails with deactivated OSTS -Details : With this patch, three improvements are included: - 1. detete the softlimit in mds and osts when use "lfs quota". - 2. display the inaccurate data in the output of "lfs quota". - 3. try to get quota info when "lfs quota" is executed. - ------------------------------------------------------------------------------ 2007-04-01 Cluster File Systems, Inc. <info@clusterfs.com> diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 7838733c85d1107c02a4817791dffea669278796..88a0e2e45bce16ef935a87d468c44054edc15fa9 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -621,6 +621,22 @@ AC_DEFUN([LC_FUNC_SET_FS_PWD], ]) ]) +# +# check for FS_RENAME_DOES_D_MOVE flag +# +AC_DEFUN([LC_FS_RENAME_DOES_D_MOVE], +[AC_MSG_CHECKING([if kernel has FS_RENAME_DOES_D_MOVE flag]) +LB_LINUX_TRY_COMPILE([ + #include <linux/fs.h> +],[ + int v = FS_RENAME_DOES_D_MOVE; +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_FS_RENAME_DOES_D_MOVE, 1, [kernel has FS_RENAME_DOES_D_MOVE flag]) +],[ + AC_MSG_RESULT([no]) +]) +]) # # LC_FUNC_MS_FLOCK_LOCK @@ -794,19 +810,6 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# LC_SYMVERFILE -# SLES 9 uses a different name for this file - unsure about vanilla kernels -# around this version, but it matters for servers only. 
-AC_DEFUN([LC_SYMVERFILE], - [AC_MSG_CHECKING([name of symverfile]) - if grep -q Modules.symvers $LINUX/scripts/Makefile.modpost ; then - SYMVERFILE=Modules.symvers - else - SYMVERFILE=Module.symvers - fi - AC_MSG_RESULT($SYMVERFILE) - AC_SUBST(SYMVERFILE)]) - # LC_DQUOTOFF_MUTEX # after 2.6.17 dquote use mutex instead if semaphore AC_DEFUN([LC_DQUOTOFF_MUTEX], @@ -1148,8 +1151,7 @@ AC_DEFINE(HAVE___D_MOVE, 1, # matter what symbol is exported, the kernel #defines node_to_cpumask # to the appropriate function and that's what we use. AC_DEFUN([LC_EXPORT_NODE_TO_CPUMASK], - [LB_LINUX_ARCH - LB_CHECK_SYMBOL_EXPORT([node_to_cpumask], + [LB_CHECK_SYMBOL_EXPORT([node_to_cpumask], [arch/$LINUX_ARCH/mm/numa.c], [AC_DEFINE(HAVE_NODE_TO_CPUMASK, 1, [node_to_cpumask is exported by @@ -1207,6 +1209,26 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +# 2.6.12 merge patch from oracle to convert tree_lock from spinlock to rwlock +AC_DEFUN([LC_RW_TREE_LOCK], +[AC_MSG_CHECKING([if kernel has tree_lock as rwlock]) +tmp_flags="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="-Werror" +LB_LINUX_TRY_COMPILE([ + #include <linux/fs.h> +],[ + struct address_space a; + + write_lock(&a.tree_lock); +],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_RW_TREE_LOCK, 1, [kernel has tree_lock as rw_lock]) +],[ + AC_MSG_RESULT([no]) +]) +EXTRA_KCFLAGS="$tmp_flags" +]) + # 2.6.23 have return type 'void' for unregister_blkdev AC_DEFUN([LC_UNREGISTER_BLKDEV_RETURN_INT], [AC_MSG_CHECKING([if unregister_blkdev return int]) @@ -1305,16 +1327,17 @@ AC_DEFUN([LC_PROG_LINUX], LC_QUOTA_READ LC_COOKIE_FOLLOW_LINK LC_FUNC_RCU + LC_QUOTA64 # does the kernel have VFS intent patches? 
LC_VFS_INTENT_PATCHES + # 2.6.12 + LC_RW_TREE_LOCK + # 2.6.15 LC_INODE_I_MUTEX - # SLES 10 (at least) - LC_SYMVERFILE - # 2.6.17 LC_DQUOTOFF_MUTEX @@ -1343,6 +1366,7 @@ AC_DEFUN([LC_PROG_LINUX], # 2.6.22 LC_INVALIDATE_BDEV_2ARG + LC_FS_RENAME_DOES_D_MOVE # 2.6.23 LC_UNREGISTER_BLKDEV_RETURN_INT LC_KERNEL_SPLICE_READ @@ -1399,11 +1423,8 @@ AC_ARG_ENABLE([liblustre-acl], AC_HELP_STRING([--disable-liblustre-acl], [disable ACL support for liblustre]), [],[enable_liblustre_acl=yes]) -if test x$enable_liblustre != xyes ; then - enable_liblustre_acl='no' -fi AC_MSG_RESULT([$enable_liblustre_acl]) -if test x$enable_liblustre_acl != xno ; then +if test x$enable_liblustre_acl = xyes ; then AC_DEFINE(LIBLUSTRE_POSIX_ACL, 1, Liblustre Support ACL-enabled MDS) fi @@ -1436,9 +1457,9 @@ fi AC_DEFUN([LC_CONFIG_ADAPTIVE_TIMEOUTS], [AC_MSG_CHECKING([whether to enable ptlrpc adaptive timeouts support]) AC_ARG_ENABLE([adaptive_timeouts], - AC_HELP_STRING([--enable-adaptive-timeouts], - [enable ptlrpc adaptive timeouts support]), - [],[enable_adaptive_timeouts='no']) + AC_HELP_STRING([--disable-adaptive-timeouts], + [disable ptlrpc adaptive timeouts support]), + [],[enable_adaptive_timeouts='yes']) AC_MSG_RESULT([$enable_adaptive_timeouts]) if test x$enable_adaptive_timeouts == xyes; then AC_DEFINE(HAVE_AT_SUPPORT, 1, [Enable adaptive timeouts support]) @@ -1535,6 +1556,30 @@ LB_LINUX_TRY_COMPILE([ ]) ]) +# +# LC_QUOTA64 +# linux kernel may have 64-bit limits support +# +AC_DEFUN([LC_QUOTA64], +[AC_MSG_CHECKING([if kernel has 64-bit quota limits support]) +LB_LINUX_TRY_COMPILE([ + #include <linux/kernel.h> + #include <linux/fs.h> + #include <linux/quotaio_v2.h> + int versions[] = V2_INITQVERSIONS_R1; + struct v2_disk_dqblk_r1 dqblk_r1; +],[],[ + AC_DEFINE(HAVE_QUOTA64, 1, [have quota64]) + AC_MSG_RESULT([yes]) + +],[ + AC_MSG_WARN([You have got no 64-bit kernel quota support.]) + AC_MSG_WARN([Continuing with limited quota support.]) + AC_MSG_WARN([quotacheck is needed for filesystems 
with recent quota versions.]) + AC_MSG_RESULT([no]) +]) +]) + # # LC_CONFIGURE # @@ -1546,10 +1591,6 @@ AC_DEFUN([LC_CONFIGURE], # include/liblustre.h AC_CHECK_HEADERS([asm/page.h sys/user.h sys/vfs.h stdint.h blkid/blkid.h]) -# include/lustre/lustre_user.h -# See note there re: __ASM_X86_64_PROCESSOR_H -AC_CHECK_HEADERS([linux/fs.h linux/quota.h]) - # liblustre/llite_lib.h AC_CHECK_HEADERS([xtio.h file.h]) @@ -1563,8 +1604,12 @@ AC_CHECK_FUNCS([inet_ntoa]) # libsysio/src/readlink.c LC_READLINK_SSIZE_T -# lvfs/prng.c -AC_CHECK_HEADERS([linux/random.h]) +# lvfs/prng.c - depends on linux/types.h from liblustre/dir.c +AC_CHECK_HEADERS([linux/random.h], [], [], + [#ifdef HAVE_LINUX_TYPES_H + # include <linux/types.h> + #endif + ]) # utils/llverfs.c AC_CHECK_HEADERS([ext2fs/ext2fs.h]) diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac index 9292e9edd3085b0f89642219df196308a598c0fd..6c48d8de114bf19b3bf7b3a21364ee484d3ab338 100644 --- a/lustre/autoconf/lustre-version.ac +++ b/lustre/autoconf/lustre-version.ac @@ -1,9 +1,15 @@ m4_define([LUSTRE_MAJOR],[1]) -m4_define([LUSTRE_MINOR],[6]) -m4_define([LUSTRE_PATCH],[5]) +m4_define([LUSTRE_MINOR],[7]) +m4_define([LUSTRE_PATCH],[60]) m4_define([LUSTRE_FIX],[0]) # Note: we're starting prerelease versions at 50 this time. +dnl # don't forget to update the service tags info +m4_define([CLIENT_URN],["LUSTRE-180-CLT"]) +m4_define([MDS_URN],["LUSTRE-180-MDS"]) +m4_define([MGS_URN],["LUSTRE-180-MGS"]) +m4_define([OSS_URN],["LUSTRE-180-OSS"]) + dnl # liblustre delta is 0.0.1.32 , next version with fixes is ok, but dnl # after following release candidate/beta would spill this warning already. 
m4_define([LUSTRE_VER_ALLOWED_OFFSET],["OBD_OCD_VERSION(0,0,1,32)"]) @@ -25,6 +31,10 @@ m4_define([LUSTRE_VERSION],m4_if(LUSTRE_FIX,[0],LUSTRE_MAJOR.LUSTRE_MINOR.LUSTRE [AC_LUSTRE_VER_ALLOWED_OFFSET]=LUSTRE_VER_ALLOWED_OFFSET [AC_LUSTRE_LIB_VER_OFFSET_WARN]=LUSTRE_LIB_VER_OFFSET_WARN [AC_LUSTRE_CLI_VER_OFFSET_WARN]=LUSTRE_CLI_VER_OFFSET_WARN +[AC_LUSTRE_CLIENT_URN]=CLIENT_URN +[AC_LUSTRE_MGS_URN]=MGS_URN +[AC_LUSTRE_MDS_URN]=MDS_URN +[AC_LUSTRE_OSS_URN]=OSS_URN AC_SUBST([AC_LUSTRE_MAJOR]) AC_SUBST([AC_LUSTRE_MINOR]) @@ -34,3 +44,7 @@ AC_SUBST([AC_LUSTRE_VERSION_STRING]) AC_SUBST([AC_LUSTRE_VER_ALLOWED_OFFSET]) AC_SUBST([AC_LUSTRE_LIB_VER_OFFSET_WARN]) AC_SUBST([AC_LUSTRE_CLI_VER_OFFSET_WARN]) +AC_SUBST([AC_LUSTRE_CLIENT_URN]) +AC_SUBST([AC_LUSTRE_MDS_URN]) +AC_SUBST([AC_LUSTRE_MGS_URN]) +AC_SUBST([AC_LUSTRE_OSS_URN]) diff --git a/lustre/doc/Makefile.am b/lustre/doc/Makefile.am index 549e740cf419491798d772d4e79263e2876ece08..0d740e90d6cd881f594e0052fd61ffb74d6a56ab 100644 --- a/lustre/doc/Makefile.am +++ b/lustre/doc/Makefile.am @@ -15,7 +15,7 @@ TEXEXPAND = texexpand SUFFIXES = .lin .lyx .pdf .ps .sgml .html .txt .tex .fig .eps .dvi MANFILES = lustre.7 lfs.1 mount.lustre.8 mkfs.lustre.8 tunefs.lustre.8 lctl.8 \ - llverdev.8 llbackup.8 + llverdev.8 llbackup.8 llapi_quotactl.3 if UTILS man_MANS = $(MANFILES) endif diff --git a/lustre/doc/lfs.1 b/lustre/doc/lfs.1 index 7f241096388ed069d0960b3df779cfcbac306202..b4c2cc39f38a9cdefeae523b3c7ae528428a9a85 100644 --- a/lustre/doc/lfs.1 +++ b/lustre/doc/lfs.1 @@ -11,8 +11,10 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the .br .B lfs find [[!] --atime|-A [-+]N] [[!] --mtime|-M [-+]N] \fB[[!] --ctime|-C [-+]N] [--maxdepth|-D N] [--name|-n pattern] - \fB[--print|-p] \fB[--print0|-P] [--obd|-O <uuid>] + \fB[--print|-p] \fB[--print0|-P] [--obd|-O <uuid[s]>] \fB[[!] --size|-S [-+]N[kMGTPE]] [--type |-t {bcdflpsD}] + \fB[[!] --gid|-g N] [[!] --group|-G <name>] + \fB[[!] --uid|-u N] [[!] 
--user|-U <name>] \fB<dirname|filename>\fR .br .B lfs getstripe [--obd|-O <uuid>] [--quiet|-q] [--verbose|-v] @@ -31,15 +33,33 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the .br .B lfs quotaoff [-ug] <filesystem> .br -.B lfs quotainv [-ug] <filesystem> +.B lfs quotainv [-ug] [-f] <filesystem> .br -.B lfs setquota [-u|-g] <username|groupname> <block-softlimit> - \fB<block-hardlimit> <inode-softlimit> <inode-hardlimit> +.B lfs setquota [-u|--user|-g|--group] <username|groupname> + \fB[--block-softlimit <block-softlimit>] + \fB[--block-hardlimit <block-hardlimit>] + \fB[--inode-softlimit <inode-softlimit>] + \fB[--inode-hardlimit <inode-hardlimit>] \fB<filesystem>\fR .br -.B lfs setquota -t [-u|-g] <block-grace> <inode-grace> <filesystem> +.B lfs setquota [-u|--user|-g|--group] <username|groupname> + \fB[-b <block-softlimit>] [-B <block-hardlimit>] + \fB[-i <inode-softlimit>] [-I <inode-hardlimit>] + \fB<filesystem>\fR +.br +.B lfs setquota -t [-u|-g] + \fB[--block-grace <block-grace>] + \fB[--inode-grace <inode-grace>] + \fB<filesystem>\fR +.br +.B lfs setquota -t [-u|-g] + \fB[-b <block-grace>] [-i <inode-grace>] + \fB<filesystem>\fR +.br + +.B lfs quota [-v] [-o obd_uuid] [-u|-g] <username|groupname> <filesystem> .br -.B lfs quota [-o obd_uuid] [-u|-g] <username|groupname> <filesystem> +.B lfs quota <filesystem> .br .B lfs quota -t [-u|-g] <filesystem> .br @@ -57,7 +77,7 @@ Display the status of MDS or OSTs (as specified in the command) or all the serve Report filesystem disk space usage or inodes usage of each MDT/OST. 
.TP .B find -To search the directory tree rooted at the given dir/file name for the files that match the given parameters: \fB--atime\fR (file was last accessed N*24 hours ago), \fB--ctime\fR (file's status was last changed N*24 hours ago), \fB--mtime\fR (file's data was last modified N*24 hours ago), \fB--obd\fR (file has an object on a specific OST), \fB--size\fR (file has size in bytes, or \fBk\fRilo-, \fBM\fRega-, \fBG\fRiga-, \fBT\fRera-, \fBP\fReta-, or \fBE\fRxabytes if a suffix is given), \fB--type\fR (file has the type: \fBb\fRlock, \fBc\fRharacter, \fBd\fRirectory, \fBp\fRipe, \fBf\fRile, sym\fBl\fRink, \fBs\fRocket, or \fBD\fRoor (Solaris)). The option \fB--maxdepth\fR allows find to decend at most N levels of directory tree. The options \fB--print\fR and \fB--print0\fR print full file name, followed by a newline or NUL character correspondingly. Using \fB!\fR before an option negates its meaning (\fIfiles NOT matching the parameter\fR). Using \fB+\fR before a numeric value means \fIfiles with the parameter OR MORE\fR, while \fB-\fR before a numeric value means \fIfiles with the parameter OR LESS\fR. 
+To search the directory tree rooted at the given dir/file name for the files that match the given parameters: \fB--atime\fR (file was last accessed N*24 hours ago), \fB--ctime\fR (file's status was last changed N*24 hours ago), \fB--mtime\fR (file's data was last modified N*24 hours ago), \fB--obd\fR (file has an object on a specific OST or OSTs), \fB--size\fR (file has size in bytes, or \fBk\fRilo-, \fBM\fRega-, \fBG\fRiga-, \fBT\fRera-, \fBP\fReta-, or \fBE\fRxabytes if a suffix is given), \fB--type\fR (file has the type: \fBb\fRlock, \fBc\fRharacter, \fBd\fRirectory, \fBp\fRipe, \fBf\fRile, sym\fBl\fRink, \fBs\fRocket, or \fBD\fRoor (Solaris)), \fB--uid\fR (file has specific numeric user ID), \fB--user\fR (file owned by specific user, numeric user ID allowed), \fB--gid\fR (file has specific group ID), \fB--group\fR (file belongs to specific group, numeric group ID allowed). The option \fB--maxdepth\fR allows find to descend at most N levels of directory tree. The options \fB--print\fR and \fB--print0\fR print full file name, followed by a newline or NUL character correspondingly. Using \fB!\fR before an option negates its meaning (\fIfiles NOT matching the parameter\fR). Using \fB+\fR before a numeric value means \fIfiles with the parameter OR MORE\fR, while \fB-\fR before a numeric value means \fIfiles with the parameter OR LESS\fR. .TP .B getstripe To list the striping info for a given filename or files in a directory, optionally recursively, for all files in a directory tree: \fB--quiet\fR (don't print object IDs), \fB--verbose\fR (print striping parameters), \fB--recursive\fR (recurse into subdirectories).
@@ -65,8 +85,27 @@ To list the striping info for a given filename or files in a directory, optional .B osts List all the OSTs for the filesystem .TP -.B setstripe -To create a new file with a specific striping pattern +.B setstripe [--size stripe-size] [--count stripe-cnt] [--index start-ost] +To create a new file, or set the directory default, with the specified striping parameters. The +.I stripe-count +is the number of OSTs to stripe a file over. A +.I stripe-count +of 0 means to use the filesystem-wide default stripe count (default 1), and a +.I stripe-count +of -1 means to stripe over all available OSTs. The +.I stripe-size +is the number of bytes to store on each OST before moving to the next OST. A +.I stripe-size +of 0 means to use the filesystem-wide default stripe size (default 1MB). The +.I start-ost +is the OST index (starting at 0) on which to start striping for this file. A +.I start-ost +of -1 allows the MDS to specify the starting index and it is strongly +recommended that the starting OST not be given, as this allows space and +load balancing to be done by the MDS as needed. +.TP +.B lfs setstripe -d +Delete the default striping on the specified directory. .TP .B quotachown To change files' owner and group on OSTs of the specified filesystem @@ -80,17 +119,17 @@ To turn filesystem quotas on. Options specify quota for users (-u) groups (-g) a .B quotaoff [-ugf] <filesystem> To turn filesystem quotas off. Options specify quota for users (-u) groups (-g) and force (-f) .TP -.B quotainv [-ug] <filesystem> -Clear quota files, all of their quota entries, for (-u) users or (-g) groups; after quotainv one must use quotacheck before using quotas. USE THIS COMMAND WITH EXTREME CARE, ITS RESULTS CANNOT BE UNDONE. 
+.B quotainv [-ug] [-f] <filesystem> +Clear quota files (administrative quota files if used without -f, operational quota files otherwise), all of their quota entries, for (-u) users or (-g) groups; after quotainv one must use quotacheck before using quotas. DO NOT USE THIS COMMAND UNLESS YOU REALLY KNOW WHAT IT DOES. IT IS MAINLY FOR INTERNAL PURPOSES. .TP -.B setquota [-u|-g] <name> <block-softlimit> <block-hardlimit> <inode-softlimit> <inode-hardlimit> <filesystem> -To set filesystem quotas for users or groups. Limits are specific as blocks and inodes, see EXAMPLES +.B setquota [-u|-g] <name> [--block-softlimit <block-softlimit>] [--block-hardlimit <block-hardlimit>] [--inode-softlimit <inode-softlimit>] [--inode-hardlimit <inode-hardlimit>] <filesystem> +To set filesystem quotas for users or groups. Limits can be specified with -b, -k, -m, -g, -t, -p suffixes which specify units of 1, 2^10, 2^20, 2^30, 2^40 and 2^50 accordingly. Block limits unit is kilobyte (1024) by default and block limits are always kilobyte-grained (even if specified in bytes), see EXAMPLES .TP -.B setquota -t [-u|-g] <block-grace> <inode-grace> <filesystem> +.B setquota -t [-u|-g] [--block-grace <block-grace>] [--inode-grace <inode-grace>] <filesystem> To set filesystem quota grace times for users or groups. Grace time is specified in "XXwXXdXXhXXmXXs" format or as an integer seconds value, see EXAMPLES .TP -.B quota [-o obd_uuid] [-u|-g] <username|groupname> <filesystem> -To display disk usage and limits, either for the full filesystem, or for objects on a specific obd. A user or group name must be specified. +.B quota [-v] [-o obd_uuid] [-u|-g] <username|groupname> <filesystem> +To display disk usage and limits, either for the full filesystem, or for objects on a specific obd. A user or group name can be specified. If both user and group are omitted quotas for current uid/gid are shown. -v provides more verbose (with per-obd statistics) output. 
.TP .B quota -t [-u|-g] <filesystem> To display block and inode grace times for user (-u) or group (-g) quotas @@ -103,7 +142,7 @@ Quit the interactive lfs session .SH EXAMPLES .TP .B $ lfs setstripe -s 128k -c 2 /mnt/lustre/file1 -This creats a file striped on two OSTs with 128kB on each stripe. +This creates a file striped on two OSTs with 128kB on each stripe. .TP .B $ lfs setstripe -d /mnt/lustre/dir This deletes a default stripe pattern on dir. New files will use the default striping pattern created therein. @@ -144,10 +183,10 @@ Turn quotas of user and group on .B $ lfs quotaoff -ug /mnt/lustre Turn quotas of user and group off .TP -.B $ lfs setquota -u bob 0 1000000 0 10000 /mnt/lustre -Set quotas of user `bob': 1GB block quota and 10,000 file quota +.B $ lfs setquota -u bob --block-softlimit 2000000 --block-hardlimit 1000000 /mnt/lustre +Set quotas of user `bob': 1GB block quota hardlimit and 2 GB block quota softlimit .TP -.B $ lfs setquota -t -u 1000 1w4d /mnt/lustre +.B $ lfs setquota -t -u --block-grace 1000 --inode-grace 1w4d /mnt/lustre Set grace times for user quotas: 1000 seconds for block quotas, 1 week and 4 days for inode quotas .TP .B $ lfs quota -u bob /mnt/lustre diff --git a/lustre/doc/llapi_quotactl.3 b/lustre/doc/llapi_quotactl.3 new file mode 100644 index 0000000000000000000000000000000000000000..87ad80c6480680cdc1d4dc66a167914df3691523 --- /dev/null +++ b/lustre/doc/llapi_quotactl.3 @@ -0,0 +1,179 @@ +.TH LLAPI_QUOTACTL 3 +.SH NAME +llapi_quotactl \- manipulate disk quotas on a Lustre filesystem +.SH SYNOPSIS +.nf +.B #include <liblustre.h> +.B #include <lustre/lustre_idl.h> +.B #include <lustre/liblustreapi.h> +.B #include <lustre/lustre_user.h> +.sp +.BI "int llapi_quotactl(char" " *mnt," " struct if_quotactl" " *qctl) +.sp +\fBstruct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; +.sp 
+\fBstruct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 padding; +}; +.sp +\fBstruct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; +.sp +\fBstruct obd_uuid { + char uuid[40]; +}; +.fi +.SH DESCRIPTION +.LP +.IX "filesystem" "quotactl() disk quotas" "" "\fLquotactl()\fP \(em disk quotas" +.IX "quotactl() disk quotas" "" "\fLquotactl()\fP \(em disk quotas" +.IX "disk quotas quotactl()" "" "disk quotas \(em \fLquotactl()\fP" +.LP +The +.B llapi_quotactl(\|) +function manipulates disk quotas on a Lustre filesystem +.I mnt. +.I qc_cmd +indicates a command to be applied to +.SM UID +.IR qc_id +or +.SM GID +.IR qc_id . +.TP 15 +.SB LUSTRE_Q_QUOTAON +Turn on quotas for a Lustre filesystem. +.I qc_type +is USRQUOTA, GRPQUOTA or UGQUOTA (both user and group quotas). +The quota files must exist; they are normally created with the +.BR llapi_quotacheck (3) +call. This call is restricted to the super-user. +.TP +.SB LUSTRE_Q_QUOTAOFF +Turn off quotas for a Lustre filesystem. +.I qc_type +is USRQUOTA, GRPQUOTA or UGQUOTA (both user and group quotas). +This call is restricted to the super-user. +.TP +.SB LUSTRE_Q_GETQUOTA +Get disk quota limits and current usage for user or group +.IR qc_id . +.I qc_type +is USRQUOTA or GRPQUOTA. +.I uuid +may be filled with OBD UUID string to query quota information from a specific node. +.I dqb_valid +may be set nonzero to query information only from MDS. If +.I uuid +is an empty string and +.I dqb_valid +is zero then clusterwide limits and usage are returned. On return +.I obd_dqblk +contains the requested information (block limits unit is kilobyte). +Quotas must be turned on before using this command. +.TP +.SB LUSTRE_Q_SETQUOTA +Set disk quota limits for user or group +.IR qc_id . +.I qc_type +is USRQUOTA or GRPQUOTA. 
+.I dqb_valid +must be set to QIF_ILIMITS, QIF_BLIMITS or QIF_LIMITS (both inode limits and block limits) dependent on updating limits. +.I obd_dqblk +must be filled with limits values (as set in +.I dqb_valid +, block limits unit is kilobyte). +Quotas must be turned on before using this command. +.TP +.SB LUSTRE_Q_GETINFO +Get information about quotas. +.I qc_type +is either USRQUOTA or GRPQUOTA. On return +.I dqi_igrace +is inode grace time (in seconds), +.I dqi_bgrace +is block grace time (in seconds), +.I dqi_flags +is not used by the current Lustre version. +.TP +.SB LUSTRE_Q_SETINFO +Set quota information (like grace times). +.I qc_type +is either USRQUOTA or GRPQUOTA. +.I dqi_igrace +is inode grace time (in seconds), +.I dqi_bgrace +is block grace time (in seconds), +.I dqi_flags +is not used by the current Lustre version and must be zeroed. +.SH RETURN VALUES +.LP +.B llapi_quotactl(\|) +returns: +.TP +0 +on success. +.TP +\-1 +on failure and sets +.B errno +to indicate the error. +.SH ERRORS +.TP 15 +.SM EFAULT +.I qctl +is invalid. +.TP +.SM ENOSYS +The kernel or Lustre modules have not been compiled with the +.SB QUOTA +option. +.TP +.SM ENOMEM +Insufficient memory to complete operation. +.TP +.SM ENOTTY +.I qc_cmd +is invalid. +.TP +.SM EBUSY +Cannot process during quotacheck. +.TP +.SM ENOENT +.I uuid +does not correspond to OBD or +.I mnt +does not exist +.TP +.SM EPERM +The call is privileged and the caller was not the super-user. +.TP +.SM ESRCH +No disc quota is found for the indicated user. +.IP +Quotas have not been turned on for this filesystem. +.SH "SEE ALSO" +.BR lfs (1), +.BR lustre (7) diff --git a/lustre/doc/mount.lustre.8 b/lustre/doc/mount.lustre.8 index ae3c8423fb2a09e467eca25feb24dc805dd8ef0a..8254d881ccc911c26efecc1fbd32482a8ca93d25 100644 --- a/lustre/doc/mount.lustre.8 +++ b/lustre/doc/mount.lustre.8 @@ -96,6 +96,9 @@ options: .BI nosvc Only start the MGC (and MGS, if co-located) for a target service, and not the actual service. 
.TP +.BI nomgs +Start a MDT with a co-located MGS without starting the MGS. +.TP .BI exclude= ostlist Start a client or MDT with a (colon-separated) list of known inactive OSTs. .TP @@ -114,6 +117,18 @@ Start the Lustre metadata target service from /dev/sda1 on mountpoint /mnt/test/ .B mount -t lustre -L testfs-MDT0000 -o abort_recov /mnt/test/mdt Start the testfs-MDT0000 service (by using the disk label), but abort the recovery process. +.SH NOTES +If the Service Tags tool (from the sun-servicetag package) can be found in +/opt/sun/servicetag/bin/stclient an inventory service tag will be created +reflecting the Lustre service being provided. If this tool cannot be found +.B mount.lustre +will silently ignore it and no service tag is created. The +.BR stclient (1) +tool only creates the local service tag. No information is sent to the asset +management system until you run the Registration Client to collect the tags +and then upload them to the inventory system using your inventory system account. +See https://inventory.sun.com/ for more details on a web-based, free, IT asset +management system. .SH BUGS Not very many mount options can be changed with .BR "-o remount" . diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index 79b35fd9c7ea28c95269e4f5dfd7710d8389b127..84f9242c522b2abc09e3d8d8159452e51da34c4a 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -295,6 +295,14 @@ static inline void spin_unlock_bh(spinlock_t *l) {} static inline void spin_lock_irqsave(spinlock_t *a, unsigned long b) {} static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {} +typedef spinlock_t rwlock_t; +#define RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#define read_lock(l) spin_lock(l) +#define read_unlock(l) spin_unlock(l) +#define write_lock(l) spin_lock(l) +#define write_unlock(l) spin_unlock(l) +#define rwlock_init(l) spin_lock_init(l) + #define min(x,y) ((x)<(y) ? (x) : (y)) #define max(x,y) ((x)>(y) ? 
(x) : (y)) diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 1ff876555b64da2250c92eda834a0387d275ec55..d9e4499e6123cc7e6fd460b627fbcb22e511ee6b 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -220,7 +220,8 @@ extern void __d_move(struct dentry *dentry, struct dentry *target); #endif #define CheckWriteback(page, cmd) \ - (!(!PageWriteback(page) && cmd == OBD_BRW_WRITE)) + ((!PageWriteback(page) && (cmd & OBD_BRW_READ)) || \ + (PageWriteback(page) && (cmd & OBD_BRW_WRITE))) #else /* 2.4.. */ @@ -538,5 +539,11 @@ int ll_unregister_blkdev(unsigned int dev, const char *name) #define ll_invalidate_bdev(a,b) invalidate_bdev((a)) #endif +#ifdef HAVE_FS_RENAME_DOES_D_MOVE +#define LL_RENAME_DOES_D_MOVE FS_RENAME_DOES_D_MOVE +#else +#define LL_RENAME_DOES_D_MOVE FS_ODD_RENAME +#endif + #endif /* __KERNEL__ */ #endif /* _COMPAT25_H */ diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index 7a63f2e8de4655151ab4ac6ca95840c275ee48f1..c531c059fee6d50fb0eca9510271eae1e541b307 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -50,6 +50,7 @@ enum { LPROC_LL_FSYNC, LPROC_LL_SETATTR, LPROC_LL_TRUNC, + LPROC_LL_LOCKLESS_TRUNC, LPROC_LL_FLOCK, #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index 75a77cd333d271e23c2c68ecd229a526c456dfde..dcb5ba12eb65b25afe7344be7927b39c311328f9 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -38,10 +38,6 @@ #include <linux/timer.h> #endif -void obd_zombie_impexp_init(void); -void obd_zombie_impexp_cull(void); -extern void (*obd_zombie_impexp_notify)(void); - /* obdo.c */ #ifdef __KERNEL__ void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned ia_valid); diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index 
cceec77bf20d299e745f131c5ad9e2ef7c7b0c19..4f97708ae67bc9d1b2dcff502dc5b0dba10bba23 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -160,7 +160,6 @@ enum lprocfs_fields_flags { struct lprocfs_stats { unsigned int ls_num; /* # of counters */ - unsigned int ls_percpu_size; int ls_flags; /* See LPROCFS_STATS_FLAG_* */ spinlock_t ls_lock; /* Lock used only when there are * no percpu stats areas */ @@ -201,6 +200,25 @@ static inline int opcode_offset(__u32 opc) { (LDLM_LAST_OPC - LDLM_FIRST_OPC) + (MDS_LAST_OPC - MDS_FIRST_OPC) + (OST_LAST_OPC - OST_FIRST_OPC)); + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + + (LLOG_LAST_OPC - LLOG_FIRST_OPC) + + (OBD_LAST_OPC - OBD_FIRST_OPC) + + (MGS_LAST_OPC - MGS_FIRST_OPC) + + (LDLM_LAST_OPC - LDLM_FIRST_OPC) + + (MDS_LAST_OPC - MDS_FIRST_OPC) + + (OST_LAST_OPC - OST_FIRST_OPC)); + } else if (opc < SEQ_LAST_OPC) { + /* SEQ opcode */ + return (opc - SEQ_FIRST_OPC + + (FLD_LAST_OPC - FLD_FIRST_OPC) + + (LLOG_LAST_OPC - LLOG_FIRST_OPC) + + (OBD_LAST_OPC - OBD_FIRST_OPC) + + (MGS_LAST_OPC - MGS_FIRST_OPC) + + (LDLM_LAST_OPC - LDLM_FIRST_OPC) + + (MDS_LAST_OPC - MDS_FIRST_OPC) + + (OST_LAST_OPC - OST_FIRST_OPC)); } else { /* Unknown Opcode */ return -1; @@ -212,7 +230,9 @@ static inline int opcode_offset(__u32 opc) { (LDLM_LAST_OPC - LDLM_FIRST_OPC) + \ (MGS_LAST_OPC - MGS_FIRST_OPC) + \ (OBD_LAST_OPC - OBD_FIRST_OPC) + \ - (LLOG_LAST_OPC - LLOG_FIRST_OPC)) + (LLOG_LAST_OPC - LLOG_FIRST_OPC) + \ + (FLD_LAST_OPC - FLD_FIRST_OPC) + \ + (SEQ_LAST_OPC - SEQ_FIRST_OPC)) #define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ (EXTRA_LAST_OPC - EXTRA_FIRST_OPC)) @@ -345,7 +365,7 @@ struct obd_export; extern int lprocfs_add_clear_entry(struct obd_device * obd, cfs_proc_dir_entry_t *entry); extern int lprocfs_exp_setup(struct obd_export *exp, - lnet_nid_t peer_nid, int *newnid); + lnet_nid_t *peer_nid, int *newnid); extern int lprocfs_exp_cleanup(struct 
obd_export *exp); extern int lprocfs_add_simple(struct proc_dir_entry *root, char *name, read_proc_t *read_proc, @@ -494,14 +514,14 @@ extern struct rw_semaphore _lprocfs_lock; * the import in a client obd_device for a lprocfs entry */ #define LPROCFS_CLIMP_CHECK(obd) do { \ typecheck(struct obd_device *, obd); \ - mutex_down(&(obd)->u.cli.cl_sem); \ + down_read(&(obd)->u.cli.cl_sem); \ if ((obd)->u.cli.cl_import == NULL) { \ - mutex_up(&(obd)->u.cli.cl_sem); \ + up_read(&(obd)->u.cli.cl_sem); \ return -ENODEV; \ } \ } while(0) #define LPROCFS_CLIMP_EXIT(obd) \ - mutex_up(&(obd)->u.cli.cl_sem); + up_read(&(obd)->u.cli.cl_sem); /* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only @@ -586,7 +606,8 @@ static inline void lprocfs_free_obd_stats(struct obd_device *obddev) struct obd_export; static inline int lprocfs_add_clear_entry(struct obd_export *exp) { return 0; } -static inline int lprocfs_exp_setup(struct obd_export *exp) +static inline int lprocfs_exp_setup(struct obd_export *exp, + lnet_nid_t *peer_nid, int *newnid) { return 0; } static inline int lprocfs_exp_cleanup(struct obd_export *exp) { return 0; } diff --git a/lustre/include/lustre/.cvsignore b/lustre/include/lustre/.cvsignore index 282522db0342d8750454b3dc162493b5fc709cc8..ce1315325158902272830a434d727a16332d89fd 100644 --- a/lustre/include/lustre/.cvsignore +++ b/lustre/include/lustre/.cvsignore @@ -1,2 +1,3 @@ +lustre_build_version.h Makefile Makefile.in diff --git a/lustre/include/lustre/liblustreapi.h b/lustre/include/lustre/liblustreapi.h index 4d6c802ae3d102b3fc46e6e118b8d668a0b98e52..f1758fd39a33781012092575f808456ca590115d 100644 --- a/lustre/include/lustre/liblustreapi.h +++ b/lustre/include/lustre/liblustreapi.h @@ -59,6 +59,8 @@ struct find_param { int size_sign; unsigned long long size_units; int size_check; + uid_t uid; + gid_t gid; unsigned long zeroend:1, recursive:1, @@ -66,7 +68,11 @@ struct find_param { obds_printed:1, exclude_pattern:1, exclude_type:1, - 
have_fileinfo:1; + have_fileinfo:1, + exclude_gid:1, + exclude_uid:1, + check_gid:1, + check_uid:1; int verbose; int quiet; @@ -99,10 +105,12 @@ extern int llapi_obd_statfs(char *path, __u32 type, __u32 index, extern int llapi_ping(char *obd_type, char *obd_name); extern int llapi_target_check(int num_types, char **obd_types, char *dir); extern int llapi_catinfo(char *dir, char *keyword, char *node_name); +extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +extern int llapi_file_get_lov_fuuid(int fd, struct obd_uuid *lov_uuid); extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); extern int llapi_is_lustre_mnttype(const char *type); extern int parse_size(char *optarg, unsigned long long *size, - unsigned long long *size_units); + unsigned long long *size_units, int bytes_spec); struct mntent; #define HAVE_LLAPI_IS_LUSTRE_MNT extern int llapi_is_lustre_mnt(struct mntent *mnt); diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 8d4d57b2bbefcb9895400ed76d3310d8550768ac..a90431422dd5cdd5282b5d657f1630a83a9cee04 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -33,8 +33,7 @@ * * We assume all nodes are either little-endian or big-endian, and we * always send messages in the sender's native format. The receiver - * detects the message format by checking the 'magic' field of the message - * (see lustre_msg_swabbed() below). + * detects the message format by checking the 'magic' field of the message. 
* * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines, * implemented either here, inline (trivial implementations) or in @@ -66,6 +65,8 @@ #include <lustre/lustre_user.h> #include <lustre_ver.h> +#include <libcfs/kp30.h> + /* * this file contains all data structures used in Lustre interfaces: * - obdo and obd_request records @@ -110,6 +111,8 @@ #define MGS_REQUEST_PORTAL 26 #define MGS_REPLY_PORTAL 27 #define OST_REQUEST_PORTAL 28 +#define FLD_REQUEST_PORTAL 29 +#define SEQ_METADATA_PORTAL 30 #define SVC_KILLED 1 #define SVC_EVENT 2 @@ -241,6 +244,9 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); #define DLM_LOCKREPLY_OFF 1 /* lockrep offset */ #define DLM_REPLY_REC_OFF 2 /* reply record offset */ +/* only use in req->rq_{req,rep}_swab_mask */ +#define MSG_PTLRPC_HEADER_OFF 31 + /* Flags that are operation-specific go in the top 16 bits. */ #define MSG_OP_FLAG_MASK 0xffff0000 #define MSG_OP_FLAG_SHIFT 16 @@ -322,8 +328,6 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); #define ECHO_CONNECT_SUPPORTED (0) #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT) -#define MAX_QUOTA_COUNT32 (0xffffffffULL) - #define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\ ((patch)<<8) + (fix)) #define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255) @@ -499,11 +503,18 @@ struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ #define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ #define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ +#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ #define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ #define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ #define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ #define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ +#define OBD_MD_FLRMTPERM (0x0000010000000000ULL) /* remote permission */ +#define OBD_MD_FLMDSCAPA 
(0x0000020000000000ULL) /* MDS capability */ +#define OBD_MD_FLOSSCAPA (0x0000040000000000ULL) /* OSS capability */ +#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ +#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ + #define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ @@ -640,8 +651,9 @@ typedef enum { REINT_UNLINK = 4, REINT_RENAME = 5, REINT_OPEN = 6, -// REINT_CLOSE = 7, -// REINT_WRITE = 8, + REINT_SETXATTR = 7, +// REINT_CLOSE = 8, +// REINT_WRITE = 9, REINT_MAX } mds_reint_t; @@ -667,15 +679,241 @@ typedef enum { /* This FULL lock is useful to take on unlink sort of operations */ #define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) -struct ll_fid { - __u64 id; /* holds object id */ - __u32 generation; /* holds object generation */ +extern void lustre_swab_ll_fid (struct ll_fid *fid); - __u32 f_type; /* holds object type or stripe idx when passing it to - * OST for saving into EA. */ +struct lu_fid { + __u64 f_seq; /* holds fid sequence. Lustre should support 2^64 + * objects, thus even if one sequence has one object we + * reach this value. */ + __u32 f_oid; /* fid number within its sequence. */ + __u32 f_ver; /* holds fid version. */ }; -extern void lustre_swab_ll_fid (struct ll_fid *fid); +#define DFID "[0x%16.16"LPF64"x/0x%8.8x:0x%8.8x]" + +#define PFID(fid) \ + fid_seq(fid), \ + fid_oid(fid), \ + fid_ver(fid) + +enum { + /** put FID sequence at this offset in ldlm_res_id. */ + LUSTRE_RES_ID_SEQ_OFF = 0, + /** put FID oid at this offset in ldlm_res_id. */ + LUSTRE_RES_ID_OID_OFF = 1, + /** put FID version at this offset in ldlm_res_id. */ + LUSTRE_RES_ID_VER_OFF = 2, + /** put pdo hash at this offset in ldlm_res_id. 
*/ + LUSTRE_RES_ID_HSH_OFF = 3 +}; + +typedef __u64 seqno_t; + +struct lu_range { + __u64 lr_start; + __u64 lr_end; +}; + +static inline __u64 range_space(struct lu_range *r) +{ + return r->lr_end - r->lr_start; +} + +static inline void range_zero(struct lu_range *r) +{ + r->lr_start = r->lr_end = 0; +} + +static inline int range_within(struct lu_range *r, + __u64 s) +{ + return s >= r->lr_start && s < r->lr_end; +} + +static inline void range_alloc(struct lu_range *r, + struct lu_range *s, + __u64 w) +{ + r->lr_start = s->lr_start; + r->lr_end = s->lr_start + w; + s->lr_start += w; +} +static inline int range_is_sane(struct lu_range *r) +{ + return (r->lr_end >= r->lr_start); +} + +static inline int range_is_zero(struct lu_range *r) +{ + return (r->lr_start == 0 && r->lr_end == 0); +} + +static inline int range_is_exhausted(struct lu_range *r) +{ + return range_space(r) == 0; +} + +#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x]" + +#define PRANGE(range) \ + (range)->lr_start, \ + (range)->lr_end + +enum { + /* + * This is how many FIDs may be allocated in one sequence. + */ + LUSTRE_SEQ_MAX_WIDTH = 0x0000000000004000ULL, +}; + +enum lu_cli_type { + LUSTRE_SEQ_METADATA, + LUSTRE_SEQ_DATA +}; + +struct lu_client_seq { + /* Sequence-controller export. */ + struct obd_export *lcs_exp; + struct semaphore lcs_sem; + + /* + * Range of sequences allowed for allocation. When using lu_client_seq + * on clients, this contains meta-sequence range. And for servers this + * contains super-sequence range. + */ + struct lu_range lcs_space; + + /* This holds last allocated fid in last obtained seq */ + struct lu_fid lcs_fid; + + /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */ + enum lu_cli_type lcs_type; + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lcs_name[80]; + + /* + * Sequence width, that is how many objects may be allocated in one + * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH.
+ */ + __u64 lcs_width; + +}; + +/* + * fid constants + */ +enum { + /* initial fid id value */ + LUSTRE_FID_INIT_OID = 1UL +}; + +extern void lustre_swab_lu_fid(struct lu_fid *fid); + +/* get object sequence */ +static inline __u64 fid_seq(const struct lu_fid *fid) +{ + return fid->f_seq; +} + +/* get object id */ +static inline __u32 fid_oid(const struct lu_fid *fid) +{ + return fid->f_oid; +} + +/* get object version */ +static inline __u32 fid_ver(const struct lu_fid *fid) +{ + return fid->f_ver; +} + +static inline void fid_init(struct lu_fid *fid) +{ + memset(fid, 0, sizeof(*fid)); +} + +/* Normal FID sequence starts from this value, i.e. 1<<33 */ +#define FID_SEQ_START 0x200000000ULL + +/* IDIF sequence starts from this value, i.e. 1<<32 */ +#define IDIF_SEQ_START 0x100000000ULL + +/** + * Check if a fid is igif or not. + * \param fid the fid to be tested. + * \return true if the fid is a igif; otherwise false. + */ +static inline int fid_is_igif(const struct lu_fid *fid) +{ + return fid_seq(fid) > 0 && fid_seq(fid) < IDIF_SEQ_START; +} + +/** + * Check if a fid is idif or not. + * \param fid the fid to be tested. + * \return true if the fid is a idif; otherwise false. + */ +static inline int fid_is_idif(const struct lu_fid *fid) +{ + return fid_seq(fid) >= IDIF_SEQ_START && fid_seq(fid) < FID_SEQ_START; +} + +/** + * Check if a fid is zero. + * \param fid the fid to be tested. + * \return true if the fid is zero; otherwise false. + */ +static inline int fid_is_zero(const struct lu_fid *fid) +{ + return fid_seq(fid) == 0 && fid_oid(fid) == 0; +} + +/** + * Get inode number from a igif. + * \param fid a igif to get inode number from. + * \return inode number for the igif. + */ +static inline ino_t lu_igif_ino(const struct lu_fid *fid) +{ + return fid_seq(fid); +} + +/** + * Get inode generation from a igif. + * \param fid a igif to get inode generation from. + * \return inode generation for the igif. 
+ */ +static inline __u32 lu_igif_gen(const struct lu_fid *fid) +{ + return fid_oid(fid); +} + +/** + * Check if two fids are equal or not. + * \param f0 the first fid + * \param f1 the second fid + * \return true if the two fids are equal; otherwise false. + */ +static inline int lu_fid_eq(const struct lu_fid *f0, + const struct lu_fid *f1) +{ + /* Check that there is no alignment padding. */ + CLASSERT(sizeof *f0 == + sizeof f0->f_seq + sizeof f0->f_oid + sizeof f0->f_ver); + LASSERTF(fid_is_igif(f0) || fid_ver(f0) == 0, DFID"\n", PFID(f0)); + LASSERTF(fid_is_igif(f1) || fid_ver(f1) == 0, DFID"\n", PFID(f1)); + return memcmp(f0, f1, sizeof *f0) == 0; +} + +void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src); +void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src); + +struct ldlm_res_id * +fid_build_reg_res_name(const struct lu_fid *f, struct ldlm_res_id *name); +int fid_res_name_eq(const struct lu_fid *f, const struct ldlm_res_id *name); #define MDS_STATUS_CONN 1 #define MDS_STATUS_LOV 2 @@ -763,13 +1001,49 @@ struct mds_body { extern void lustre_swab_mds_body (struct mds_body *b); +/* struct mdt_body is only used for size checking. + * mdt_body & mds_body should have the same size. 
+ */ +struct mdt_body { + struct lu_fid fid1; + struct lu_fid fid2; + struct lustre_handle handle; + __u64 valid; + __u64 size; /* Offset, in the case of MDS_READPAGE */ + __u64 mtime; + __u64 atime; + __u64 ctime; + __u64 blocks; /* XID, in the case of MDS_READPAGE */ + __u64 ioepoch; + __u64 ino; /* for 1.6 compatibility */ + __u32 fsuid; + __u32 fsgid; + __u32 capability; + __u32 mode; + __u32 uid; + __u32 gid; + __u32 flags; /* from vfs for pin/unpin, MDS_BFLAG for close */ + __u32 rdev; + __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */ + __u32 generation; /* for 1.6 compatibility */ + __u32 suppgid; + __u32 eadatasize; + __u32 aclsize; + __u32 max_mdsize; + __u32 max_cookiesize; + __u32 padding_4; /* also fix lustre_swab_mdt_body */ +}; + #define Q_QUOTACHECK 0x800100 #define Q_INITQUOTA 0x800101 /* init slave limits */ #define Q_GETOINFO 0x800102 /* get obd quota info */ #define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* invalidate operational quotas */ + +#define Q_TYPEMATCH(id, type) \ + ((id) == (type) || (id) == UGQUOTA) -#define Q_TYPESET(oqc, type) \ - ((oqc)->qc_type == type || (oqc)->qc_type == UGQUOTA) +#define Q_TYPESET(oqc, type) Q_TYPEMATCH((oqc)->qc_type, type) #define Q_GETOCMD(oqc) \ ((oqc)->qc_cmd == Q_GETOINFO || (oqc)->qc_cmd == Q_GETOQUOTA) @@ -853,6 +1127,7 @@ struct mds_rec_setattr { #define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */ #define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */ #define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ +#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa); @@ -896,6 +1171,35 @@ struct mds_rec_create { extern void lustre_swab_mds_rec_create (struct mds_rec_create *cr); +struct mdt_rec_create { + __u32 cr_opcode; + __u32 cr_fsuid; + __u32 cr_fsgid; + __u32 cr_cap; + __u32 cr_suppgid1; + __u32 cr_suppgid2; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct 
lustre_handle cr_old_handle; /* handle in case of open replay */ + __u64 cr_time; + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* pad for 64 bits*/ + __u32 cr_mode; + __u32 cr_bias; + __u32 cr_flags; /* for use with open */ + __u32 cr_padding_2; /* pad for 64 bits*/ + __u32 cr_padding_3; /* pad for 64 bits*/ + __u32 cr_padding_4; /* pad for 64 bits*/ +}; + +struct mdt_epoch { + struct lustre_handle handle; + __u64 ioepoch; + __u32 flags; + __u32 padding; +}; + struct mds_rec_join { struct ll_fid jr_fid; __u64 jr_headsize; @@ -903,6 +1207,12 @@ struct mds_rec_join { extern void lustre_swab_mds_rec_join (struct mds_rec_join *jr); +struct mdt_rec_join { + struct lu_fid jr_fid; + __u64 jr_headsize; +}; + + struct mds_rec_link { __u32 lk_opcode; __u32 lk_fsuid; @@ -921,6 +1231,28 @@ struct mds_rec_link { extern void lustre_swab_mds_rec_link (struct mds_rec_link *lk); +struct mdt_rec_link { + __u32 lk_opcode; + __u32 lk_fsuid; + __u32 lk_fsgid; + __u32 lk_cap; + __u32 lk_suppgid1; + __u32 lk_suppgid2; + struct lu_fid lk_fid1; + struct lu_fid lk_fid2; + __u64 lk_time; + __u64 lk_padding_1; + __u64 lk_padding_2; + __u64 lk_padding_3; + __u64 lk_padding_4; + __u32 lk_bias; + __u32 lk_padding_5; + __u32 lk_padding_6; + __u32 lk_padding_7; + __u32 lk_padding_8; + __u32 lk_padding_9; +}; + struct mds_rec_unlink { __u32 ul_opcode; __u32 ul_fsuid; @@ -939,6 +1271,28 @@ struct mds_rec_unlink { extern void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul); +struct mdt_rec_unlink { + __u32 ul_opcode; + __u32 ul_fsuid; + __u32 ul_fsgid; + __u32 ul_cap; + __u32 ul_suppgid1; + __u32 ul_padding2; + struct lu_fid ul_fid1; + struct lu_fid ul_fid2; + __u64 ul_time; + __u64 ul_padding_2; + __u64 ul_padding_3; + __u64 ul_padding_4; + __u64 ul_padding_5; + __u32 ul_bias; + __u32 ul_mode; + __u32 ul_padding_6; + __u32 ul_padding_7; + __u32 ul_padding_8; + __u32 ul_padding_9; +}; + struct mds_rec_rename { __u32 rn_opcode; __u32 rn_fsuid; @@ -957,6 +1311,97 @@ struct 
mds_rec_rename { extern void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn); +struct mdt_rec_rename { + __u32 rn_opcode; + __u32 rn_fsuid; + __u32 rn_fsgid; + __u32 rn_cap; + __u32 rn_suppgid1; + __u32 rn_suppgid2; + struct lu_fid rn_fid1; + struct lu_fid rn_fid2; + __u64 rn_time; + __u64 rn_padding_1; + __u64 rn_padding_2; + __u64 rn_padding_3; + __u64 rn_padding_4; + __u32 rn_bias; /* some operation flags */ + __u32 rn_mode; /* cross-ref rename has mode */ + __u32 rn_padding_5; + __u32 rn_padding_6; + __u32 rn_padding_7; + __u32 rn_padding_8; +}; + +struct mdt_rec_setattr { + __u32 sa_opcode; + __u32 sa_fsuid; + __u32 sa_fsgid; + __u32 sa_cap; + __u32 sa_suppgid; + __u32 sa_padding_1; + struct lu_fid sa_fid; + __u64 sa_valid; + __u32 sa_uid; + __u32 sa_gid; + __u64 sa_size; + __u64 sa_blocks; + __u64 sa_mtime; + __u64 sa_atime; + __u64 sa_ctime; + __u32 sa_attr_flags; + __u32 sa_mode; + __u32 sa_padding_2; + __u32 sa_padding_3; + __u32 sa_padding_4; + __u32 sa_padding_5; +}; + +struct mdt_rec_setxattr { + __u32 sx_opcode; + __u32 sx_fsuid; + __u32 sx_fsgid; + __u32 sx_cap; + __u32 sx_suppgid1; + __u32 sx_suppgid2; + struct lu_fid sx_fid; + __u64 sx_padding_1; /* These three members are lu_fid size */ + __u32 sx_padding_2; + __u32 sx_padding_3; + __u64 sx_valid; + __u64 sx_padding_4; + __u64 sx_padding_5; + __u64 sx_padding_6; + __u64 sx_padding_7; + __u32 sx_size; + __u32 sx_flags; + __u32 sx_padding_8; + __u32 sx_padding_9; + __u32 sx_padding_10; + __u32 sx_padding_11; +}; + +/* + * capa related definitions + */ +#define CAPA_HMAC_MAX_LEN 64 +#define CAPA_HMAC_KEY_MAX_LEN 56 + +/* NB take care when changing the sequence of elements this struct, + * because the offset info is used in find_capa() */ +struct lustre_capa { + struct lu_fid lc_fid; /* fid */ + __u64 lc_opc; /* operations allowed */ + __u32 lc_uid; /* uid, it is obsolete, but maybe used in + * future, reserve it for 64-bits aligned.*/ + __u32 lc_flags; /* HMAC algorithm & flags */ + __u32 
lc_keyid; /* key used for the capability */ + __u32 lc_timeout; /* capa timeout value (sec) */ + __u64 lc_expiry; /* expiry time (sec) */ + __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /* HMAC */ +} __attribute__((packed)); + + /* * LOV data structures */ @@ -1553,23 +1998,7 @@ struct qunit_data_old2 { #warning "remove quota code above for format absolete in new release" #endif -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) -struct qunit_data_old { - __u32 qd_id; /* ID appiles to (uid, gid) */ - __u32 qd_type; /* Quota type (USRQUOTA, GRPQUOTA) */ - __u32 qd_count; /* acquire/release count (bytes for block quota) */ - __u32 qd_isblk; /* Block quota or file quota */ -}; -#else -#warning "remove quota code above for format absolete in new release" -#endif - extern void lustre_swab_qdata(struct qunit_data *d); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) -extern void lustre_swab_qdata_old(struct qunit_data_old *d); -#else -#warning "remove quota code above for format absolete in new release" -#endif #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 9, 0, 0) extern void lustre_swab_qdata_old2(struct qunit_data_old2 *d); #else @@ -1585,6 +2014,25 @@ typedef enum { QUOTA_DQREL = 602, } quota_cmd_t; + +enum fld_rpc_opc { + FLD_QUERY = 600, + FLD_LAST_OPC, + FLD_FIRST_OPC = FLD_QUERY +}; + +enum seq_rpc_opc { + SEQ_QUERY = 700, + SEQ_LAST_OPC, + SEQ_FIRST_OPC = SEQ_QUERY +}; + +enum seq_op { + SEQ_ALLOC_SUPER = 0, + SEQ_ALLOC_META = 1 +}; + + #define JOIN_FILE_ALIGN 4096 #define QUOTA_REQUEST 1 diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index 354127c0e5ffc756a1b27611f10dd878fe8df7a1..f9b2b2cf7a7282208b60b0a8fa0b874024c81352 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -77,6 +77,10 @@ struct obd_statfs; #define LL_IOC_OBD_STATFS IOC_OBD_STATFS #define IOC_MDC_GETSTRIPE IOC_MDC_GETFILESTRIPE +/* Do not define O_CHECK_STALE as 0200000000, + * which is conflict with 
MDS_OPEN_OWNEROVERRIDE */ +#define O_CHECK_STALE 020000000 /* hopefully this does not conflict */ + #define O_LOV_DELAY_CREATE 0100000000 /* hopefully this does not conflict */ #define O_JOIN_FILE 0400000000 /* hopefully this does not conflict */ @@ -129,6 +133,19 @@ struct ll_recreate_obj { __u32 lrc_ost_idx; }; +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object type or stripe idx when passing it to + * OST for saving into EA. */ +}; + +struct filter_fid { + struct ll_fid ff_fid; /* ff_fid.f_type == file stripe number */ + __u64 ff_objid; + __u64 ff_group; +}; + struct obd_uuid { char uuid[40]; }; @@ -171,12 +188,11 @@ static inline char *obd_uuid2str(struct obd_uuid *uuid) #define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ #define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ /* lustre-specific control commands */ -#define LUSTRE_Q_INVALIDATE 0x80000b /* invalidate quota data */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* invalidate quota data */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* invalidate filter quota data */ #define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */ -#define QFMT_LDISKFS 2 /* QFMT_VFS_V0(2), quota format for ldiskfs */ - struct if_quotacheck { char obd_type[16]; struct obd_uuid obd_uuid; diff --git a/lustre/include/lustre_cache.h b/lustre/include/lustre_cache.h index 291d88293675b7ccefe54c20424aad5fbf8c4f29..d5a5337bd2ee5aa49d70c8c54a7ff43f0315baa9 100644 --- a/lustre/include/lustre_cache.h +++ b/lustre/include/lustre_cache.h @@ -13,6 +13,7 @@ struct osc_async_page; struct page_removal_cb_element { struct list_head prce_list; obd_page_removal_cb_t prce_callback; + atomic_t prce_refcnt; }; typedef int (*cache_iterate_extents_cb_t)(struct lustre_cache *, @@ -27,6 +28,7 @@ struct lustre_cache { struct list_head lc_locks_list; spinlock_t lc_locks_list_lock; struct list_head lc_page_removal_callback_list; + rwlock_t lc_page_removal_cb_lock; 
/* iterate vs modify list */ struct obd_device *lc_obd; obd_pin_extent_cb lc_pin_extent_cb; }; diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h index 805cf40fba2c5aa4681612a184e9ae458407f16b..70394bba557ceaa57904291015753b6273cff8bb 100644 --- a/lustre/include/lustre_disk.h +++ b/lustre/include/lustre_disk.h @@ -53,14 +53,14 @@ #define LDD_F_PARAM 0x0400 /* process as lctl conf_param */ enum ldd_mount_type { - LDD_MT_EXT3 = 0, + LDD_MT_EXT3 = 0, LDD_MT_LDISKFS, - LDD_MT_SMFS, + LDD_MT_SMFS, LDD_MT_REISERFS, LDD_MT_LDISKFS2, LDD_MT_LAST }; - + static inline char *mt_str(enum ldd_mount_type mt) { static char *mount_type_string[] = { @@ -84,7 +84,7 @@ struct lustre_disk_data { __u32 ldd_feature_compat; /* compatible feature flags */ __u32 ldd_feature_rocompat;/* read-only compatible feature flags */ __u32 ldd_feature_incompat;/* incompatible feature flags */ - + __u32 ldd_config_ver; /* config rewrite count - not used */ __u32 ldd_flags; /* LDD_SV_TYPE */ __u32 ldd_svindex; /* server index (0001), must match @@ -93,7 +93,7 @@ struct lustre_disk_data { char ldd_fsname[64]; /* filesystem this server is part of */ char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ - + /*200*/ char ldd_userdata[1024 - 200]; /* arbitrary user string */ /*1024*/__u8 ldd_padding[4096 - 1024]; /*4096*/char ldd_mount_opts[4096]; /* target fs mount opts */ @@ -151,6 +151,8 @@ struct lustre_mount_data { #define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */ #define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers, no other services */ +#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing + existing MGS services */ #define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) @@ -167,7 +169,7 @@ struct lustre_mount_data { * 2^n * CFS_PAGE_SIZE * 8 for the number of bits that fit an order-n allocation. 
*/ #define LR_MAX_CLIENTS (CFS_PAGE_SIZE * 8) - + /* COMPAT_146 */ #define OBD_COMPAT_OST 0x00000002 /* this is an OST (temporary) */ #define OBD_COMPAT_MDT 0x00000004 /* this is an MDT (temporary) */ @@ -180,6 +182,8 @@ struct lustre_mount_data { #define OBD_INCOMPAT_OST 0x00000002 /* this is an OST */ #define OBD_INCOMPAT_MDT 0x00000004 /* this is an MDT */ #define OBD_INCOMPAT_COMMON_LR 0x00000008 /* common last_rvcd format */ +#define OBD_INCOMPAT_FID 0x00000010 /* FID is enabled */ +#define OBD_INCOMPAT_SOM 0x00000020 /* Size-On-MDS is enabled */ /* Data stored per server at the head of the last_rcvd file. In le32 order. diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 55696b55a19ade645f24f2219a307239184f3053..56c769b995c727c80c343ad113f0f7f7dd07de0b 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -152,12 +152,6 @@ typedef enum { #define LDLM_CB_BLOCKING 1 #define LDLM_CB_CANCELING 2 -/* position flag of skip list pointers */ -#define LDLM_SL_HEAD(skip_list) ((skip_list)->next != NULL) -#define LDLM_SL_TAIL(skip_list) ((skip_list)->prev != NULL) -#define LDLM_SL_EMPTY(skip_list) ((skip_list)->next == NULL && \ - (skip_list)->prev == NULL) - /* compatibility matrix */ #define LCK_COMPAT_EX LCK_NL #define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR) @@ -251,6 +245,7 @@ struct ldlm_pool { atomic_t pl_grant_speed; /* Grant speed (GR-CR) per T. */ __u64 pl_server_lock_volume; /* Server lock volume. * Protected by pl_lock */ + __u64 pl_client_lock_volume; /* Client lock volume. */ atomic_t pl_lock_volume_factor; /* Lock volume factor.
*/ time_t pl_recalc_time; /* Time when last slv from @@ -271,7 +266,7 @@ typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, struct ldlm_valblock_ops { int (*lvbo_init)(struct ldlm_resource *res); - int (*lvbo_update)(struct ldlm_resource *res, struct lustre_msg *m, + int (*lvbo_update)(struct ldlm_resource *res, struct ptlrpc_request *r, int buf_idx, int increase); }; @@ -336,6 +331,8 @@ struct ldlm_namespace { unsigned ns_max_nolock_size; struct adaptive_timeout ns_at_estimate;/* estimated lock callback time*/ + /* backward link to obd, required for ldlm pool to store new SLV. */ + struct obd_device *ns_obd; }; static inline int ns_is_client(struct ldlm_namespace *ns) @@ -414,11 +411,8 @@ struct ldlm_lock { /* protected by ns_hash_lock. FIXME */ struct list_head l_lru; - /* protected by lr_lock */ - struct list_head l_res_link; // position in one of three res lists - - struct list_head l_sl_mode; // skip pointer for request mode - struct list_head l_sl_policy; // skip pointer for inodebits + /* protected by lr_lock, linkage to resource's lock queues */ + struct list_head l_res_link; struct ldlm_interval *l_tree_node; /* tree node for ldlm_extent */ @@ -477,6 +471,10 @@ struct ldlm_lock { struct list_head l_cp_ast; struct ldlm_lock *l_blocking_lock; int l_bl_ast_run; + + /* protected by lr_lock, linkages to "skip lists" */ + struct list_head l_sl_mode; + struct list_head l_sl_policy; }; struct ldlm_resource { @@ -617,7 +615,7 @@ int ldlm_request_cancel(struct ptlrpc_request *req, int ldlm_del_waiting_lock(struct ldlm_lock *lock); int ldlm_refresh_waiting_lock(struct ldlm_lock *lock); int ldlm_get_ref(void); -void ldlm_put_ref(int force); +void ldlm_put_ref(void); /* ldlm_lock.c */ ldlm_processing_policy ldlm_get_processing_policy(struct ldlm_resource *res); @@ -636,12 +634,12 @@ static inline struct ldlm_lock *ldlm_handle2lock(struct lustre_handle *h) } static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, - struct lustre_msg 
*m, int buf_idx, + struct ptlrpc_request *r, int buf_idx, int increase) { if (res->lr_namespace->ns_lvbo && res->lr_namespace->ns_lvbo->lvbo_update) { - return res->lr_namespace->ns_lvbo->lvbo_update(res, m, buf_idx, + return res->lr_namespace->ns_lvbo->lvbo_update(res, r, buf_idx, increase); } return 0; @@ -681,6 +679,8 @@ void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_allow_match(struct ldlm_lock *lock); +int ldlm_lock_fast_match(struct ldlm_lock *, int, obd_off, obd_off, void **); +void ldlm_lock_fast_release(void *, int); ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags, struct ldlm_res_id *, ldlm_type_t type, ldlm_policy_data_t *, ldlm_mode_t mode, @@ -696,16 +696,20 @@ void ldlm_lock_dump_handle(int level, struct lustre_handle *); void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); /* resource.c */ -struct ldlm_namespace *ldlm_namespace_new(char *name, ldlm_side_t client, - ldlm_appetite_t apt); +struct ldlm_namespace * +ldlm_namespace_new(struct obd_device *obd, char *name, + ldlm_side_t client, ldlm_appetite_t apt); int ldlm_namespace_cleanup(struct ldlm_namespace *ns, int flags); -int ldlm_namespace_free(struct ldlm_namespace *ns, int force); -void ldlm_namespace_move(struct ldlm_namespace *ns, ldlm_side_t client); -struct ldlm_namespace *ldlm_namespace_first(ldlm_side_t client); +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client); +struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client); +void ldlm_namespace_get_locked(struct ldlm_namespace *ns); +void 
ldlm_namespace_put_locked(struct ldlm_namespace *ns, int wakeup); void ldlm_namespace_get(struct ldlm_namespace *ns); void ldlm_namespace_put(struct ldlm_namespace *ns, int wakeup); -void ldlm_namespace_get_nolock(struct ldlm_namespace *ns); -void ldlm_namespace_put_nolock(struct ldlm_namespace *ns, int wakeup); int ldlm_proc_setup(void); #ifdef LPROCFS void ldlm_proc_cleanup(void); @@ -825,9 +829,12 @@ int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, void ldlm_pool_fini(struct ldlm_pool *pl); int ldlm_pool_setup(struct ldlm_pool *pl, int limit); int ldlm_pool_recalc(struct ldlm_pool *pl); +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); __u64 ldlm_pool_get_slv(struct ldlm_pool *pl); +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl); __u32 ldlm_pool_get_limit(struct ldlm_pool *pl); void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv); +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv); void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit); void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock); void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock); diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index 7ece7a25bfe96d2045e19398eb1a46eda17b0081..43fc0bb0e303ba1d9f690368a6fb5bd96883adbf 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -10,12 +10,10 @@ #include <lprocfs_status.h> /* Data stored per client in the last_rcvd file. In le32 order. 
*/ -struct mds_client_data; - struct mds_export_data { struct list_head med_open_head; spinlock_t med_open_lock; /* lock med_open_head, mfd_list*/ - struct mds_client_data *med_mcd; + struct lsd_client_data *med_lcd; __u64 med_ibits_known; loff_t med_lr_off; int med_lr_idx; @@ -43,10 +41,9 @@ struct ec_export_data { /* echo client */ }; /* In-memory access to client data from OST struct */ -struct filter_client_data; struct filter_export_data { spinlock_t fed_lock; /* protects fed_open_head */ - struct filter_client_data *fed_fcd; + struct lsd_client_data *fed_lcd; loff_t fed_lr_off; int fed_lr_idx; long fed_dirty; /* in bytes */ @@ -88,6 +85,7 @@ struct obd_export { struct ldlm_export_data exp_ldlm_data; struct list_head exp_outstanding_replies; time_t exp_last_request_time; + struct list_head exp_req_replay_queue; spinlock_t exp_lock; /* protects flags int below */ /* ^ protects exp_outstanding_replies too */ __u64 exp_connect_flags; diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index fb663237e745b82a9c15f7f1517256f368adfe7f..1937beefbafff056ef45ce85b263642ef105e278 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -64,7 +64,7 @@ struct obd_import_conn { __u64 oic_last_attempt; /* jiffies, 64-bit */ }; -#define IMP_AT_MAX_PORTALS 4 +#define IMP_AT_MAX_PORTALS 8 struct imp_at { int iat_portal[IMP_AT_MAX_PORTALS]; struct adaptive_timeout iat_net_latency; diff --git a/lustre/include/lustre_mds.h b/lustre/include/lustre_mds.h index 47d5073807ab7ba5ce0bcf7fc1e2079f381bedd7..99c0186d34151e82fb724b82a16aabe4437edb1a 100644 --- a/lustre/include/lustre_mds.h +++ b/lustre/include/lustre_mds.h @@ -171,8 +171,8 @@ struct obd_client_handle; void mdc_set_open_replay_data(struct obd_client_handle *och, struct ptlrpc_request *open_req); void mdc_clear_open_replay_data(struct obd_client_handle *och); -int mdc_close(struct obd_export *, struct obdo *, struct obd_client_handle *, - struct ptlrpc_request **); +int 
mdc_close(struct obd_export *, struct mdc_op_data *, struct obdo *, + struct obd_client_handle *, struct ptlrpc_request **); int mdc_readpage(struct obd_export *exp, struct ll_fid *mdc_fid, __u64 offset, struct page *, struct ptlrpc_request **); int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data, @@ -196,10 +196,10 @@ int mdc_resource_get_unused(struct obd_export *exp, struct ll_fid *fid, void mdc_store_inode_generation(struct ptlrpc_request *req, int reqoff, int repoff); int mdc_llog_process(struct obd_export *, char *logname, llog_cb_t, void *data); -int mdc_done_writing(struct obd_export *exp, struct obdo *); +int mdc_done_writing(struct obd_export *, struct mdc_op_data *, struct obdo *); -static inline void mdc_pack_fid(struct ll_fid *fid, obd_id ino, __u32 gen, - int type) +static inline void ll_pack_fid(struct ll_fid *fid, obd_id ino, __u32 gen, + int type) { fid->id = ino; fid->generation = gen; diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 1bd2eabc5e4bae950f303d49456a723208dfc1d5..7e57a14173c86f08b9a891dd20b37f27d2f2e422 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -373,7 +373,7 @@ static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index) } static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index) -{ +{ LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); return req->rq_req_swab_mask & (1 << index); } @@ -384,6 +384,17 @@ static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index) return req->rq_rep_swab_mask & (1 << index); } +static inline int lustre_req_need_swab(struct ptlrpc_request *req) +{ + return req->rq_req_swab_mask & (1 << MSG_PTLRPC_HEADER_OFF); +} + +static inline int lustre_rep_need_swab(struct ptlrpc_request *req) +{ + return req->rq_rep_swab_mask & (1 << MSG_PTLRPC_HEADER_OFF); +} + + static inline const char * ptlrpc_rqphase2str(struct ptlrpc_request *req) { @@ -530,6 +541,7 @@ struct ptlrpc_service { int 
srv_watchdog_factor; /* soft watchdog timeout mutiplier */ unsigned srv_cpu_affinity:1; /* bind threads to CPUs */ unsigned srv_at_check:1; /* check early replies */ + cfs_time_t srv_at_checktime; /* debug */ __u32 srv_req_portal; __u32 srv_rep_portal; @@ -637,6 +649,7 @@ static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc) #define PTLRPC_REPLY_EARLY 0x02 int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); int ptlrpc_error(struct ptlrpc_request *req); void ptlrpc_resend_req(struct ptlrpc_request *request); int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); @@ -779,7 +792,6 @@ int lustre_msg_buflen(struct lustre_msg *m, int n); void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len); int lustre_msg_bufcount(struct lustre_msg *m); char *lustre_msg_string (struct lustre_msg *m, int n, int max_len); -void *lustre_swab_buf(struct lustre_msg *, int n, int minlen, void *swabber); void *lustre_swab_reqbuf(struct ptlrpc_request *req, int n, int minlen, void *swabber); void *lustre_swab_repbuf(struct ptlrpc_request *req, int n, int minlen, diff --git a/lustre/include/lustre_quota.h b/lustre/include/lustre_quota.h index eb8b227d99e5c7531914fbff67751d6c0fd74c4b..a2d363596bc722adba39a5f5bcc40ef83d589d50 100644 --- a/lustre/include/lustre_quota.h +++ b/lustre/include/lustre_quota.h @@ -148,14 +148,21 @@ static inline int lustre_quota_convert(struct lustre_quota_info *lqi, typedef int (*dqacq_handler_t) (struct obd_device * obd, struct qunit_data * qd, int opc); + +/* user quota is turned on on filter */ +#define LQC_USRQUOTA_FLAG (1 << 0) +/* group quota is turned on on filter */ +#define LQC_GRPQUOTA_FLAG (1 << 1) + +#define UGQUOTA2LQC(id) ((Q_TYPEMATCH(id, USRQUOTA) ? LQC_USRQUOTA_FLAG : 0) | \ + (Q_TYPEMATCH(id, GRPQUOTA) ? 
LQC_GRPQUOTA_FLAG : 0)) + struct lustre_quota_ctxt { struct super_block *lqc_sb; /* superblock this applies to */ struct obd_import *lqc_import; /* import used to send dqacq/dqrel RPC */ dqacq_handler_t lqc_handler; /* dqacq/dqrel RPC handler, only for quota master */ + unsigned long lqc_flags; /* quota flags */ unsigned long lqc_recovery:1, /* Doing recovery */ - lqc_atype:2, /* Turn on user/group quota at setup automatically, - * 0: none, 1: user quota, 2: group quota, 3: both */ - lqc_status:1, /* Quota status. 0:Off, 1:On */ lqc_switch_qs:1; /* the function of change qunit size * 0:Off, 1:On */ unsigned long lqc_iunit_sz; /* original unit size of file quota and diff --git a/lustre/include/lustre_ver.h.in b/lustre/include/lustre_ver.h.in index 1c63510944527635cf9dd3a665a92309023b731c..9027021efa6cf32fa017f758ef8e3644d3e791ad 100644 --- a/lustre/include/lustre_ver.h.in +++ b/lustre/include/lustre_ver.h.in @@ -9,6 +9,10 @@ #define LUSTRE_PATCH @AC_LUSTRE_PATCH@ #define LUSTRE_FIX @AC_LUSTRE_FIX@ #define LUSTRE_VERSION_STRING "@AC_LUSTRE_VERSION_STRING@" +#define CLIENT_URN "@AC_LUSTRE_CLIENT_URN@" +#define MDS_URN "@AC_LUSTRE_MDS_URN@" +#define MGS_URN "@AC_LUSTRE_MGS_URN@" +#define OSS_URN "@AC_LUSTRE_OSS_URN@" #define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX) diff --git a/lustre/include/obd.h b/lustre/include/obd.h index d6de0c92c4cc9a2e519b0c0b30ea7161dcb1aa69..4d8c4e8abf087f3259a4445c6404639edfdce2f9 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -244,6 +244,7 @@ struct obd_device_target { struct super_block *obt_sb; atomic_t obt_quotachecking; struct lustre_quota_ctxt obt_qctxt; + lustre_quota_version_t obt_qfmt; }; typedef void (*obd_pin_extent_cb)(void *data); @@ -337,7 +338,7 @@ struct mdc_rpc_lock; struct obd_import; struct lustre_cache; struct client_obd { - struct semaphore cl_sem; + struct rw_semaphore cl_sem; struct obd_uuid cl_target_uuid; struct obd_import *cl_import; /* ptlrpc connection 
state */ int cl_conn_count; @@ -421,6 +422,10 @@ struct client_obd { /* used by quotacheck */ int cl_qchk_stat; /* quotacheck stat of the peer */ + + /* sequence manager */ + struct lu_client_seq *cl_seq; + atomic_t cl_resends; /* resend count */ /* Cache of triples */ struct lustre_cache *cl_cache; @@ -793,7 +798,7 @@ struct obd_device { cfs_waitq_t obd_llog_waitq; struct list_head obd_exports; int obd_num_exports; - spinlock_t nid_lock; + spinlock_t obd_nid_lock; struct ldlm_namespace *obd_namespace; struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ /* a spinlock is OK for what we do now, may need a semaphore later */ @@ -856,6 +861,11 @@ struct obd_device { unsigned int obd_cntr_base; atomic_t obd_evict_inprogress; cfs_waitq_t obd_evict_inprogress_waitq; + + /* Ldlm pool part. Save last calculated SLV and Limit. */ + rwlock_t obd_pool_lock; + int obd_pool_limit; + __u64 obd_pool_slv; }; #define OBD_OPT_FORCE 0x0001 @@ -878,13 +888,27 @@ enum obd_cleanup_stage { }; /* get/set_info keys */ -#define KEY_MDS_CONN "mds_conn" -#define KEY_NEXT_ID "next_id" -#define KEY_LOVDESC "lovdesc" -#define KEY_INIT_RECOV "initial_recov" -#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_MDS_CONN "mds_conn" +#define KEY_NEXT_ID "next_id" +#define KEY_LOVDESC "lovdesc" +#define KEY_INIT_RECOV "initial_recov" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" #define KEY_LOV_IDX "lov_idx" #define KEY_LAST_ID "last_id" +#define KEY_LOCK_TO_STRIPE "lock_to_stripe" +#define KEY_CHECKSUM "checksum" +#define KEY_READONLY "readonly" +#define KEY_UNLINKED "unlinked" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_REGISTER_TARGET "register_target" +#define KEY_SET_FS "set_fs" +#define KEY_CLEAR_FS "clear_fs" +#define KEY_SET_INFO "set_info" +#define KEY_BLOCKSIZE "blocksize" +#define KEY_BLOCKSIZE_BITS "blocksize_bits" +#define KEY_MAX_EASIZE "max_ea_size" +/* XXX unused */ +#define KEY_ASYNC "async" struct obd_ops { struct module *o_owner; @@ -919,6 +943,10 @@ struct 
obd_ops { struct obd_connect_data *ocd); int (*o_disconnect)(struct obd_export *exp); + /* Initialize/finalize fids infrastructure. */ + int (*o_fid_init)(struct obd_export *exp); + int (*o_fid_fini)(struct obd_export *exp); + int (*o_statfs)(struct obd_device *obd, struct obd_statfs *osfs, __u64 max_age, __u32 flags); int (*o_statfs_async)(struct obd_device *obd, struct obd_info *oinfo, @@ -959,6 +987,14 @@ struct obd_ops { struct obd_async_page_ops *ops, void *data, void **res, int nocache, struct lustre_handle *lockh); + int (*o_reget_short_lock)(struct obd_export *exp, + struct lov_stripe_md *lsm, + void **res, int rw, + obd_off start, obd_off end, + void **cookie); + int (*o_release_short_lock)(struct obd_export *exp, + struct lov_stripe_md *lsm, obd_off end, + void *cookie, int rw); int (*o_queue_async_io)(struct obd_export *exp, struct lov_stripe_md *lsm, struct lov_oinfo *loi, void *cookie, @@ -1033,7 +1069,7 @@ struct obd_ops { int (*o_llog_finish)(struct obd_device *obd, int count); /* metadata-only methods */ - int (*o_pin)(struct obd_export *, obd_id ino, __u32 gen, int type, + int (*o_pin)(struct obd_export *, struct ll_fid *, struct obd_client_handle *, int flag); int (*o_unpin)(struct obd_export *, struct obd_client_handle *, int); @@ -1098,7 +1134,7 @@ static inline struct lsm_operations *lsm_op_find(int magic) case LOV_MAGIC_JOIN: return &lsm_join_ops; default: - CERROR("Cannot recognize lsm_magic %d", magic); + CERROR("Cannot recognize lsm_magic %x\n", magic); return NULL; } } @@ -1113,15 +1149,18 @@ static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno, int error) { if (error) { - CERROR("%s: transno "LPD64" commit error: %d\n", + CERROR("%s: transno "LPU64" commit error: %d\n", obd->obd_name, transno, error); return; } - CDEBUG(D_HA, "%s: transno "LPD64" committed\n", - obd->obd_name, transno); if (transno > obd->obd_last_committed) { + CDEBUG(D_HA, "%s: transno "LPU64" committed\n", + obd->obd_name, transno); 
obd->obd_last_committed = transno; ptlrpc_commit_replies (obd); + } else { + CDEBUG(D_INFO, "%s: transno "LPU64" committed\n", + obd->obd_name, transno); } } diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index e2a99e71665a61ef8735d66fd4e0dad194c697e2..d5405c6059b6a54d15b270a9e3f3a337340eb56c 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -84,6 +84,10 @@ char *obd_export_nid2str(struct obd_export *exp); int obd_export_evict_by_nid(struct obd_device *obd, char *nid); int obd_export_evict_by_uuid(struct obd_device *obd, char *uuid); +int obd_zombie_impexp_init(void); +void obd_zombie_impexp_stop(void); +void obd_zombie_impexp_cull(void); + /* obd_config.c */ int class_process_config(struct lustre_cfg *lcfg); int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, @@ -688,6 +692,30 @@ static inline int obd_disconnect(struct obd_export *exp) RETURN(rc); } +static inline int obd_fid_init(struct obd_export *exp) +{ + int rc; + ENTRY; + + OBD_CHECK_OP(exp->exp_obd, fid_init, 0); + EXP_COUNTER_INCREMENT(exp, fid_init); + + rc = OBP(exp->exp_obd, fid_init)(exp); + RETURN(rc); +} + +static inline int obd_fid_fini(struct obd_export *exp) +{ + int rc; + ENTRY; + + OBD_CHECK_OP(exp->exp_obd, fid_fini, 0); + EXP_COUNTER_INCREMENT(exp, fid_fini); + + rc = OBP(exp->exp_obd, fid_fini)(exp); + RETURN(rc); +} + static inline int obd_ping(struct obd_export *exp) { int rc; @@ -979,6 +1007,34 @@ static inline int obd_prep_async_page(struct obd_export *exp, RETURN(ret); } +static inline int obd_reget_short_lock(struct obd_export *exp, + struct lov_stripe_md *lsm, + void **res, int rw, + obd_off start, obd_off end, + void **cookie) +{ + ENTRY; + + OBD_CHECK_OP(exp->exp_obd, reget_short_lock, -EOPNOTSUPP); + EXP_COUNTER_INCREMENT(exp, reget_short_lock); + + RETURN(OBP(exp->exp_obd, reget_short_lock)(exp, lsm, res, rw, + start, end, cookie)); +} + +static inline int obd_release_short_lock(struct obd_export *exp, + struct 
lov_stripe_md *lsm, obd_off end, + void *cookie, int rw) +{ + ENTRY; + + OBD_CHECK_OP(exp->exp_obd, release_short_lock, -EOPNOTSUPP); + EXP_COUNTER_INCREMENT(exp, release_short_lock); + + RETURN(OBP(exp->exp_obd, release_short_lock)(exp, lsm, end, + cookie, rw)); +} + static inline int obd_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm, struct lov_oinfo *loi, void *cookie, @@ -1243,8 +1299,8 @@ static inline int obd_join_lru(struct obd_export *exp, RETURN(rc); } -static inline int obd_pin(struct obd_export *exp, obd_id ino, __u32 gen, - int type, struct obd_client_handle *handle, int flag) +static inline int obd_pin(struct obd_export *exp, struct ll_fid *fid, + struct obd_client_handle *handle, int flag) { int rc; ENTRY; @@ -1252,7 +1308,7 @@ static inline int obd_pin(struct obd_export *exp, obd_id ino, __u32 gen, EXP_CHECK_OP(exp, pin); EXP_COUNTER_INCREMENT(exp, pin); - rc = OBP(exp->exp_obd, pin)(exp, ino, gen, type, handle, flag); + rc = OBP(exp->exp_obd, pin)(exp, fid, handle, flag); RETURN(rc); } diff --git a/lustre/include/obd_ost.h b/lustre/include/obd_ost.h index d2b399e0cec3c11341399eb7bda4dbe4cf7fc680..6277d569b528c41faf73ac1b1ff6d449ac681ab3 100644 --- a/lustre/include/obd_ost.h +++ b/lustre/include/obd_ost.h @@ -38,4 +38,18 @@ int osc_extent_blocking_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new, void *data, int flag); +static inline struct ldlm_res_id *osc_build_res_name(__u64 id, __u64 gr, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof *name); + name->name[0] = id; + name->name[2] = gr; + return name; +} + +static inline int osc_res_name_eq(__u64 id, __u64 gr, struct ldlm_res_id *name) +{ + return name->name[0] == id && name->name[2] == gr; +} + #endif diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index e97b58d908a8c0d6390d0088fabf1409f211bbf1..6e401d280467a0fd9607cfb1ffcca982a8f44db2 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -53,8 +53,9 @@ extern int 
obd_race_state; extern unsigned int obd_alloc_fail_rate; /* Timeout definitions */ -#define OBD_TIMEOUT_DEFAULT 100 -#define LDLM_TIMEOUT_DEFAULT 20 +#define OBD_TIMEOUT_DEFAULT 100 +#define LDLM_TIMEOUT_DEFAULT 20 +#define MDS_LDLM_TIMEOUT_DEFAULT 6 #ifdef CRAY_XT3 #define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */ #endif @@ -153,6 +154,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b #define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c #define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 @@ -228,6 +230,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a #define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b #define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c +#define OBD_FAIL_OSC_DIO_PAUSE 0x40d #define OBD_FAIL_PTLRPC 0x500 #define OBD_FAIL_PTLRPC_ACK 0x501 @@ -246,6 +249,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_OBD_LOGD_NET 0x602 #define OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 #define OBD_FAIL_OBD_DQACQ 0x604 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 #define OBD_FAIL_TGT_REPLY_NET 0x700 #define OBD_FAIL_TGT_CONN_RACE 0x701 @@ -253,6 +257,8 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_TGT_DELAY_CONNECT 0x703 #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 #define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 #define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 @@ -266,11 +272,6 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_MGS_PAUSE_REQ 0x904 #define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) -#define OBD_FAIL_QUOTA_QD_COUNT_32BIT 0xA00 -#else -#warning "remove quota code above for format obsolete in new release" -#endif #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 9, 0, 0) #define OBD_FAIL_QUOTA_WITHOUT_CHANGE_QS 0xA01 
#else @@ -621,7 +622,8 @@ do { \ #define OBD_SLAB_FREE_PTR(ptr, slab) \ OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr)) -#define KEY_IS(str) (keylen >= strlen(str) && strcmp(key, str) == 0) +#define KEY_IS(str) \ + (keylen >= (sizeof(str) - 1) && memcmp(key, str, sizeof(str) - 1) == 0) /* Wrapper for contiguous page frame allocation */ #define OBD_PAGES_ALLOC(ptr, order, gfp_mask) \ diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config index 5f7cef88a04df0ebc1cb1b62e3eb879c9b35c877..3ae4b96cff7535d83407d762890c9521b9fb51da 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686-bigsmp.config @@ -3394,7 +3394,7 @@ CONFIG_LOG_BUF_SHIFT=17 # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_HIGHMEM is not set CONFIG_DEBUG_BUGVERBOSE=y -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y # CONFIG_DEBUG_VM is not set # CONFIG_FRAME_POINTER is not set diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686.config index 8ef7035996a433664eaa2d37e59b2b2e4092b2d8..ded8f04556df1076a4eb79f5d3d264c45650fb68 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-i686.config @@ -3394,7 +3394,7 @@ CONFIG_LOG_BUF_SHIFT=17 # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_HIGHMEM is not set CONFIG_DEBUG_BUGVERBOSE=y -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y # CONFIG_DEBUG_VM is not set # CONFIG_FRAME_POINTER is not set diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config index 
18b56e46933f0ff5127992a34e2044374f8b28ae..f2598edcd4ac54672760ac69c590175ccb951882 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64-smp.config @@ -2976,7 +2976,7 @@ CONFIG_LOG_BUF_SHIFT=18 # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_KOBJECT is not set -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y # CONFIG_DEBUG_VM is not set # CONFIG_FRAME_POINTER is not set diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64.config index 8037b01ee0a281cd9a6d06ee4601c28f03419b11..51b18dfdfbbfbadaf9649c0bb64368d41bd297bc 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.16-2.6-sles10-x86_64.config @@ -2964,7 +2964,7 @@ CONFIG_LOG_BUF_SHIFT=18 # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_KOBJECT is not set -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_DEBUG_FS=y # CONFIG_DEBUG_VM is not set # CONFIG_FRAME_POINTER is not set diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686-smp.config index 24286e7cd49c7b41f4024a5dd0d2b54932d6b68f..ceb6da3555aecfbac1ed0123f1e9e3a1e56963a3 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686-smp.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.18-prep -# Sat Nov 24 08:00:02 2007 +# Fri Jun 27 01:40:54 2008 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -187,6 +187,7 @@ CONFIG_EDD=m CONFIG_EFI_VARS=y CONFIG_DELL_RBU=m CONFIG_DCDBAS=m +# 
CONFIG_ISCSI_IBFT_FIND is not set # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -397,6 +398,7 @@ CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y CONFIG_XFRM=y +CONFIG_XFRM_NALGO=m CONFIG_XFRM_USER=y CONFIG_NET_KEY=m CONFIG_INET=y @@ -481,6 +483,7 @@ CONFIG_IPV6=m CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y +# CONFIG_IPV6_OPTIMISTIC_DAD is not set CONFIG_INET6_AH=m CONFIG_INET6_ESP=m CONFIG_INET6_IPCOMP=m @@ -798,12 +801,15 @@ CONFIG_FIB_RULES=y # Wireless # CONFIG_CFG80211=m +CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y CONFIG_NET_WIRELESS_RTNETLINK=y CONFIG_MAC80211=m +CONFIG_MAC80211_RCSIMPLE=y CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set CONFIG_MAC80211_DEBUG=y +# CONFIG_MAC80211_HT_DEBUG is not set # CONFIG_MAC80211_VERBOSE_DEBUG is not set # CONFIG_MAC80211_LOWTX_FRAME_DUMP is not set # CONFIG_TKIP_DEBUG is not set @@ -1126,15 +1132,16 @@ CONFIG_SCSI_LOGGING=y # CONFIG_SCSI_SPI_ATTRS=m CONFIG_SCSI_FC_ATTRS=m -CONFIG_SCSI_ISCSI_ATTRS=m +# CONFIG_SCSI_ISCSI_ATTRS is not set CONFIG_SCSI_SAS_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m +# CONFIG_SCSI_SAS_ATA is not set # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set # # SCSI low-level drivers # -CONFIG_ISCSI_TCP=m +# CONFIG_ISCSI_TCP is not set CONFIG_BLK_DEV_3W_XXXX_RAID=m CONFIG_SCSI_3W_9XXX=m CONFIG_SCSI_ACARD=m @@ -1185,7 +1192,7 @@ CONFIG_SCSI_SYM53C8XX_MMIO=y # CONFIG_SCSI_IPR is not set CONFIG_SCSI_QLOGIC_1280=m CONFIG_SCSI_QLA_FC=m -CONFIG_SCSI_QLA_ISCSI=m +# CONFIG_SCSI_QLA_ISCSI is not set CONFIG_SCSI_LPFC=m CONFIG_SCSI_DC395x=m # CONFIG_SCSI_DC390T is not set @@ -1219,6 +1226,7 @@ CONFIG_SATA_ULI=m CONFIG_SATA_VIA=m CONFIG_SATA_VITESSE=m CONFIG_SATA_INIC162X=m +CONFIG_PATA_ACPI=m # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set @@ -1228,6 +1236,7 @@ CONFIG_SATA_INIC162X=m # CONFIG_PATA_CS5520 is not set # CONFIG_PATA_CS5530 is not set # CONFIG_PATA_CS5535 is not set +CONFIG_PATA_CS5536=m # 
CONFIG_PATA_CYPRESS is not set # CONFIG_PATA_EFAR is not set # CONFIG_ATA_GENERIC is not set @@ -1244,6 +1253,7 @@ CONFIG_PATA_MARVELL=m # CONFIG_PATA_OLDPIIX is not set # CONFIG_PATA_NETCELL is not set # CONFIG_PATA_NS87410 is not set +CONFIG_PATA_NS87415=m # CONFIG_PATA_OPTI is not set # CONFIG_PATA_OPTIDMA is not set # CONFIG_PATA_PCMCIA is not set @@ -1280,6 +1290,8 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_EMC=m CONFIG_DM_MULTIPATH_RDAC=m +CONFIG_DM_MULTIPATH_HP=m +# CONFIG_DM_UEVENT is not set # # Fusion MPT device support @@ -1291,6 +1303,7 @@ CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set CONFIG_FIREWIRE=m CONFIG_FIREWIRE_OHCI=m CONFIG_FIREWIRE_SBP2=m @@ -1437,13 +1450,16 @@ CONFIG_QLA3XXX=m # Ethernet (10000 Mbit) # CONFIG_CHELSIO_T1=m -CONFIG_CHELSIO_T3=m +# CONFIG_CHELSIO_T3 is not set +CONFIG_IXGBE=m CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y CONFIG_MYRI10GE=m CONFIG_NETXEN_NIC=m +CONFIG_BNX2X=m +# CONFIG_MLX4_CORE is not set # # Token Ring devices @@ -1522,9 +1538,11 @@ CONFIG_BCM43XX_DMA_AND_PIO_MODE=y CONFIG_ZD1211RW=m # CONFIG_ZD1211RW_DEBUG is not set CONFIG_NET_WIRELESS=y -CONFIG_IWLWIFI=y -CONFIG_IWLWIFI_DEBUG=y CONFIG_IWL4965=m +# CONFIG_IWL4965_QOS is not set +# CONFIG_IWL4965_SPECTRUM_MEASUREMENT is not set +# CONFIG_IWL4965_SENSITIVITY is not set +# CONFIG_IWL4965_DEBUG is not set # CONFIG_IWL3945 is not set # @@ -1904,6 +1922,7 @@ CONFIG_IBMASR=m # CONFIG_WAFER_WDT is not set CONFIG_I6300ESB_WDT=m CONFIG_I8XX_TCO=m +CONFIG_HP_WATCHDOG=m # CONFIG_SC1200_WDT is not set # CONFIG_60XX_WDT is not set # CONFIG_SBC8360_WDT is not set @@ -1979,10 +1998,6 @@ CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set # CONFIG_HPET_MMAP is not set CONFIG_HANGCHECK_TIMER=m - -# -# TPM devices -# # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set @@ -2514,6 +2529,7 @@ CONFIG_USB_RTL8150=m CONFIG_USB_USBNET=m CONFIG_USB_NET_AX8817X=m 
CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_DM9601=m CONFIG_USB_NET_GL620A=m CONFIG_USB_NET_NET1080=m CONFIG_USB_NET_PLUSB=m @@ -2645,32 +2661,7 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_IDE_DISK=y CONFIG_LEDS_TRIGGER_HEARTBEAT=m - -# -# InfiniBand support -# -CONFIG_INFINIBAND=m -CONFIG_INFINIBAND_USER_MAD=m -CONFIG_INFINIBAND_USER_ACCESS=m -CONFIG_INFINIBAND_ADDR_TRANS=y -CONFIG_INFINIBAND_MTHCA=m -CONFIG_INFINIBAND_MTHCA_DEBUG=y -CONFIG_INFINIBAND_AMSO1100=m -# CONFIG_INFINIBAND_AMSO1100_DEBUG is not set -CONFIG_INFINIBAND_CXGB3=m -# CONFIG_INFINIBAND_CXGB3_DEBUG is not set -CONFIG_INFINIBAND_IPOIB=m -# CONFIG_INFINIBAND_IPOIB_CM is not set -CONFIG_INFINIBAND_IPOIB_DEBUG=y -# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set -CONFIG_INFINIBAND_SRP=m -CONFIG_INFINIBAND_ISER=m -CONFIG_INFINIBAND_SDP=m -# CONFIG_INFINIBAND_SDP_DEBUG is not set -CONFIG_INFINIBAND_VNIC=m -# CONFIG_INFINIBAND_VNIC_DEBUG is not set -CONFIG_INFINIBAND_VNIC_STATS=y -CONFIG_INFINIBAND_MADEYE=m +# CONFIG_INFINIBAND is not set # # EDAC - error detection and reporting (RAS) (EXPERIMENTAL) @@ -2686,6 +2677,8 @@ CONFIG_EDAC_AMD76X=m CONFIG_EDAC_E7XXX=m CONFIG_EDAC_E752X=m CONFIG_EDAC_I82875P=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I5000=m CONFIG_EDAC_I82860=m CONFIG_EDAC_K8=m CONFIG_EDAC_R82600=m @@ -2818,6 +2811,7 @@ CONFIG_CONFIGFS_FS=m # # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set +CONFIG_ECRYPT_FS=m CONFIG_HFS_FS=m CONFIG_HFSPLUS_FS=m # CONFIG_BEFS_FS is not set @@ -2977,6 +2971,7 @@ CONFIG_SCHEDSTATS=y # CONFIG_DEBUG_RWSEMS is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -3028,7 +3023,16 @@ CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT=y # Cryptographic options # CONFIG_CRYPTO=y +CONFIG_CRYPTO_API=m +CONFIG_CRYPTO_ALGAPI=m +CONFIG_CRYPTO_AEAD=m 
+CONFIG_CRYPTO_BLKCIPHER=m +CONFIG_CRYPTO_SEQIV=m +CONFIG_CRYPTO_HASH=m +CONFIG_CRYPTO_MANAGER=m CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_NHMAC=m +CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=m @@ -3037,6 +3041,10 @@ CONFIG_CRYPTO_SHA256=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_TWOFISH=m @@ -3053,6 +3061,7 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_CRC32C=y # CONFIG_CRYPTO_TEST is not set +CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_SIGNATURE=y CONFIG_CRYPTO_SIGNATURE_DSA=y CONFIG_CRYPTO_MPILIB=y @@ -3073,7 +3082,6 @@ CONFIG_LIBCRC32C=y CONFIG_AUDIT_GENERIC=y CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=m -CONFIG_GENERIC_ALLOCATOR=y CONFIG_REED_SOLOMON=m CONFIG_REED_SOLOMON_DEC16=y CONFIG_TEXTSEARCH=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686.config index a451029bf30a21ddde351c39962f27976eeb89c9..903bc6c9d0a530b50057942b95ed875c98451ce0 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-i686.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.18-prep -# Sat Nov 24 08:01:09 2007 +# Fri Jun 27 01:41:47 2008 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -178,6 +178,7 @@ CONFIG_EDD=m CONFIG_EFI_VARS=y CONFIG_DELL_RBU=m CONFIG_DCDBAS=m +# CONFIG_ISCSI_IBFT_FIND is not set # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -385,6 +386,7 @@ CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y CONFIG_XFRM=y +CONFIG_XFRM_NALGO=m CONFIG_XFRM_USER=y CONFIG_NET_KEY=m CONFIG_INET=y @@ -469,6 +471,7 @@ CONFIG_IPV6=m CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y 
CONFIG_IPV6_ROUTE_INFO=y +# CONFIG_IPV6_OPTIMISTIC_DAD is not set CONFIG_INET6_AH=m CONFIG_INET6_ESP=m CONFIG_INET6_IPCOMP=m @@ -786,12 +789,15 @@ CONFIG_FIB_RULES=y # Wireless # CONFIG_CFG80211=m +CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y CONFIG_NET_WIRELESS_RTNETLINK=y CONFIG_MAC80211=m +CONFIG_MAC80211_RCSIMPLE=y CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set CONFIG_MAC80211_DEBUG=y +# CONFIG_MAC80211_HT_DEBUG is not set # CONFIG_MAC80211_VERBOSE_DEBUG is not set # CONFIG_MAC80211_LOWTX_FRAME_DUMP is not set # CONFIG_TKIP_DEBUG is not set @@ -1117,6 +1123,7 @@ CONFIG_SCSI_FC_ATTRS=m # CONFIG_SCSI_ISCSI_ATTRS is not set CONFIG_SCSI_SAS_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m +# CONFIG_SCSI_SAS_ATA is not set # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set # @@ -1207,6 +1214,7 @@ CONFIG_SATA_ULI=m CONFIG_SATA_VIA=m CONFIG_SATA_VITESSE=m CONFIG_SATA_INIC162X=m +CONFIG_PATA_ACPI=m # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set @@ -1216,6 +1224,7 @@ CONFIG_SATA_INIC162X=m # CONFIG_PATA_CS5520 is not set # CONFIG_PATA_CS5530 is not set # CONFIG_PATA_CS5535 is not set +CONFIG_PATA_CS5536=m # CONFIG_PATA_CYPRESS is not set # CONFIG_PATA_EFAR is not set # CONFIG_ATA_GENERIC is not set @@ -1232,6 +1241,7 @@ CONFIG_PATA_MARVELL=m # CONFIG_PATA_OLDPIIX is not set # CONFIG_PATA_NETCELL is not set # CONFIG_PATA_NS87410 is not set +CONFIG_PATA_NS87415=m # CONFIG_PATA_OPTI is not set # CONFIG_PATA_OPTIDMA is not set # CONFIG_PATA_PCMCIA is not set @@ -1268,6 +1278,8 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_EMC=m CONFIG_DM_MULTIPATH_RDAC=m +CONFIG_DM_MULTIPATH_HP=m +# CONFIG_DM_UEVENT is not set # # Fusion MPT device support @@ -1279,6 +1291,7 @@ CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set CONFIG_FIREWIRE=m CONFIG_FIREWIRE_OHCI=m CONFIG_FIREWIRE_SBP2=m @@ -1427,12 +1440,15 @@ CONFIG_QLA3XXX=m # CONFIG_CHELSIO_T1=m # CONFIG_CHELSIO_T3 is not set 
+CONFIG_IXGBE=m CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y CONFIG_MYRI10GE=m CONFIG_NETXEN_NIC=m +CONFIG_BNX2X=m +# CONFIG_MLX4_CORE is not set # # Token Ring devices @@ -1511,9 +1527,11 @@ CONFIG_BCM43XX_DMA_AND_PIO_MODE=y CONFIG_ZD1211RW=m # CONFIG_ZD1211RW_DEBUG is not set CONFIG_NET_WIRELESS=y -CONFIG_IWLWIFI=y -CONFIG_IWLWIFI_DEBUG=y CONFIG_IWL4965=m +# CONFIG_IWL4965_QOS is not set +# CONFIG_IWL4965_SPECTRUM_MEASUREMENT is not set +# CONFIG_IWL4965_SENSITIVITY is not set +# CONFIG_IWL4965_DEBUG is not set # CONFIG_IWL3945 is not set # @@ -1896,6 +1914,7 @@ CONFIG_IBMASR=m # CONFIG_WAFER_WDT is not set CONFIG_I6300ESB_WDT=m CONFIG_I8XX_TCO=m +CONFIG_HP_WATCHDOG=m # CONFIG_SC1200_WDT is not set # CONFIG_60XX_WDT is not set # CONFIG_SBC8360_WDT is not set @@ -1972,10 +1991,6 @@ CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set # CONFIG_HPET_MMAP is not set CONFIG_HANGCHECK_TIMER=m - -# -# TPM devices -# # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set @@ -2507,6 +2522,7 @@ CONFIG_USB_RTL8150=m CONFIG_USB_USBNET=m CONFIG_USB_NET_AX8817X=m CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_DM9601=m CONFIG_USB_NET_GL620A=m CONFIG_USB_NET_NET1080=m CONFIG_USB_NET_PLUSB=m @@ -2638,10 +2654,6 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_IDE_DISK=y CONFIG_LEDS_TRIGGER_HEARTBEAT=m - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set # @@ -2658,6 +2670,8 @@ CONFIG_EDAC_AMD76X=m CONFIG_EDAC_E7XXX=m CONFIG_EDAC_E752X=m CONFIG_EDAC_I82875P=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I5000=m CONFIG_EDAC_I82860=m CONFIG_EDAC_K8=m CONFIG_EDAC_R82600=m @@ -2790,6 +2804,7 @@ CONFIG_CONFIGFS_FS=m # # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set +CONFIG_ECRYPT_FS=m CONFIG_HFS_FS=m CONFIG_HFSPLUS_FS=m # CONFIG_BEFS_FS is not set @@ -2949,6 +2964,7 @@ CONFIG_SCHEDSTATS=y # CONFIG_DEBUG_RWSEMS is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # 
CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -2998,7 +3014,16 @@ CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT=y # Cryptographic options # CONFIG_CRYPTO=y +CONFIG_CRYPTO_API=m +CONFIG_CRYPTO_ALGAPI=m +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_BLKCIPHER=m +CONFIG_CRYPTO_SEQIV=m +CONFIG_CRYPTO_HASH=m +CONFIG_CRYPTO_MANAGER=m CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_NHMAC=m +CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=m @@ -3007,6 +3032,10 @@ CONFIG_CRYPTO_SHA256=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_TWOFISH=m @@ -3023,6 +3052,7 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_CRC32C=y # CONFIG_CRYPTO_TEST is not set +CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_SIGNATURE=y CONFIG_CRYPTO_SIGNATURE_DSA=y CONFIG_CRYPTO_MPILIB=y @@ -3043,7 +3073,6 @@ CONFIG_LIBCRC32C=y CONFIG_AUDIT_GENERIC=y CONFIG_ZLIB_INFLATE=y CONFIG_ZLIB_DEFLATE=m -CONFIG_GENERIC_ALLOCATOR=y CONFIG_REED_SOLOMON=m CONFIG_REED_SOLOMON_DEC16=y CONFIG_TEXTSEARCH=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64-smp.config index e866d45692eae44bc87e1354bdad348217fe1eb0..125e389ffae8587cf0f6f7c4220864e971d8696b 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64-smp.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.18-prep -# Sat Nov 24 07:57:21 2007 +# Fri Jun 27 01:42:44 2008 # CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -166,6 +166,7 @@ CONFIG_IA64_MCA_RECOVERY=m CONFIG_PERFMON=y CONFIG_IA64_PALINFO=y CONFIG_SGI_SN=y +# 
CONFIG_IA64_HP_AML_NFW is not set # # SN Devices @@ -289,6 +290,7 @@ CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y CONFIG_XFRM=y +CONFIG_XFRM_NALGO=m CONFIG_XFRM_USER=y CONFIG_NET_KEY=m CONFIG_INET=y @@ -373,6 +375,7 @@ CONFIG_IPV6=m CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y +# CONFIG_IPV6_OPTIMISTIC_DAD is not set CONFIG_INET6_AH=m CONFIG_INET6_ESP=m CONFIG_INET6_IPCOMP=m @@ -690,12 +693,15 @@ CONFIG_FIB_RULES=y # Wireless # CONFIG_CFG80211=m +CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y CONFIG_NET_WIRELESS_RTNETLINK=y CONFIG_MAC80211=m +CONFIG_MAC80211_RCSIMPLE=y CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set CONFIG_MAC80211_DEBUG=y +# CONFIG_MAC80211_HT_DEBUG is not set # CONFIG_MAC80211_VERBOSE_DEBUG is not set # CONFIG_MAC80211_LOWTX_FRAME_DUMP is not set # CONFIG_TKIP_DEBUG is not set @@ -914,6 +920,7 @@ CONFIG_SCSI_FC_ATTRS=m # CONFIG_SCSI_ISCSI_ATTRS is not set CONFIG_SCSI_SAS_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m +# CONFIG_SCSI_SAS_ATA is not set # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set # @@ -996,6 +1003,7 @@ CONFIG_SATA_ULI=m CONFIG_SATA_VIA=m CONFIG_SATA_VITESSE=m CONFIG_SATA_INIC162X=m +CONFIG_PATA_ACPI=m # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set @@ -1020,6 +1028,7 @@ CONFIG_PATA_MARVELL=m # CONFIG_PATA_OLDPIIX is not set # CONFIG_PATA_NETCELL is not set # CONFIG_PATA_NS87410 is not set +CONFIG_PATA_NS87415=m # CONFIG_PATA_OPTI is not set # CONFIG_PATA_OPTIDMA is not set # CONFIG_PATA_PCMCIA is not set @@ -1056,6 +1065,8 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_EMC=m CONFIG_DM_MULTIPATH_RDAC=m +CONFIG_DM_MULTIPATH_HP=m +# CONFIG_DM_UEVENT is not set # # Fusion MPT device support @@ -1067,6 +1078,7 @@ CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set CONFIG_FIREWIRE=m CONFIG_FIREWIRE_OHCI=m CONFIG_FIREWIRE_SBP2=m @@ -1204,12 +1216,15 @@ CONFIG_QLA3XXX=m # CONFIG_CHELSIO_T1=m # 
CONFIG_CHELSIO_T3 is not set +CONFIG_IXGBE=m CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y CONFIG_MYRI10GE=m CONFIG_NETXEN_NIC=m +CONFIG_BNX2X=m +# CONFIG_MLX4_CORE is not set # # Token Ring devices @@ -1273,9 +1288,11 @@ CONFIG_HOSTAP_CS=m CONFIG_ZD1211RW=m # CONFIG_ZD1211RW_DEBUG is not set CONFIG_NET_WIRELESS=y -CONFIG_IWLWIFI=y -CONFIG_IWLWIFI_DEBUG=y CONFIG_IWL4965=m +# CONFIG_IWL4965_QOS is not set +# CONFIG_IWL4965_SPECTRUM_MEASUREMENT is not set +# CONFIG_IWL4965_SENSITIVITY is not set +# CONFIG_IWL4965_DEBUG is not set # CONFIG_IWL3945 is not set # @@ -1682,10 +1699,6 @@ CONFIG_MAX_RAW_DEVS=8192 # CONFIG_HPET is not set CONFIG_HANGCHECK_TIMER=m CONFIG_MMTIMER=y - -# -# TPM devices -# # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set @@ -2193,6 +2206,7 @@ CONFIG_USB_RTL8150=m CONFIG_USB_USBNET=m CONFIG_USB_NET_AX8817X=m CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_DM9601=m CONFIG_USB_NET_GL620A=m CONFIG_USB_NET_NET1080=m CONFIG_USB_NET_PLUSB=m @@ -2323,10 +2337,6 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_IDE_DISK=y CONFIG_LEDS_TRIGGER_HEARTBEAT=m - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set # @@ -2460,6 +2470,7 @@ CONFIG_CONFIGFS_FS=m # # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set +CONFIG_ECRYPT_FS=m CONFIG_HFS_FS=m CONFIG_HFSPLUS_FS=m # CONFIG_BEFS_FS is not set @@ -2678,7 +2689,16 @@ CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT=y # Cryptographic options # CONFIG_CRYPTO=y +CONFIG_CRYPTO_API=m +CONFIG_CRYPTO_ALGAPI=m +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_BLKCIPHER=m +CONFIG_CRYPTO_SEQIV=m +CONFIG_CRYPTO_HASH=m +CONFIG_CRYPTO_MANAGER=m CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_NHMAC=m +CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=m @@ -2687,6 +2707,10 @@ CONFIG_CRYPTO_SHA256=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_DES=m 
CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_TWOFISH=m @@ -2702,6 +2726,7 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_CRC32C=y # CONFIG_CRYPTO_TEST is not set +CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_SIGNATURE=y CONFIG_CRYPTO_SIGNATURE_DSA=y CONFIG_CRYPTO_MPILIB=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64.config index 2ffc74e7b246a759b459880e5c2b050becfda255..852a070ecef113d7507209296ff14da7f59fdbae 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-ia64.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.18-prep -# Sat Nov 24 07:58:17 2007 +# Fri Jun 27 01:43:25 2008 # CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -160,6 +160,7 @@ CONFIG_IA64_MCA_RECOVERY=m CONFIG_PERFMON=y CONFIG_IA64_PALINFO=y CONFIG_SGI_SN=y +# CONFIG_IA64_HP_AML_NFW is not set # # SN Devices @@ -282,6 +283,7 @@ CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y CONFIG_XFRM=y +CONFIG_XFRM_NALGO=m CONFIG_XFRM_USER=y CONFIG_NET_KEY=m CONFIG_INET=y @@ -366,6 +368,7 @@ CONFIG_IPV6=m CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y +# CONFIG_IPV6_OPTIMISTIC_DAD is not set CONFIG_INET6_AH=m CONFIG_INET6_ESP=m CONFIG_INET6_IPCOMP=m @@ -683,12 +686,15 @@ CONFIG_FIB_RULES=y # Wireless # CONFIG_CFG80211=m +CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y CONFIG_NET_WIRELESS_RTNETLINK=y CONFIG_MAC80211=m +CONFIG_MAC80211_RCSIMPLE=y CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set CONFIG_MAC80211_DEBUG=y +# CONFIG_MAC80211_HT_DEBUG is not set # CONFIG_MAC80211_VERBOSE_DEBUG is not set # CONFIG_MAC80211_LOWTX_FRAME_DUMP is not set # CONFIG_TKIP_DEBUG is not set @@ -907,6 +913,7 @@ CONFIG_SCSI_FC_ATTRS=m # CONFIG_SCSI_ISCSI_ATTRS is not set CONFIG_SCSI_SAS_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m 
+# CONFIG_SCSI_SAS_ATA is not set # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set # @@ -989,6 +996,7 @@ CONFIG_SATA_ULI=m CONFIG_SATA_VIA=m CONFIG_SATA_VITESSE=m CONFIG_SATA_INIC162X=m +CONFIG_PATA_ACPI=m # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set @@ -1013,6 +1021,7 @@ CONFIG_PATA_MARVELL=m # CONFIG_PATA_OLDPIIX is not set # CONFIG_PATA_NETCELL is not set # CONFIG_PATA_NS87410 is not set +CONFIG_PATA_NS87415=m # CONFIG_PATA_OPTI is not set # CONFIG_PATA_OPTIDMA is not set # CONFIG_PATA_PCMCIA is not set @@ -1049,6 +1058,8 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_EMC=m CONFIG_DM_MULTIPATH_RDAC=m +CONFIG_DM_MULTIPATH_HP=m +# CONFIG_DM_UEVENT is not set # # Fusion MPT device support @@ -1060,6 +1071,7 @@ CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set CONFIG_FIREWIRE=m CONFIG_FIREWIRE_OHCI=m CONFIG_FIREWIRE_SBP2=m @@ -1198,12 +1210,15 @@ CONFIG_QLA3XXX=m # CONFIG_CHELSIO_T1=m # CONFIG_CHELSIO_T3 is not set +CONFIG_IXGBE=m CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y CONFIG_MYRI10GE=m CONFIG_NETXEN_NIC=m +CONFIG_BNX2X=m +# CONFIG_MLX4_CORE is not set # # Token Ring devices @@ -1267,9 +1282,11 @@ CONFIG_HOSTAP_CS=m CONFIG_ZD1211RW=m # CONFIG_ZD1211RW_DEBUG is not set CONFIG_NET_WIRELESS=y -CONFIG_IWLWIFI=y -CONFIG_IWLWIFI_DEBUG=y CONFIG_IWL4965=m +# CONFIG_IWL4965_QOS is not set +# CONFIG_IWL4965_SPECTRUM_MEASUREMENT is not set +# CONFIG_IWL4965_SENSITIVITY is not set +# CONFIG_IWL4965_DEBUG is not set # CONFIG_IWL3945 is not set # @@ -1679,10 +1696,6 @@ CONFIG_MAX_RAW_DEVS=8192 # CONFIG_HPET is not set CONFIG_HANGCHECK_TIMER=m CONFIG_MMTIMER=y - -# -# TPM devices -# # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set @@ -2190,6 +2203,7 @@ CONFIG_USB_RTL8150=m CONFIG_USB_USBNET=m CONFIG_USB_NET_AX8817X=m CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_DM9601=m CONFIG_USB_NET_GL620A=m CONFIG_USB_NET_NET1080=m 
CONFIG_USB_NET_PLUSB=m @@ -2320,10 +2334,6 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_IDE_DISK=y CONFIG_LEDS_TRIGGER_HEARTBEAT=m - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set # @@ -2457,6 +2467,7 @@ CONFIG_CONFIGFS_FS=m # # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set +CONFIG_ECRYPT_FS=m CONFIG_HFS_FS=m CONFIG_HFSPLUS_FS=m # CONFIG_BEFS_FS is not set @@ -2674,7 +2685,16 @@ CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT=y # Cryptographic options # CONFIG_CRYPTO=y +CONFIG_CRYPTO_API=m +CONFIG_CRYPTO_ALGAPI=m +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_BLKCIPHER=m +CONFIG_CRYPTO_SEQIV=m +CONFIG_CRYPTO_HASH=m +CONFIG_CRYPTO_MANAGER=m CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_NHMAC=m +CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=m @@ -2683,6 +2703,10 @@ CONFIG_CRYPTO_SHA256=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_TWOFISH=m @@ -2698,6 +2722,7 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_CRC32C=y # CONFIG_CRYPTO_TEST is not set +CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_SIGNATURE=y CONFIG_CRYPTO_SIGNATURE_DSA=y CONFIG_CRYPTO_MPILIB=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64-smp.config index be86ee86a7a94b67cf739305adc563c3ee5c0938..d97b3e30f9e5d7e47b365740af2f1cf69b21da78 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64-smp.config @@ -335,6 +335,7 @@ CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y CONFIG_XFRM=y +CONFIG_XFRM_NALGO=m CONFIG_XFRM_USER=y CONFIG_NET_KEY=m CONFIG_INET=y @@ -419,6 +420,7 @@ CONFIG_IPV6=m CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y 
CONFIG_IPV6_ROUTE_INFO=y +# CONFIG_IPV6_OPTIMISTIC_DAD is not set CONFIG_INET6_AH=m CONFIG_INET6_ESP=m CONFIG_INET6_IPCOMP=m @@ -736,12 +738,15 @@ CONFIG_FIB_RULES=y # Wireless # CONFIG_CFG80211=m +CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y CONFIG_NET_WIRELESS_RTNETLINK=y CONFIG_MAC80211=m +CONFIG_MAC80211_RCSIMPLE=y CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set CONFIG_MAC80211_DEBUG=y +# CONFIG_MAC80211_HT_DEBUG is not set # CONFIG_MAC80211_VERBOSE_DEBUG is not set # CONFIG_MAC80211_LOWTX_FRAME_DUMP is not set # CONFIG_TKIP_DEBUG is not set @@ -1063,6 +1068,7 @@ CONFIG_SCSI_FC_ATTRS=m # CONFIG_SCSI_ISCSI_ATTRS is not set CONFIG_SCSI_SAS_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m +# CONFIG_SCSI_SAS_ATA is not set # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set # @@ -1148,6 +1154,7 @@ CONFIG_SATA_ULI=m CONFIG_SATA_VIA=m CONFIG_SATA_VITESSE=m CONFIG_SATA_INIC162X=m +CONFIG_PATA_ACPI=m # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set @@ -1172,6 +1179,7 @@ CONFIG_PATA_MARVELL=m # CONFIG_PATA_OLDPIIX is not set # CONFIG_PATA_NETCELL is not set # CONFIG_PATA_NS87410 is not set +CONFIG_PATA_NS87415=m # CONFIG_PATA_OPTI is not set # CONFIG_PATA_OPTIDMA is not set # CONFIG_PATA_PCMCIA is not set @@ -1208,6 +1216,8 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_EMC=m CONFIG_DM_MULTIPATH_RDAC=m +CONFIG_DM_MULTIPATH_HP=m +# CONFIG_DM_UEVENT is not set # # Fusion MPT device support @@ -1219,6 +1229,7 @@ CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set CONFIG_FIREWIRE=m CONFIG_FIREWIRE_OHCI=m CONFIG_FIREWIRE_SBP2=m @@ -1366,12 +1377,15 @@ CONFIG_QLA3XXX=m # CONFIG_CHELSIO_T1=m # CONFIG_CHELSIO_T3 is not set +CONFIG_IXGBE=m CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y CONFIG_MYRI10GE=m CONFIG_NETXEN_NIC=m +CONFIG_BNX2X=m +# CONFIG_MLX4_CORE is not set # # Token Ring devices @@ -1449,9 +1463,11 @@ CONFIG_BCM43XX_DMA_AND_PIO_MODE=y 
CONFIG_ZD1211RW=m # CONFIG_ZD1211RW_DEBUG is not set CONFIG_NET_WIRELESS=y -CONFIG_IWLWIFI=y -CONFIG_IWLWIFI_DEBUG=y CONFIG_IWL4965=m +# CONFIG_IWL4965_QOS is not set +# CONFIG_IWL4965_SPECTRUM_MEASUREMENT is not set +# CONFIG_IWL4965_SENSITIVITY is not set +# CONFIG_IWL4965_DEBUG is not set # CONFIG_IWL3945 is not set # @@ -1819,6 +1835,7 @@ CONFIG_IBMASR=m # CONFIG_WAFER_WDT is not set CONFIG_I6300ESB_WDT=m CONFIG_I8XX_TCO=m +CONFIG_HP_WATCHDOG=m # CONFIG_SC1200_WDT is not set # CONFIG_60XX_WDT is not set # CONFIG_SBC8360_WDT is not set @@ -1885,10 +1902,6 @@ CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set # CONFIG_HPET_MMAP is not set CONFIG_HANGCHECK_TIMER=m - -# -# TPM devices -# # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set @@ -2412,6 +2425,7 @@ CONFIG_USB_RTL8150=m CONFIG_USB_USBNET=m CONFIG_USB_NET_AX8817X=m CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_DM9601=m CONFIG_USB_NET_GL620A=m CONFIG_USB_NET_NET1080=m CONFIG_USB_NET_PLUSB=m @@ -2543,10 +2557,6 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_IDE_DISK=y CONFIG_LEDS_TRIGGER_HEARTBEAT=m - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set # @@ -2560,6 +2570,8 @@ CONFIG_EDAC=y # CONFIG_EDAC_DEBUG is not set CONFIG_EDAC_MM_EDAC=m CONFIG_EDAC_E752X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I5000=m CONFIG_EDAC_K8=m CONFIG_EDAC_POLL=y @@ -2614,6 +2626,7 @@ CONFIG_INTEL_IOATDMA=m CONFIG_EDD=m CONFIG_DELL_RBU=m CONFIG_DCDBAS=m +# CONFIG_ISCSI_IBFT_FIND is not set # # File systems @@ -2697,6 +2710,7 @@ CONFIG_CONFIGFS_FS=m # # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set +CONFIG_ECRYPT_FS=m CONFIG_HFS_FS=m CONFIG_HFSPLUS_FS=m # CONFIG_BEFS_FS is not set @@ -2856,6 +2870,7 @@ CONFIG_SCHEDSTATS=y # CONFIG_DEBUG_RWSEMS is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -2897,7 +2912,16 
@@ CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT=y # Cryptographic options # CONFIG_CRYPTO=y +CONFIG_CRYPTO_API=m +CONFIG_CRYPTO_ALGAPI=m +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_BLKCIPHER=m +CONFIG_CRYPTO_SEQIV=m +CONFIG_CRYPTO_HASH=m +CONFIG_CRYPTO_MANAGER=m CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_NHMAC=m +CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=m @@ -2906,6 +2930,10 @@ CONFIG_CRYPTO_SHA256=m CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_TWOFISH=m @@ -2922,6 +2950,7 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_CRC32C=y # CONFIG_CRYPTO_TEST is not set +CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_SIGNATURE=y CONFIG_CRYPTO_SIGNATURE_DSA=y CONFIG_CRYPTO_MPILIB=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64.config index 99b4df7fe4dfa7e4b5d9057a7789be187b6a9f1d..abde45b4978afff321ae8cc4910852f9b47e56d0 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.18-2.6-rhel5-x86_64.config @@ -314,6 +314,7 @@ CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y CONFIG_XFRM=y +CONFIG_XFRM_NALGO=m CONFIG_XFRM_USER=y CONFIG_NET_KEY=m CONFIG_INET=y @@ -398,6 +399,7 @@ CONFIG_IPV6=m CONFIG_IPV6_PRIVACY=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y +# CONFIG_IPV6_OPTIMISTIC_DAD is not set CONFIG_INET6_AH=m CONFIG_INET6_ESP=m CONFIG_INET6_IPCOMP=m @@ -715,12 +717,15 @@ CONFIG_FIB_RULES=y # Wireless # CONFIG_CFG80211=m +CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y CONFIG_NET_WIRELESS_RTNETLINK=y CONFIG_MAC80211=m +CONFIG_MAC80211_RCSIMPLE=y CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set CONFIG_MAC80211_DEBUG=y +# CONFIG_MAC80211_HT_DEBUG is not set # 
CONFIG_MAC80211_VERBOSE_DEBUG is not set # CONFIG_MAC80211_LOWTX_FRAME_DUMP is not set # CONFIG_TKIP_DEBUG is not set @@ -1042,6 +1047,7 @@ CONFIG_SCSI_FC_ATTRS=m # CONFIG_SCSI_ISCSI_ATTRS is not set CONFIG_SCSI_SAS_ATTRS=m CONFIG_SCSI_SAS_LIBSAS=m +# CONFIG_SCSI_SAS_ATA is not set # CONFIG_SCSI_SAS_LIBSAS_DEBUG is not set # @@ -1127,6 +1133,7 @@ CONFIG_SATA_ULI=m CONFIG_SATA_VIA=m CONFIG_SATA_VITESSE=m CONFIG_SATA_INIC162X=m +CONFIG_PATA_ACPI=m # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set @@ -1151,6 +1158,7 @@ CONFIG_PATA_MARVELL=m # CONFIG_PATA_OLDPIIX is not set # CONFIG_PATA_NETCELL is not set # CONFIG_PATA_NS87410 is not set +CONFIG_PATA_NS87415=m # CONFIG_PATA_OPTI is not set # CONFIG_PATA_OPTIDMA is not set # CONFIG_PATA_PCMCIA is not set @@ -1187,6 +1195,8 @@ CONFIG_DM_ZERO=m CONFIG_DM_MULTIPATH=m CONFIG_DM_MULTIPATH_EMC=m CONFIG_DM_MULTIPATH_RDAC=m +CONFIG_DM_MULTIPATH_HP=m +# CONFIG_DM_UEVENT is not set # # Fusion MPT device support @@ -1198,6 +1208,7 @@ CONFIG_FUSION_SAS=m CONFIG_FUSION_MAX_SGE=40 CONFIG_FUSION_CTL=m CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set CONFIG_FIREWIRE=m CONFIG_FIREWIRE_OHCI=m CONFIG_FIREWIRE_SBP2=m @@ -1346,12 +1357,15 @@ CONFIG_QLA3XXX=m # CONFIG_CHELSIO_T1=m # CONFIG_CHELSIO_T3 is not set +CONFIG_IXGBE=m CONFIG_IXGB=m CONFIG_IXGB_NAPI=y CONFIG_S2IO=m CONFIG_S2IO_NAPI=y CONFIG_MYRI10GE=m CONFIG_NETXEN_NIC=m +CONFIG_BNX2X=m +# CONFIG_MLX4_CORE is not set # # Token Ring devices @@ -1429,9 +1443,11 @@ CONFIG_BCM43XX_DMA_AND_PIO_MODE=y CONFIG_ZD1211RW=m # CONFIG_ZD1211RW_DEBUG is not set CONFIG_NET_WIRELESS=y -CONFIG_IWLWIFI=y -CONFIG_IWLWIFI_DEBUG=y CONFIG_IWL4965=m +# CONFIG_IWL4965_QOS is not set +# CONFIG_IWL4965_SPECTRUM_MEASUREMENT is not set +# CONFIG_IWL4965_SENSITIVITY is not set +# CONFIG_IWL4965_DEBUG is not set # CONFIG_IWL3945 is not set # @@ -1802,6 +1818,7 @@ CONFIG_IBMASR=m # CONFIG_WAFER_WDT is not set CONFIG_I6300ESB_WDT=m CONFIG_I8XX_TCO=m 
+CONFIG_HP_WATCHDOG=m # CONFIG_SC1200_WDT is not set # CONFIG_60XX_WDT is not set # CONFIG_SBC8360_WDT is not set @@ -1869,10 +1886,6 @@ CONFIG_HPET=y # CONFIG_HPET_RTC_IRQ is not set # CONFIG_HPET_MMAP is not set CONFIG_HANGCHECK_TIMER=m - -# -# TPM devices -# # CONFIG_TCG_TPM is not set # CONFIG_TELCLOCK is not set @@ -2396,6 +2409,7 @@ CONFIG_USB_RTL8150=m CONFIG_USB_USBNET=m CONFIG_USB_NET_AX8817X=m CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_DM9601=m CONFIG_USB_NET_GL620A=m CONFIG_USB_NET_NET1080=m CONFIG_USB_NET_PLUSB=m @@ -2527,10 +2541,6 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=m CONFIG_LEDS_TRIGGER_IDE_DISK=y CONFIG_LEDS_TRIGGER_HEARTBEAT=m - -# -# InfiniBand support -# # CONFIG_INFINIBAND is not set # @@ -2544,6 +2554,8 @@ CONFIG_EDAC=y # CONFIG_EDAC_DEBUG is not set CONFIG_EDAC_MM_EDAC=m CONFIG_EDAC_E752X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I5000=m CONFIG_EDAC_K8=m CONFIG_EDAC_POLL=y @@ -2598,6 +2610,7 @@ CONFIG_INTEL_IOATDMA=m CONFIG_EDD=m CONFIG_DELL_RBU=m CONFIG_DCDBAS=m +# CONFIG_ISCSI_IBFT_FIND is not set # # File systems @@ -2681,6 +2694,7 @@ CONFIG_CONFIGFS_FS=m # # CONFIG_ADFS_FS is not set # CONFIG_AFFS_FS is not set +CONFIG_ECRYPT_FS=m CONFIG_HFS_FS=m CONFIG_HFSPLUS_FS=m # CONFIG_BEFS_FS is not set @@ -2840,6 +2854,7 @@ CONFIG_SCHEDSTATS=y # CONFIG_DEBUG_RWSEMS is not set # CONFIG_DEBUG_LOCK_ALLOC is not set # CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_DEBUG_KOBJECT is not set @@ -2881,7 +2896,16 @@ CONFIG_SECURITY_SELINUX_ENABLE_SECMARK_DEFAULT=y # Cryptographic options # CONFIG_CRYPTO=y +CONFIG_CRYPTO_API=m +CONFIG_CRYPTO_ALGAPI=m +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_BLKCIPHER=m +CONFIG_CRYPTO_SEQIV=m +CONFIG_CRYPTO_HASH=m +CONFIG_CRYPTO_MANAGER=m CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_NHMAC=m +CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_MD4=m CONFIG_CRYPTO_MD5=m @@ -2890,6 +2914,10 @@ CONFIG_CRYPTO_SHA256=m 
CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_WP512=m CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_DES=m CONFIG_CRYPTO_BLOWFISH=m CONFIG_CRYPTO_TWOFISH=m @@ -2906,6 +2934,7 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRYPTO_MICHAEL_MIC=m CONFIG_CRYPTO_CRC32C=y # CONFIG_CRYPTO_TEST is not set +CONFIG_CRYPTO_AUTHENC=m CONFIG_CRYPTO_SIGNATURE=y CONFIG_CRYPTO_SIGNATURE_DSA=y CONFIG_CRYPTO_MPILIB=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686-bigsmp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686-bigsmp.config index 555489db8a22168422a67c7c24eb0e7f74d5a3a8..70240e43f0277a67b1283d847e1c2a9791ec0e9c 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686-bigsmp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686-bigsmp.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22.5 -# Wed Feb 20 01:04:06 2008 +# Linux kernel version: 2.6.22.14 +# Wed May 7 01:22:58 2008 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -61,7 +61,7 @@ CONFIG_SYSCTL=y CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y -# CONFIG_KALLSYMS_ALL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_HOTPLUG=y CONFIG_PRINTK=y @@ -72,7 +72,6 @@ CONFIG_FUTEX=y CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y @@ -251,8 +250,7 @@ CONFIG_PM=y CONFIG_PM_LEGACY=y # CONFIG_PM_DEBUG is not set # CONFIG_PM_SYSFS_DEPRECATED is not set -CONFIG_SOFTWARE_SUSPEND=y -CONFIG_PM_STD_PARTITION="" +# CONFIG_SOFTWARE_SUSPEND is not set CONFIG_SUSPEND_SMP=y # @@ -3029,8 +3027,8 @@ CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y # CONFIG_TMPFS_POSIX_ACL is not set -CONFIG_HUGETLBFS=y -CONFIG_HUGETLB_PAGE=y +# CONFIG_HUGETLBFS is not set +# CONFIG_HUGETLB_PAGE is not set CONFIG_RAMFS=y CONFIG_CONFIGFS_FS=m @@ 
-3190,35 +3188,38 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y -# CONFIG_SCHEDSTATS is not set +CONFIG_SCHEDSTATS=y # CONFIG_TIMER_STATS is not set -# CONFIG_DEBUG_SLAB is not set -# CONFIG_DEBUG_RT_MUTEXES is not set +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUG_SLAB_LEAK=y +CONFIG_DEBUG_RT_MUTEXES=y +CONFIG_DEBUG_PI_LIST=y # CONFIG_RT_MUTEX_TESTER is not set CONFIG_DEBUG_SPINLOCK=y -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_PROVE_LOCKING is not set +CONFIG_DEBUG_MUTEXES=y +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +CONFIG_LOCKDEP=y +CONFIG_DEBUG_LOCKDEP=y +CONFIG_TRACE_IRQFLAGS=y CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set -# CONFIG_DEBUG_HIGHMEM is not set +CONFIG_DEBUG_HIGHMEM=y CONFIG_DEBUG_BUGVERBOSE=y -# CONFIG_DEBUG_INFO is not set -# CONFIG_DEBUG_VM is not set -# CONFIG_DEBUG_LIST is not set -# CONFIG_FRAME_POINTER is not set +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_LIST=y +CONFIG_FRAME_POINTER=y CONFIG_FORCED_INLINING=y # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set CONFIG_EARLY_PRINTK=y -# CONFIG_DEBUG_STACKOVERFLOW is not set +CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_STACK_USAGE is not set - -# -# Page alloc debug is incompatible with Software Suspend on i386 -# +CONFIG_DEBUG_PAGEALLOC=y # CONFIG_DEBUG_RODATA is not set # CONFIG_4KSTACKS is not set CONFIG_X86_FIND_SMP_CONFIG=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686.config index cc945369a66ef691e4d329569d42585a5cde4b13..20d4c9904f58d7a4c9b6db7fde069b672c73ffb1 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-i686.config @@ -1,7 +1,7 @@ # # 
Automatically generated make config: don't edit -# Linux kernel version: 2.6.22.5 -# Wed Feb 20 01:06:07 2008 +# Linux kernel version: 2.6.22.14 +# Wed May 7 01:38:41 2008 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y @@ -59,7 +59,7 @@ CONFIG_SYSCTL=y CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y -# CONFIG_KALLSYMS_ALL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_HOTPLUG=y CONFIG_PRINTK=y @@ -70,7 +70,6 @@ CONFIG_FUTEX=y CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y @@ -213,10 +212,7 @@ CONFIG_SPARSEMEM_MANUAL=y CONFIG_SPARSEMEM=y CONFIG_HAVE_MEMORY_PRESENT=y CONFIG_SPARSEMEM_STATIC=y - -# -# Memory hotplug is currently incompatible with Software Suspend -# +# CONFIG_MEMORY_HOTPLUG is not set CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 @@ -247,8 +243,7 @@ CONFIG_PM=y CONFIG_PM_LEGACY=y # CONFIG_PM_DEBUG is not set # CONFIG_PM_SYSFS_DEPRECATED is not set -CONFIG_SOFTWARE_SUSPEND=y -CONFIG_PM_STD_PARTITION="" +# CONFIG_SOFTWARE_SUSPEND is not set # # ACPI (Advanced Configuration and Power Interface) Support @@ -3024,8 +3019,8 @@ CONFIG_PROC_SYSCTL=y CONFIG_SYSFS=y CONFIG_TMPFS=y # CONFIG_TMPFS_POSIX_ACL is not set -CONFIG_HUGETLBFS=y -CONFIG_HUGETLB_PAGE=y +# CONFIG_HUGETLBFS is not set +# CONFIG_HUGETLB_PAGE is not set CONFIG_RAMFS=y CONFIG_CONFIGFS_FS=m @@ -3185,35 +3180,38 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y -# CONFIG_SCHEDSTATS is not set +CONFIG_SCHEDSTATS=y # CONFIG_TIMER_STATS is not set -# CONFIG_DEBUG_SLAB is not set -# CONFIG_DEBUG_RT_MUTEXES is not set +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUG_SLAB_LEAK=y +CONFIG_DEBUG_RT_MUTEXES=y +CONFIG_DEBUG_PI_LIST=y # CONFIG_RT_MUTEX_TESTER is not set CONFIG_DEBUG_SPINLOCK=y -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_PROVE_LOCKING is not set +CONFIG_DEBUG_MUTEXES=y 
+CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +CONFIG_LOCKDEP=y +CONFIG_DEBUG_LOCKDEP=y +CONFIG_TRACE_IRQFLAGS=y CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set -# CONFIG_DEBUG_HIGHMEM is not set +CONFIG_DEBUG_HIGHMEM=y CONFIG_DEBUG_BUGVERBOSE=y -# CONFIG_DEBUG_INFO is not set -# CONFIG_DEBUG_VM is not set -# CONFIG_DEBUG_LIST is not set -# CONFIG_FRAME_POINTER is not set +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_LIST=y +CONFIG_FRAME_POINTER=y CONFIG_FORCED_INLINING=y # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set CONFIG_EARLY_PRINTK=y -# CONFIG_DEBUG_STACKOVERFLOW is not set +CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_STACK_USAGE is not set - -# -# Page alloc debug is incompatible with Software Suspend on i386 -# +CONFIG_DEBUG_PAGEALLOC=y # CONFIG_DEBUG_RODATA is not set # CONFIG_4KSTACKS is not set CONFIG_DOUBLEFAULT=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64-smp.config index 63f31dc3fae92527298b7fa46b80a45326cbae84..6bf541c3b38ba1f2de2f327e146e690c2a4e2ab3 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64-smp.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22.5 -# Tue Feb 19 23:02:41 2008 +# Linux kernel version: 2.6.22.14 +# Wed May 7 01:15:09 2008 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -68,7 +68,7 @@ CONFIG_SYSCTL=y CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y -# CONFIG_KALLSYMS_ALL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_HOTPLUG=y CONFIG_PRINTK=y @@ -79,7 +79,6 @@ CONFIG_FUTEX=y CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y CONFIG_EVENTFD=y 
CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y @@ -3081,30 +3080,36 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y -# CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set -# CONFIG_DEBUG_SLAB is not set -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_RT_MUTEX_TESTER is not set +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUG_SLAB_LEAK=y +CONFIG_DEBUG_RT_MUTEXES=y +CONFIG_DEBUG_PI_LIST=y +CONFIG_RT_MUTEX_TESTER=y CONFIG_DEBUG_SPINLOCK=y -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_PROVE_LOCKING is not set +CONFIG_DEBUG_MUTEXES=y +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +CONFIG_LOCKDEP=y +CONFIG_DEBUG_LOCKDEP=y +CONFIG_TRACE_IRQFLAGS=y CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set CONFIG_DEBUG_BUGVERBOSE=y -# CONFIG_DEBUG_INFO is not set -# CONFIG_DEBUG_VM is not set -# CONFIG_DEBUG_LIST is not set -# CONFIG_FRAME_POINTER is not set +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_LIST=y +CONFIG_FRAME_POINTER=y CONFIG_FORCED_INLINING=y # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_DEBUG_RODATA is not set # CONFIG_IOMMU_DEBUG is not set -# CONFIG_DEBUG_STACKOVERFLOW is not set +CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_STACK_USAGE is not set # diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64.config index 0fead82d420c75ea45499d9b1c913e80f94bc77e..e62bc4b861f752582d41c3dfd5e22d1eacc5dca6 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.22-2.6-sles10-x86_64.config @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.22.5 -# Wed Feb 20 
01:00:44 2008 +# Linux kernel version: 2.6.22.14 +# Wed May 7 01:18:20 2008 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -67,7 +67,7 @@ CONFIG_SYSCTL=y CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y -# CONFIG_KALLSYMS_ALL is not set +CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y CONFIG_HOTPLUG=y CONFIG_PRINTK=y @@ -78,7 +78,6 @@ CONFIG_FUTEX=y CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y CONFIG_EVENTFD=y CONFIG_SHMEM=y CONFIG_VM_EVENT_COUNTERS=y @@ -3064,30 +3063,36 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set CONFIG_DETECT_SOFTLOCKUP=y -# CONFIG_SCHEDSTATS is not set -# CONFIG_TIMER_STATS is not set -# CONFIG_DEBUG_SLAB is not set -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_RT_MUTEX_TESTER is not set +CONFIG_SCHEDSTATS=y +CONFIG_TIMER_STATS=y +CONFIG_DEBUG_SLAB=y +CONFIG_DEBUG_SLAB_LEAK=y +CONFIG_DEBUG_RT_MUTEXES=y +CONFIG_DEBUG_PI_LIST=y +CONFIG_RT_MUTEX_TESTER=y CONFIG_DEBUG_SPINLOCK=y -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_PROVE_LOCKING is not set +CONFIG_DEBUG_MUTEXES=y +CONFIG_DEBUG_LOCK_ALLOC=y +CONFIG_PROVE_LOCKING=y +CONFIG_LOCKDEP=y +CONFIG_DEBUG_LOCKDEP=y +CONFIG_TRACE_IRQFLAGS=y CONFIG_DEBUG_SPINLOCK_SLEEP=y # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set CONFIG_DEBUG_BUGVERBOSE=y -# CONFIG_DEBUG_INFO is not set -# CONFIG_DEBUG_VM is not set -# CONFIG_DEBUG_LIST is not set -# CONFIG_FRAME_POINTER is not set +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_LIST=y +CONFIG_FRAME_POINTER=y CONFIG_FORCED_INLINING=y # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_DEBUG_RODATA is not set # CONFIG_IOMMU_DEBUG is not set -# CONFIG_DEBUG_STACKOVERFLOW is not set +CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_STACK_USAGE is not set # diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config 
b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config index 9971cfae0f63dd06f971f30e699f4c9caf17122b..1cd6e57678ce011276b64253fd0bd53cc0b20416 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-i686-smp.config @@ -2817,7 +2817,7 @@ CONFIG_MAGIC_SYSRQ=y # CONFIG_DEBUG_SPINLOCK is not set # CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_HIGHMEM is not set -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_FRAME_POINTER is not set # CONFIG_KDB is not set diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config index c205dc48e07075a90169aa1333d641fbaee9ebd4..6132cf5419c7fd771db71ea8bcb6c75ebbcedd5c 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64-smp.config @@ -2364,7 +2364,7 @@ CONFIG_MAGIC_SYSRQ=y # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_IA64_DEBUG_CMPXCHG is not set # CONFIG_IA64_DEBUG_IRQ is not set -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_SYSVIPC_COMPAT=y # diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config index c205dc48e07075a90169aa1333d641fbaee9ebd4..6132cf5419c7fd771db71ea8bcb6c75ebbcedd5c 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ia64.config @@ -2364,7 +2364,7 @@ CONFIG_MAGIC_SYSRQ=y # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_IA64_DEBUG_CMPXCHG is not set # CONFIG_IA64_DEBUG_IRQ is not set -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_SYSVIPC_COMPAT=y # diff --git 
a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config index 76b4290fae6247649f39e0d6cd0f9ca9a689a834..149908b267362c86f0d6ac9d7ce98e40cbc4b341 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc-pseries64.config @@ -1392,7 +1392,7 @@ CONFIG_KDB=y CONFIG_KDB_MODULES=y CONFIG_KDB_OFF=y # CONFIG_PPCDBG is not set -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_IRQSTACKS=y # diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config index b5e692be4e9851c00d84cbe3729795a3c3817309..c36face9ee7fa20c5f9c952ded080d2525b7da8d 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.5-2.6-suse-ppc.config @@ -1391,7 +1391,7 @@ CONFIG_KDB=y CONFIG_KDB_MODULES=y CONFIG_KDB_OFF=y # CONFIG_PPCDBG is not set -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_IRQSTACKS=y # diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config index be52d349d8238507cc047d6910627bacc1f0a39d..df54dfe97f988491bcbadf5868ffffb0d617a709 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-i686-smp.config @@ -2390,7 +2390,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y CONFIG_DEBUG_HIGHMEM=y -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y # CONFIG_FRAME_POINTER is not set CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config 
b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config index 6fab5b2412c6901f8f6e87071f18a27bb119ba6c..c1fe7ac05b55eea71941c08496a5a3fb3584f54a 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64-smp.config @@ -1967,7 +1967,7 @@ CONFIG_MAGIC_SYSRQ=y # CONFIG_DEBUG_SLAB is not set CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_KPROBES=y CONFIG_IA64_GRANULE_16MB=y # CONFIG_IA64_GRANULE_64MB is not set diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config index 0345fbe6b003f57cb81aaa144c7e89d29ec8b9d2..4584cf42d8d95b1cf76a844a4c963e90cc706296 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-ia64.config @@ -1967,7 +1967,7 @@ CONFIG_MAGIC_SYSRQ=y # CONFIG_DEBUG_SLAB is not set CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_KPROBES=y CONFIG_IA64_GRANULE_16MB=y # CONFIG_IA64_GRANULE_64MB is not set diff --git a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config index 72d0c0deb0c9de403f4c42ff73d57067a9aa686d..5df35582e9f08797cdc5733fb9b410a2886af277 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64-smp.config @@ -2150,7 +2150,7 @@ CONFIG_MAGIC_SYSRQ=y # CONFIG_DEBUG_SLAB is not set CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_INIT_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_IOMMU_DEBUG is not set diff --git 
a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config index 00293e2360b13b077e88df688a95e0f3646e587a..e1ee70c31d3c7c4f375017eb4e2fbd4a33db8db7 100644 --- a/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config +++ b/lustre/kernel_patches/kernel_configs/kernel-2.6.9-2.6-rhel4-x86_64.config @@ -2150,7 +2150,7 @@ CONFIG_MAGIC_SYSRQ=y # CONFIG_DEBUG_SLAB is not set CONFIG_DEBUG_SPINLOCK=y CONFIG_DEBUG_SPINLOCK_SLEEP=y -# CONFIG_DEBUG_INFO is not set +CONFIG_DEBUG_INFO=y CONFIG_INIT_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_IOMMU_DEBUG is not set diff --git a/lustre/kernel_patches/patches/jbd-journal-chksum-2.6-sles10.patch b/lustre/kernel_patches/patches/jbd-journal-chksum-2.6-sles10.patch index 4f4f0693bcb12fe00f3d14ae41a77095f0950886..1a6f326a486d6c22a2a813b2aab03c3a7f886021 100644 --- a/lustre/kernel_patches/patches/jbd-journal-chksum-2.6-sles10.patch +++ b/lustre/kernel_patches/patches/jbd-journal-chksum-2.6-sles10.patch @@ -360,7 +360,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c #endif /* -@@ -307,6 +308,37 @@ int journal_skip_recovery(journal_t *jou +@@ -307,6 +308,38 @@ int journal_skip_recovery(journal_t *jou return err; } @@ -391,6 +391,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, + obh->b_size); + } ++ put_bh(obh); + } + return 0; +} @@ -398,7 +399,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { -@@ -318,6 +350,7 @@ static int do_one_pass(journal_t *journa +@@ -318,6 +351,7 @@ static int do_one_pass(journal_t *journa struct buffer_head * bh; unsigned int sequence; int blocktype; @@ -406,7 +407,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c /* Precompute the maximum metadata descriptors in a descriptor block */ int MAX_BLOCKS_PER_DESC; -@@ -409,9 +442,24 @@ static int 
do_one_pass(journal_t *journa +@@ -409,9 +443,24 @@ static int do_one_pass(journal_t *journa switch(blocktype) { case JFS_DESCRIPTOR_BLOCK: /* If it is a valid descriptor block, replay it @@ -433,7 +434,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c next_log_block += count_tags(bh, journal->j_blocksize); wrap(journal, next_log_block); -@@ -506,9 +554,97 @@ static int do_one_pass(journal_t *journa +@@ -506,9 +555,97 @@ static int do_one_pass(journal_t *journa continue; case JFS_COMMIT_BLOCK: @@ -533,7 +534,7 @@ Index: linux-2.6.16.53-0.16/fs/jbd/recovery.c brelse(bh); next_commit_ID++; continue; -@@ -543,9 +679,10 @@ static int do_one_pass(journal_t *journa +@@ -543,9 +680,10 @@ static int do_one_pass(journal_t *journa * transaction marks the end of the valid log. */ diff --git a/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch index 90ede90000214d95d63efc3497910e6c8555bdd0..c2240611c41e35df86814f07d44c078aa214f28f 100644 --- a/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch +++ b/lustre/kernel_patches/patches/jbd-journal-chksum-2.6.18-vanilla.patch @@ -368,7 +368,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c #endif /* -@@ -307,6 +308,37 @@ int journal_skip_recovery(journal_t *jou +@@ -307,6 +308,38 @@ int journal_skip_recovery(journal_t *jou return err; } @@ -399,6 +399,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c + *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, + obh->b_size); + } ++ put_bh(obh); + } + return 0; +} @@ -406,7 +407,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass) { -@@ -318,6 +350,7 @@ static int do_one_pass(journal_t *journa +@@ -318,6 +351,7 @@ static int do_one_pass(journal_t *journa struct buffer_head * bh; unsigned int sequence; int blocktype; @@ -414,7 +415,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c /* Precompute the maximum 
metadata descriptors in a descriptor block */ int MAX_BLOCKS_PER_DESC; -@@ -409,9 +442,24 @@ static int do_one_pass(journal_t *journa +@@ -409,9 +443,24 @@ static int do_one_pass(journal_t *journa switch(blocktype) { case JFS_DESCRIPTOR_BLOCK: /* If it is a valid descriptor block, replay it @@ -441,7 +442,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c next_log_block += count_tags(bh, journal->j_blocksize); wrap(journal, next_log_block); -@@ -506,9 +554,97 @@ static int do_one_pass(journal_t *journa +@@ -506,9 +555,97 @@ static int do_one_pass(journal_t *journa continue; case JFS_COMMIT_BLOCK: @@ -541,7 +542,7 @@ Index: linux-2.6.18.8/fs/jbd/recovery.c brelse(bh); next_commit_ID++; continue; -@@ -544,9 +680,10 @@ static int do_one_pass(journal_t *journa +@@ -544,9 +681,10 @@ static int do_one_pass(journal_t *journa * transaction marks the end of the valid log. */ diff --git a/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch b/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch index 6d584b4b7ac872f118a18e064f56fcd6f33c03a0..f198a43d9ef5a09a66854a4be29ce38c7ec0172f 100644 --- a/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch +++ b/lustre/kernel_patches/patches/qsnet-rhel4-2.6.patch @@ -1693,7 +1693,7 @@ Index: linux-269-5502/include/linux/ptrack.h + .ptrack_list = LIST_HEAD_INIT(tsk.ptrack_list) + +#else -+#define ptrack_call_callbacks (phase, child) (0) ++#define ptrack_call_callbacks(phase, child) (0) + +#define INIT_TASK_PTRACK(tsk) + diff --git a/lustre/kernel_patches/patches/qsnet-suse-2.6.patch b/lustre/kernel_patches/patches/qsnet-suse-2.6.patch index b312ab0332811613b7f1b5bddffdb8398aef9faa..27b5a52413d2b2902f5230a09fb9476e1f2e0c9a 100644 --- a/lustre/kernel_patches/patches/qsnet-suse-2.6.patch +++ b/lustre/kernel_patches/patches/qsnet-suse-2.6.patch @@ -991,7 +991,7 @@ Index: LINUX-SRC-TREE/include/linux/ptrack.h + .ptrack_list = LIST_HEAD_INIT(tsk.ptrack_list) + +#else -+#define ptrack_call_callbacks (phase, child) (0) ++#define 
ptrack_call_callbacks(phase, child) (0) + +#define INIT_TASK_PTRACK(tsk) + diff --git a/lustre/kernel_patches/targets/2.6-rhel4.target.in b/lustre/kernel_patches/targets/2.6-rhel4.target.in index c7226f69d0fe38590e45f916cc76fed3660c2186..4822946c4a526d68be5b21f51de74bb18c3e226b 100644 --- a/lustre/kernel_patches/targets/2.6-rhel4.target.in +++ b/lustre/kernel_patches/targets/2.6-rhel4.target.in @@ -1,5 +1,5 @@ lnxmaj="2.6.9" -lnxrel="67.0.7.EL" +lnxrel="67.0.20.EL" KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2 SERIES=2.6-rhel4.series diff --git a/lustre/kernel_patches/targets/2.6-rhel5.target.in b/lustre/kernel_patches/targets/2.6-rhel5.target.in index 90df0f944509db9096e70e8ee08b2795d28b3a79..ccaa05e9cfb60057e99ebb87e319a1fad3f52aca 100644 --- a/lustre/kernel_patches/targets/2.6-rhel5.target.in +++ b/lustre/kernel_patches/targets/2.6-rhel5.target.in @@ -1,5 +1,5 @@ lnxmaj="2.6.18" -lnxrel="53.1.14.el5" +lnxrel="53.1.21.el5" KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2 SERIES=2.6-rhel5.series diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch index ac4b99289ec966b2420987260d23023aa0d78163..eb903492270650851e8a08caaa083642283d1b4e 100644 --- a/lustre/kernel_patches/which_patch +++ b/lustre/kernel_patches/which_patch @@ -3,14 +3,14 @@ SERIES VERSION COMMENT SUPPORTED KERNELS: 2.6-suse SLES9 before SP1 already in SLES9 SP1 kernel 2.6-suse-newer SLES9: 2.6.5-7.311 extra patches for SLES9 after SP1 -2.6-rhel4 RHEL4: 2.6.9-67.0.7.EL +2.6-rhel4 RHEL4: 2.6.9-67.0.20.EL 2.6-sles10 SLES10: 2.6.16.54-0.2.5 -2.6-rhel5 RHEL5: 2.6.18-53.1.14.el5 +2.6-rhel5 RHEL5: 2.6.18-92.1.6.el5 2.6.18-vanilla kernel.org: 2.6.18.8 2.6.22-vanilla kernel.org: 2.6.22.14 CLIENT SUPPORT FOR UNPATCHED KERNELS: - kernel.org 2.6.16-2.6.19 + kernel.org 2.6.16-2.6.22 RHEL4: 2.6.9-42.0.8EL NB - The patches in the 2.6-suse series are already in the SLES9 SP1 diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index 
cc0e3aa92020ef3fb79f5ff842604157a5b08548..2db142f4f152560aaf4b7ddd0597fa010a596e3b 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -408,170 +408,169 @@ ldlm_extent_compat_queue(struct list_head *queue, struct ldlm_lock *req, compat = 0; } } - RETURN(compat); - } - - /* for waiting queue */ - list_for_each(tmp, queue) { - check_contention = 1; - - lock = list_entry(tmp, struct ldlm_lock, l_res_link); + } else { + /* for waiting queue */ + list_for_each(tmp, queue) { + check_contention = 1; - if (req == lock) - break; + lock = list_entry(tmp, struct ldlm_lock, l_res_link); - if (unlikely(scan)) { - /* We only get here if we are queuing GROUP lock - and met some incompatible one. The main idea of this - code is to insert GROUP lock past compatible GROUP - lock in the waiting queue or if there is not any, - then in front of first non-GROUP lock */ - if (lock->l_req_mode != LCK_GROUP) { - /* Ok, we hit non-GROUP lock, there should be no - more GROUP locks later on, queue in front of - first non-GROUP lock */ - - ldlm_resource_insert_lock_after(lock, req); - list_del_init(&lock->l_res_link); - ldlm_resource_insert_lock_after(req, lock); - compat = 0; - break; - } - if (req->l_policy_data.l_extent.gid == - lock->l_policy_data.l_extent.gid) { - /* found it */ - ldlm_resource_insert_lock_after(lock, req); - compat = 0; + if (req == lock) break; - } - continue; - } - /* locks are compatible, overlap doesn't matter */ - if (lockmode_compat(lock->l_req_mode, req_mode)) { - if (req_mode == LCK_PR && - ((lock->l_policy_data.l_extent.start <= - req->l_policy_data.l_extent.start) && - (lock->l_policy_data.l_extent.end >= - req->l_policy_data.l_extent.end))) { - /* If we met a PR lock just like us or wider, - and nobody down the list conflicted with - it, that means we can skip processing of - the rest of the list and safely place - ourselves at the end of the list, or grant - (dependent if we met an conflicting locks - before in the list). 
- In case of 1st enqueue only we continue - traversing if there is something conflicting - down the list because we need to make sure - that something is marked as AST_SENT as well, - in cse of empy worklist we would exit on - first conflict met. */ - /* There IS a case where such flag is - not set for a lock, yet it blocks - something. Luckily for us this is - only during destroy, so lock is - exclusive. So here we are safe */ - if (!(lock->l_flags & LDLM_FL_AST_SENT)) { - RETURN(compat); + if (unlikely(scan)) { + /* We only get here if we are queuing GROUP lock + and met some incompatible one. The main idea of this + code is to insert GROUP lock past compatible GROUP + lock in the waiting queue or if there is not any, + then in front of first non-GROUP lock */ + if (lock->l_req_mode != LCK_GROUP) { + /* Ok, we hit non-GROUP lock, there should be no + more GROUP locks later on, queue in front of + first non-GROUP lock */ + + ldlm_resource_insert_lock_after(lock, req); + list_del_init(&lock->l_res_link); + ldlm_resource_insert_lock_after(req, lock); + compat = 0; + break; + } + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* found it */ + ldlm_resource_insert_lock_after(lock, req); + compat = 0; + break; } + continue; } - /* non-group locks are compatible, overlap doesn't - matter */ - if (likely(req_mode != LCK_GROUP)) - continue; + /* locks are compatible, overlap doesn't matter */ + if (lockmode_compat(lock->l_req_mode, req_mode)) { + if (req_mode == LCK_PR && + ((lock->l_policy_data.l_extent.start <= + req->l_policy_data.l_extent.start) && + (lock->l_policy_data.l_extent.end >= + req->l_policy_data.l_extent.end))) { + /* If we met a PR lock just like us or wider, + and nobody down the list conflicted with + it, that means we can skip processing of + the rest of the list and safely place + ourselves at the end of the list, or grant + (dependent if we met an conflicting locks + before in the list). 
+ In case of 1st enqueue only we continue + traversing if there is something conflicting + down the list because we need to make sure + that something is marked as AST_SENT as well, + in case of empty worklist we would exit on + first conflict met. */ + /* There IS a case where such flag is + not set for a lock, yet it blocks + something. Luckily for us this is + only during destroy, so lock is + exclusive. So here we are safe */ + if (!(lock->l_flags & LDLM_FL_AST_SENT)) { + RETURN(compat); + } + } - /* If we are trying to get a GROUP lock and there is - another one of this kind, we need to compare gid */ - if (req->l_policy_data.l_extent.gid == - lock->l_policy_data.l_extent.gid) { - /* If existing lock with matched gid is granted, - we grant new one too. */ - if (lock->l_req_mode == lock->l_granted_mode) - RETURN(2); + /* non-group locks are compatible, overlap doesn't + matter */ + if (likely(req_mode != LCK_GROUP)) + continue; - /* Otherwise we are scanning queue of waiting - * locks and it means current request would - * block along with existing lock (that is - * already blocked. - * If we are in nonblocking mode - return - * immediately */ - if (*flags & LDLM_FL_BLOCK_NOWAIT) { - compat = -EWOULDBLOCK; - goto destroylock; + /* If we are trying to get a GROUP lock and there is + another one of this kind, we need to compare gid */ + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* If existing lock with matched gid is granted, + we grant new one too. */ + if (lock->l_req_mode == lock->l_granted_mode) + RETURN(2); + + /* Otherwise we are scanning queue of waiting + * locks and it means current request would + * block along with existing lock (that is + * already blocked.
+ * If we are in nonblocking mode - return + * immediately */ + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + compat = -EWOULDBLOCK; + goto destroylock; + } + /* If this group lock is compatible with another + * group lock on the waiting list, they must be + * together in the list, so they can be granted + * at the same time. Otherwise the later lock + * can get stuck behind another, incompatible, + * lock. */ + ldlm_resource_insert_lock_after(lock, req); + /* Because 'lock' is not granted, we can stop + * processing this queue and return immediately. + * There is no need to check the rest of the + * list. */ + RETURN(0); } - /* If this group lock is compatible with another - * group lock on the waiting list, they must be - * together in the list, so they can be granted - * at the same time. Otherwise the later lock - * can get stuck behind another, incompatible, - * lock. */ - ldlm_resource_insert_lock_after(lock, req); - /* Because 'lock' is not granted, we can stop - * processing this queue and return immediately. - * There is no need to check the rest of the - * list. 
*/ - RETURN(0); } - } - if (unlikely(req_mode == LCK_GROUP && - (lock->l_req_mode != lock->l_granted_mode))) { - scan = 1; - compat = 0; - if (lock->l_req_mode != LCK_GROUP) { - /* Ok, we hit non-GROUP lock, there should - * be no more GROUP locks later on, queue in - * front of first non-GROUP lock */ - - ldlm_resource_insert_lock_after(lock, req); - list_del_init(&lock->l_res_link); - ldlm_resource_insert_lock_after(req, lock); - break; - } - if (req->l_policy_data.l_extent.gid == - lock->l_policy_data.l_extent.gid) { - /* found it */ - ldlm_resource_insert_lock_after(lock, req); - break; + if (unlikely(req_mode == LCK_GROUP && + (lock->l_req_mode != lock->l_granted_mode))) { + scan = 1; + compat = 0; + if (lock->l_req_mode != LCK_GROUP) { + /* Ok, we hit non-GROUP lock, there should + * be no more GROUP locks later on, queue in + * front of first non-GROUP lock */ + + ldlm_resource_insert_lock_after(lock, req); + list_del_init(&lock->l_res_link); + ldlm_resource_insert_lock_after(req, lock); + break; + } + if (req->l_policy_data.l_extent.gid == + lock->l_policy_data.l_extent.gid) { + /* found it */ + ldlm_resource_insert_lock_after(lock, req); + break; + } + continue; } - continue; - } - if (unlikely(lock->l_req_mode == LCK_GROUP)) { - /* If compared lock is GROUP, then requested is PR/PW/ - * so this is not compatible; extent range does not - * matter */ - if (*flags & LDLM_FL_BLOCK_NOWAIT) { - compat = -EWOULDBLOCK; - goto destroylock; - } else { - *flags |= LDLM_FL_NO_TIMEOUT; - } - } else if (lock->l_policy_data.l_extent.end < req_start || - lock->l_policy_data.l_extent.start > req_end) { - /* if a non group lock doesn't overlap skip it */ - continue; - } else if (lock->l_req_extent.end < req_start || - lock->l_req_extent.start > req_end) - /* false contention, the requests doesn't really overlap */ + if (unlikely(lock->l_req_mode == LCK_GROUP)) { + /* If compared lock is GROUP, then requested is PR/PW/ + * so this is not compatible; extent range does not + 
* matter */ + if (*flags & LDLM_FL_BLOCK_NOWAIT) { + compat = -EWOULDBLOCK; + goto destroylock; + } else { + *flags |= LDLM_FL_NO_TIMEOUT; + } + } else if (lock->l_policy_data.l_extent.end < req_start || + lock->l_policy_data.l_extent.start > req_end) { + /* if a non group lock doesn't overlap skip it */ + continue; + } else if (lock->l_req_extent.end < req_start || + lock->l_req_extent.start > req_end) + /* false contention, the requests doesn't really overlap */ check_contention = 0; - if (!work_list) - RETURN(0); + if (!work_list) + RETURN(0); - /* don't count conflicting glimpse locks */ - if (lock->l_req_mode == LCK_PR && - lock->l_policy_data.l_extent.start == 0 && - lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) - check_contention = 0; + /* don't count conflicting glimpse locks */ + if (lock->l_req_mode == LCK_PR && + lock->l_policy_data.l_extent.start == 0 && + lock->l_policy_data.l_extent.end == OBD_OBJECT_EOF) + check_contention = 0; - *contended_locks += check_contention; + *contended_locks += check_contention; - compat = 0; - if (lock->l_blocking_ast) - ldlm_add_ast_work_item(lock, req, work_list); + compat = 0; + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, work_list); + } } if (ldlm_check_contention(req, *contended_locks) && diff --git a/lustre/ldlm/ldlm_flock.c b/lustre/ldlm/ldlm_flock.c index bd92a84d8b6a960efde7256ec11e4ce8c27ad3d0..dddd257f16500145afbb6daa403a8badbd41efa1 100644 --- a/lustre/ldlm/ldlm_flock.c +++ b/lustre/ldlm/ldlm_flock.c @@ -42,6 +42,7 @@ #define l_flock_waitq l_lru static struct list_head ldlm_flock_waitq = CFS_LIST_HEAD_INIT(ldlm_flock_waitq); +spinlock_t ldlm_flock_waitq_lock = SPIN_LOCK_UNLOCKED; int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, void *data, int flag); @@ -82,6 +83,7 @@ ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, int flags) LDLM_DEBUG(lock, "ldlm_flock_destroy(mode: %d, flags: 0x%x)", mode, flags); + /* Safe to not lock here, since it 
should be empty anyway */ LASSERT(list_empty(&lock->l_flock_waitq)); list_del_init(&lock->l_res_link); @@ -107,6 +109,7 @@ ldlm_flock_deadlock(struct ldlm_lock *req, struct ldlm_lock *blocking_lock) pid_t blocking_pid = blocking_lock->l_policy_data.l_flock.pid; struct ldlm_lock *lock; + spin_lock(&ldlm_flock_waitq_lock); restart: list_for_each_entry(lock, &ldlm_flock_waitq, l_flock_waitq) { if ((lock->l_policy_data.l_flock.pid != blocking_pid) || @@ -116,11 +119,14 @@ restart: blocking_pid = lock->l_policy_data.l_flock.blocking_pid; blocking_export = (struct obd_export *)(long) lock->l_policy_data.l_flock.blocking_export; - if (blocking_pid == req_pid && blocking_export == req_export) + if (blocking_pid == req_pid && blocking_export == req_export) { + spin_unlock(&ldlm_flock_waitq_lock); return 1; + } goto restart; } + spin_unlock(&ldlm_flock_waitq_lock); return 0; } @@ -225,7 +231,9 @@ reprocess: (long)(void *)lock->l_export; LASSERT(list_empty(&req->l_flock_waitq)); + spin_lock(&ldlm_flock_waitq_lock); list_add_tail(&req->l_flock_waitq, &ldlm_flock_waitq); + spin_unlock(&ldlm_flock_waitq_lock); ldlm_resource_add_lock(res, &res->lr_waiting, req); *flags |= LDLM_FL_BLOCK_GRANTED; @@ -242,7 +250,9 @@ reprocess: /* In case we had slept on this lock request take it off of the * deadlock detection waitq. */ + spin_lock(&ldlm_flock_waitq_lock); list_del_init(&req->l_flock_waitq); + spin_unlock(&ldlm_flock_waitq_lock); /* Scan the locks owned by this process that overlap this request. * We may have to merge or split existing locks. */ @@ -341,7 +351,7 @@ reprocess: * and restart processing this lock. 
*/ if (!new2) { unlock_res_and_lock(req); - new2 = ldlm_lock_create(ns, res->lr_name, LDLM_FLOCK, + new2 = ldlm_lock_create(ns, res->lr_name, LDLM_FLOCK, lock->l_granted_mode, NULL, NULL, NULL, NULL, 0); lock_res_and_lock(req); @@ -454,7 +464,9 @@ ldlm_flock_interrupted_wait(void *data) lock = ((struct ldlm_flock_wait_data *)data)->fwd_lock; /* take lock off the deadlock detection waitq. */ + spin_lock(&ldlm_flock_waitq_lock); list_del_init(&lock->l_flock_waitq); + spin_unlock(&ldlm_flock_waitq_lock); /* client side - set flag to prevent lock from being put on lru list */ lock->l_flags |= LDLM_FL_CBPENDING; @@ -484,6 +496,21 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, int flags, void *data) CDEBUG(D_DLMTRACE, "flags: 0x%x data: %p getlk: %p\n", flags, data, getlk); + /* Import invalidation. We need to actually release the lock + * references being held, so that it can go away. No point in + * holding the lock even if app still believes it has it, since + * server already dropped it anyway. Only for granted locks too. */ + lock_res_and_lock(lock); + if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) == + (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) { + unlock_res_and_lock(lock); + if (lock->l_req_mode == lock->l_granted_mode && + lock->l_granted_mode != LCK_NL) + ldlm_lock_decref_internal(lock, lock->l_req_mode); + RETURN(0); + } + unlock_res_and_lock(lock); + LASSERT(flags != LDLM_FL_WAIT_NOREPROC); if (!(flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | @@ -517,13 +544,22 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, int flags, void *data) RETURN(rc); granted: + /* before flock's complete ast gets here, the flock + * can possibly be freed by another thread + */ + if (lock->l_destroyed) { + LDLM_DEBUG(lock, "already destroyed by another thread"); + RETURN(0); + } LDLM_DEBUG(lock, "client-side enqueue granted"); ns = lock->l_resource->lr_namespace; lock_res_and_lock(lock); /* take lock off the deadlock detection waitq. 
*/ + spin_lock(&ldlm_flock_waitq_lock); list_del_init(&lock->l_flock_waitq); + spin_unlock(&ldlm_flock_waitq_lock); /* ldlm_lock_enqueue() has already placed lock on the granted list. */ list_del_init(&lock->l_res_link); @@ -572,8 +608,8 @@ int ldlm_flock_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, ns = lock->l_resource->lr_namespace; /* take lock off the deadlock detection waitq. */ - lock_res_and_lock(lock); + spin_lock(&ldlm_flock_waitq_lock); list_del_init(&lock->l_flock_waitq); - unlock_res_and_lock(lock); + spin_unlock(&ldlm_flock_waitq_lock); RETURN(0); } diff --git a/lustre/ldlm/ldlm_inodebits.c b/lustre/ldlm/ldlm_inodebits.c index c378c28ff5b295744c87f2063869425d611e5dcb..67d72aede942538ce248fda94d3f7541eb79d83b 100644 --- a/lustre/ldlm/ldlm_inodebits.c +++ b/lustre/ldlm/ldlm_inodebits.c @@ -37,7 +37,7 @@ static int ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, struct list_head *work_list) { - struct list_head *tmp, *tmp_tail; + struct list_head *tmp; struct ldlm_lock *lock; ldlm_mode_t req_mode = req->l_req_mode; __u64 req_bits = req->l_policy_data.l_inodebits.bits; @@ -47,28 +47,36 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, LASSERT(req_bits); /* There is no sense in lock with no bits set, I think. 
Also such a lock would be compatible with any other bit lock */ + list_for_each(tmp, queue) { + struct list_head *mode_tail; + lock = list_entry(tmp, struct ldlm_lock, l_res_link); if (req == lock) RETURN(compat); + /* last lock in mode group */ + LASSERT(lock->l_sl_mode.prev != NULL); + mode_tail = &list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, + l_sl_mode)->l_res_link; + /* locks are compatible, bits don't matter */ if (lockmode_compat(lock->l_req_mode, req_mode)) { - /* jump to next mode group */ - if (LDLM_SL_HEAD(&lock->l_sl_mode)) - tmp = &list_entry(lock->l_sl_mode.next, - struct ldlm_lock, - l_sl_mode)->l_res_link; + /* jump to last lock in mode group */ + tmp = mode_tail; continue; } - tmp_tail = tmp; - if (LDLM_SL_HEAD(&lock->l_sl_mode)) - tmp_tail = &list_entry(lock->l_sl_mode.next, - struct ldlm_lock, - l_sl_mode)->l_res_link; for (;;) { + struct list_head *head; + + /* last lock in policy group */ + tmp = &list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy)->l_res_link; + /* locks with bits overlapped are conflicting locks */ if (lock->l_policy_data.l_inodebits.bits & req_bits) { /* conflicting policy */ @@ -76,36 +84,26 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, RETURN(0); compat = 0; + + /* add locks of the policy group to + * @work_list as blocking locks for + * @req */ if (lock->l_blocking_ast) - ldlm_add_ast_work_item(lock, req, + ldlm_add_ast_work_item(lock, req, work_list); - /* add all members of the policy group */ - if (LDLM_SL_HEAD(&lock->l_sl_policy)) { - do { - tmp = lock->l_res_link.next; - lock = list_entry(tmp, - struct ldlm_lock, - l_res_link); - if (lock->l_blocking_ast) - ldlm_add_ast_work_item( - lock, - req, - work_list); - } while (!LDLM_SL_TAIL(&lock->l_sl_policy)); - } - } else if (LDLM_SL_HEAD(&lock->l_sl_policy)) { - /* jump to next policy group */ - tmp = &list_entry(lock->l_sl_policy.next, - struct ldlm_lock, - l_sl_policy)->l_res_link; + head = &lock->l_sl_policy; 
+ list_for_each_entry(lock, head, l_sl_policy) + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); } - if (tmp == tmp_tail) + if (tmp == mode_tail) break; - else - tmp = tmp->next; + + tmp = tmp->next; lock = list_entry(tmp, struct ldlm_lock, l_res_link); - } /* for locks in a mode group */ - } /* for each lock in the queue */ + } /* loop over policy groups within one mode group */ + } /* loop over mode groups within @queue */ RETURN(compat); } diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index 1e8cf31a48b689637dfbdb2a9878fa1768dfcf63..50dca935f33e6ba5ec6d978a440b520528e5fc6e 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -56,8 +56,9 @@ int ldlm_get_enq_timeout(struct ldlm_lock *lock); int ldlm_resource_putref_locked(struct ldlm_resource *res); void ldlm_resource_insert_lock_after(struct ldlm_lock *original, struct ldlm_lock *new); -int ldlm_namespace_free_prior(struct ldlm_namespace *ns); -int ldlm_namespace_free_post(struct ldlm_namespace *ns, int force); +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_free_post(struct ldlm_namespace *ns); /* ldlm_lock.c */ diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index bdb1ac731c7e9d832e962ea11454d5b33feef528..c5293b3ba0c55fe8ff5312f1595af855f93f4125 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -239,7 +239,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) RETURN(-EINVAL); } - sema_init(&cli->cl_sem, 1); + init_rwsem(&cli->cl_sem); sema_init(&cli->cl_mgc_sem, 1); cli->cl_conn_count = 0; memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), @@ -345,7 +345,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) err_import: class_destroy_import(imp); err_ldlm: - ldlm_put_ref(0); + ldlm_put_ref(); err: RETURN(rc); @@ -354,7 +354,7 @@ err: int client_obd_cleanup(struct obd_device *obddev) { 
ENTRY; - ldlm_put_ref(obddev->obd_force); + ldlm_put_ref(); RETURN(0); } @@ -371,7 +371,7 @@ int client_connect_import(struct lustre_handle *dlm_handle, int rc; ENTRY; - mutex_down(&cli->cl_sem); + down_write(&cli->cl_sem); rc = class_connect(dlm_handle, obd, cluuid); if (rc) GOTO(out_sem, rc); @@ -383,7 +383,7 @@ int client_connect_import(struct lustre_handle *dlm_handle, if (obd->obd_namespace != NULL) CERROR("already have namespace!\n"); - obd->obd_namespace = ldlm_namespace_new(obd->obd_name, + obd->obd_namespace = ldlm_namespace_new(obd, obd->obd_name, LDLM_NAMESPACE_CLIENT, LDLM_NAMESPACE_GREEDY); if (obd->obd_namespace == NULL) @@ -418,7 +418,7 @@ int client_connect_import(struct lustre_handle *dlm_handle, if (rc) { out_ldlm: - ldlm_namespace_free_prior(obd->obd_namespace); + ldlm_namespace_free_prior(obd->obd_namespace, imp, 0); to_be_freed = obd->obd_namespace; obd->obd_namespace = NULL; out_disco: @@ -428,9 +428,9 @@ out_disco: class_export_put(exp); } out_sem: - mutex_up(&cli->cl_sem); + up_write(&cli->cl_sem); if (to_be_freed) - ldlm_namespace_free_post(to_be_freed, 0); + ldlm_namespace_free_post(to_be_freed); return rc; } @@ -452,7 +452,7 @@ int client_disconnect_export(struct obd_export *exp) cli = &obd->u.cli; imp = cli->cl_import; - mutex_down(&cli->cl_sem); + down_write(&cli->cl_sem); if (!cli->cl_conn_count) { CERROR("disconnecting disconnected device (%s)\n", obd->obd_name); @@ -480,7 +480,8 @@ int client_disconnect_export(struct obd_export *exp) ldlm_cli_cancel_unused(obd->obd_namespace, NULL, obd->obd_force ? 
LDLM_FL_LOCAL_ONLY:0, NULL); - ldlm_namespace_free_prior(obd->obd_namespace); + ldlm_namespace_free_prior(obd->obd_namespace, imp, + obd->obd_force); to_be_freed = obd->obd_namespace; } @@ -502,9 +503,9 @@ int client_disconnect_export(struct obd_export *exp) if (!rc && err) rc = err; out_sem: - mutex_up(&cli->cl_sem); + up_write(&cli->cl_sem); if (to_be_freed) - ldlm_namespace_free_post(to_be_freed, obd->obd_force); + ldlm_namespace_free_post(to_be_freed); RETURN(rc); } @@ -581,7 +582,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) int rc = 0, abort_recovery; struct obd_connect_data *data; int size[2] = { sizeof(struct ptlrpc_body), sizeof(*data) }; - lnet_nid_t client_nid = 0; + lnet_nid_t *client_nid = NULL; ENTRY; OBD_RACE(OBD_FAIL_TGT_CONN_RACE); @@ -793,7 +794,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) /* Tell the client if we support replayable requests */ if (target->obd_replayable) lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_REPLAYABLE); - client_nid = req->rq_peer.nid; + client_nid = &req->rq_peer.nid; if (export == NULL) { if (target->obd_recovering) { @@ -808,7 +809,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) } else { dont_check_exports: rc = obd_connect(&conn, target, &cluuid, data, - &client_nid); + client_nid); } } else { rc = obd_reconnect(export, target, &cluuid, data); @@ -972,6 +973,47 @@ void target_destroy_export(struct obd_export *exp) * Recovery functions */ +static int target_exp_enqueue_req_replay(struct ptlrpc_request *req) +{ + __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); + struct obd_export *exp = req->rq_export; + struct ptlrpc_request *reqiter; + int dup = 0; + + LASSERT(exp); + + spin_lock(&exp->exp_lock); + list_for_each_entry(reqiter, &exp->exp_req_replay_queue, + rq_replay_list) { + if (lustre_msg_get_transno(reqiter->rq_reqmsg) == transno) { + dup = 1; + break; + } + } + + if (dup) { + /* we expect it 
with RESENT and REPLAY flags */ + if ((lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY)) != (MSG_RESENT | MSG_REPLAY)) + CERROR("invalid flags %x of resent replay\n", + lustre_msg_get_flags(req->rq_reqmsg)); + } else { + list_add_tail(&req->rq_replay_list, &exp->exp_req_replay_queue); + } + + spin_unlock(&exp->exp_lock); + return dup; +} + +static void target_exp_dequeue_req_replay(struct ptlrpc_request *req) +{ + LASSERT(!list_empty(&req->rq_replay_list)); + LASSERT(req->rq_export); + + spin_lock(&req->rq_export->exp_lock); + list_del_init(&req->rq_replay_list); + spin_unlock(&req->rq_export->exp_lock); +} static void target_release_saved_req(struct ptlrpc_request *req) { @@ -1016,6 +1058,7 @@ static void abort_recovery_queue(struct obd_device *obd) list_for_each_safe(tmp, n, &obd->obd_recovery_queue) { req = list_entry(tmp, struct ptlrpc_request, rq_list); + target_exp_dequeue_req_replay(req); list_del(&req->rq_list); DEBUG_REQ(D_ERROR, req, "aborted:"); req->rq_status = -ENOTCONN; @@ -1065,6 +1108,7 @@ void target_cleanup_recovery(struct obd_device *obd) list_for_each_safe(tmp, n, &obd->obd_recovery_queue) { req = list_entry(tmp, struct ptlrpc_request, rq_list); + target_exp_dequeue_req_replay(req); list_del(&req->rq_list); target_release_saved_req(req); } @@ -1277,6 +1321,7 @@ static void process_recovery_queue(struct obd_device *obd) } continue; } + target_exp_dequeue_req_replay(req); list_del_init(&req->rq_list); obd->obd_requests_queued_for_recovery--; spin_unlock_bh(&obd->obd_processing_task_lock); @@ -1313,6 +1358,7 @@ int target_queue_recovery_request(struct ptlrpc_request *req, __u64 transno = lustre_msg_get_transno(req->rq_reqmsg); struct ptlrpc_request *saved_req; struct lustre_msg *reqmsg; + int rc = 0; /* CAVEAT EMPTOR: The incoming request message has been swabbed * (i.e. 
buflens etc are in my own byte order), but type-dependent @@ -1350,20 +1396,12 @@ int target_queue_recovery_request(struct ptlrpc_request *req, /* Processing the queue right now, don't re-add. */ LASSERT(list_empty(&req->rq_list)); spin_unlock_bh(&obd->obd_processing_task_lock); - OBD_FREE(reqmsg, req->rq_reqlen); - OBD_FREE(saved_req, sizeof *saved_req); - return 1; + GOTO(err_free, rc = 1); } - /* A resent, replayed request that is still on the queue; just drop it. - The queued request will handle this. */ - if ((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT|MSG_REPLAY)) == - (MSG_RESENT | MSG_REPLAY)) { - DEBUG_REQ(D_ERROR, req, "dropping resent queued req"); + if (unlikely(OBD_FAIL_CHECK(OBD_FAIL_TGT_REPLAY_DROP))) { spin_unlock_bh(&obd->obd_processing_task_lock); - OBD_FREE(reqmsg, req->rq_reqlen); - OBD_FREE(saved_req, sizeof *saved_req); - return 0; + GOTO(err_free, rc = 0); } memcpy(saved_req, req, sizeof *req); @@ -1372,6 +1410,13 @@ int target_queue_recovery_request(struct ptlrpc_request *req, req->rq_reqmsg = reqmsg; class_export_get(req->rq_export); CFS_INIT_LIST_HEAD(&req->rq_list); + CFS_INIT_LIST_HEAD(&req->rq_replay_list); + + if (target_exp_enqueue_req_replay(req)) { + spin_unlock_bh(&obd->obd_processing_task_lock); + DEBUG_REQ(D_ERROR, req, "dropping resent queued req"); + GOTO(err_exp, rc = 0); + } /* XXX O(n^2) */ list_for_each(tmp, &obd->obd_recovery_queue) { @@ -1383,6 +1428,15 @@ int target_queue_recovery_request(struct ptlrpc_request *req, inserted = 1; break; } + + if (unlikely(lustre_msg_get_transno(reqiter->rq_reqmsg) == + transno)) { + spin_unlock_bh(&obd->obd_processing_task_lock); + DEBUG_REQ(D_ERROR, req, "dropping replay: transno " + "has been claimed by another client"); + target_exp_dequeue_req_replay(req); + GOTO(err_exp, rc = 0); + } } if (!inserted) { @@ -1408,6 +1462,13 @@ int target_queue_recovery_request(struct ptlrpc_request *req, process_recovery_queue(obd); return 0; + +err_exp: + class_export_put(req->rq_export); 
+err_free: + OBD_FREE(reqmsg, req->rq_reqlen); + OBD_FREE(saved_req, sizeof(*saved_req)); + return rc; } struct obd_device * target_req2obd(struct ptlrpc_request *req) @@ -1501,24 +1562,29 @@ static inline struct ldlm_pool *ldlm_exp2pl(struct obd_export *exp) int target_pack_pool_reply(struct ptlrpc_request *req) { - struct ldlm_pool *pl; + struct obd_device *obd; ENTRY; - - if (!req->rq_export || !req->rq_export->exp_obd || - !req->rq_export->exp_obd->obd_namespace || - !exp_connect_lru_resize(req->rq_export)) { + + /* + * Check that we still have all structures alive as this may + * be some late rpc in shutdown time. + */ + if (unlikely(!req->rq_export || !req->rq_export->exp_obd || + !exp_connect_lru_resize(req->rq_export))) { lustre_msg_set_slv(req->rq_repmsg, 0); lustre_msg_set_limit(req->rq_repmsg, 0); RETURN(0); } - - pl = ldlm_exp2pl(req->rq_export); - spin_lock(&pl->pl_lock); - LASSERT(ldlm_pool_get_slv(pl) != 0 && ldlm_pool_get_limit(pl) != 0); - lustre_msg_set_slv(req->rq_repmsg, ldlm_pool_get_slv(pl)); - lustre_msg_set_limit(req->rq_repmsg, ldlm_pool_get_limit(pl)); - spin_unlock(&pl->pl_lock); + /* + * OBD is alive here as export is alive, which we checked above. 
+ */ + obd = req->rq_export->exp_obd; + + read_lock(&obd->obd_pool_lock); + lustre_msg_set_slv(req->rq_repmsg, obd->obd_pool_slv); + lustre_msg_set_limit(req->rq_repmsg, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); RETURN(0); } @@ -1535,7 +1601,7 @@ target_send_reply_msg (struct ptlrpc_request *req, int rc, int fail_id) if (rc) { DEBUG_REQ(D_ERROR, req, "processing error (%d)", rc); req->rq_status = rc; - return (ptlrpc_error(req)); + return (ptlrpc_send_error(req, 1)); } else { DEBUG_REQ(D_NET, req, "sending reply"); } diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index b63aaba665f92fada1a71192ddf3ca373f8cc5c8..2c25f4140562284f0216f7c7c666026c0d50da98 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -39,13 +39,6 @@ //struct lustre_lock ldlm_everything_lock; -/* lock's skip list pointers fix mode */ -#define LDLM_JOIN_NONE 0 -#define LDLM_MODE_JOIN_RIGHT 1 -#define LDLM_MODE_JOIN_LEFT (1 << 1) -#define LDLM_POLICY_JOIN_RIGHT (1 << 2) -#define LDLM_POLICY_JOIN_LEFT (1 << 3) - /* lock types */ char *ldlm_lockname[] = { [0] "--", @@ -340,10 +333,8 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) CFS_INIT_LIST_HEAD(&lock->l_cp_ast); cfs_waitq_init(&lock->l_waitq); lock->l_blocking_lock = NULL; - lock->l_sl_mode.prev = NULL; - lock->l_sl_mode.next = NULL; - lock->l_sl_policy.prev = NULL; - lock->l_sl_policy.next = NULL; + CFS_INIT_LIST_HEAD(&lock->l_sl_mode); + CFS_INIT_LIST_HEAD(&lock->l_sl_policy); atomic_inc(&resource->lr_namespace->ns_locks); CFS_INIT_LIST_HEAD(&lock->l_handle.h_link); @@ -383,10 +374,8 @@ int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, unlock_res_and_lock(lock); newres = ldlm_resource_get(ns, NULL, new_resid, type, 1); - if (newres == NULL) { - LBUG(); + if (newres == NULL) RETURN(-ENOMEM); - } lock_res_and_lock(lock); LASSERT(memcmp(&new_resid, &lock->l_resource->lr_name, @@ -683,6 +672,12 @@ void ldlm_lock_decref_and_cancel(struct 
lustre_handle *lockh, __u32 mode) LDLM_LOCK_PUT(lock); } +struct sl_insert_point { + struct list_head *res_link; + struct list_head *mode_link; + struct list_head *policy_link; +}; + /* * search_granted_lock * @@ -691,109 +686,98 @@ void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode) * Parameters: * queue [input]: the granted list where search acts on; * req [input]: the lock whose position to be located; - * lockp [output]: the position where the lock should be inserted before, or - * NULL indicating @req should be appended to @queue. - * Return Values: - * Bit-masks combination of following values indicating in which way the - * lock need to be inserted. - * - LDLM_JOIN_NONE: noting about skip list needs to be fixed; - * - LDLM_MODE_JOIN_RIGHT: @req needs join right becoming the head of a - * mode group; - * - LDLM_POLICY_JOIN_RIGHT: @req needs join right becoming the head of - * a policy group. + * prev [output]: positions within 3 lists to insert @req to + * Return Value: + * filled @prev * NOTE: called by * - ldlm_grant_lock_with_skiplist */ -static int search_granted_lock(struct list_head *queue, - struct ldlm_lock *req, - struct ldlm_lock **lockp) +static void search_granted_lock(struct list_head *queue, + struct ldlm_lock *req, + struct sl_insert_point *prev) { - struct list_head *tmp, *tmp_tail; - struct ldlm_lock *lock, *mode_head_lock; - int rc = LDLM_JOIN_NONE; + struct list_head *tmp; + struct ldlm_lock *lock, *mode_end, *policy_end; ENTRY; list_for_each(tmp, queue) { lock = list_entry(tmp, struct ldlm_lock, l_res_link); + mode_end = list_entry(lock->l_sl_mode.prev, struct ldlm_lock, + l_sl_mode); + if (lock->l_req_mode != req->l_req_mode) { - if (LDLM_SL_HEAD(&lock->l_sl_mode)) - tmp = &list_entry(lock->l_sl_mode.next, - struct ldlm_lock, - l_sl_mode)->l_res_link; + /* jump to last lock of mode group */ + tmp = &mode_end->l_res_link; continue; } - - /* found the same mode group */ + + /* suitable mode group is found */ if 
(lock->l_resource->lr_type == LDLM_PLAIN) { - *lockp = lock; - rc = LDLM_MODE_JOIN_RIGHT; - GOTO(out, rc); + /* insert point is last lock of the mode group */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; } else if (lock->l_resource->lr_type == LDLM_IBITS) { - tmp_tail = tmp; - if (LDLM_SL_HEAD(&lock->l_sl_mode)) - tmp_tail = &list_entry(lock->l_sl_mode.next, - struct ldlm_lock, - l_sl_mode)->l_res_link; - mode_head_lock = lock; for (;;) { + policy_end = list_entry(lock->l_sl_policy.prev, + struct ldlm_lock, + l_sl_policy); + if (lock->l_policy_data.l_inodebits.bits == req->l_policy_data.l_inodebits.bits) { - /* matched policy lock is found */ - *lockp = lock; - rc |= LDLM_POLICY_JOIN_RIGHT; - - /* if the policy group head is also a - * mode group head or a single mode - * group lock */ - if (LDLM_SL_HEAD(&lock->l_sl_mode) || - (tmp == tmp_tail && - LDLM_SL_EMPTY(&lock->l_sl_mode))) - rc |= LDLM_MODE_JOIN_RIGHT; - GOTO(out, rc); + /* insert point is last lock of + * the policy group */ + prev->res_link = + &policy_end->l_res_link; + prev->mode_link = + &policy_end->l_sl_mode; + prev->policy_link = + &policy_end->l_sl_policy; + EXIT; + return; } - if (LDLM_SL_HEAD(&lock->l_sl_policy)) - tmp = &list_entry(lock->l_sl_policy.next, - struct ldlm_lock, - l_sl_policy)->l_res_link; - - if (tmp == tmp_tail) + if (policy_end == mode_end) + /* done with mode group */ break; - else - tmp = tmp->next; + + /* jump to next policy group within the mode group */ + tmp = policy_end->l_res_link.next; lock = list_entry(tmp, struct ldlm_lock, l_res_link); - } /* for all locks in the matched mode group */ - - /* no matched policy group is found, insert before - * the mode group head lock */ - *lockp = mode_head_lock; - rc = LDLM_MODE_JOIN_RIGHT; - GOTO(out, rc); + } /* loop over policy groups within the mode group */ + + /* insert point is last lock of the mode group, + * new policy group is 
started */ + prev->res_link = &mode_end->l_res_link; + prev->mode_link = &mode_end->l_sl_mode; + prev->policy_link = &req->l_sl_policy; + EXIT; + return; } else { LDLM_ERROR(lock, "is not LDLM_PLAIN or LDLM_IBITS lock"); LBUG(); } } - /* no matched mode group is found, append to the end */ - *lockp = NULL; - rc = LDLM_JOIN_NONE; + /* insert point is last lock on the queue, + * new mode group and new policy group are started */ + prev->res_link = queue->prev; + prev->mode_link = &req->l_sl_mode; + prev->policy_link = &req->l_sl_policy; EXIT; -out: - return rc; + return; } static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, - struct ldlm_lock *lockp, - int join) + struct sl_insert_point *prev) { struct ldlm_resource *res = lock->l_resource; ENTRY; - LASSERT(lockp || join == LDLM_JOIN_NONE); - check_res_locked(res); ldlm_resource_dump(D_OTHER, res); @@ -806,72 +790,25 @@ static void ldlm_granted_list_add_lock(struct ldlm_lock *lock, } LASSERT(list_empty(&lock->l_res_link)); + LASSERT(list_empty(&lock->l_sl_mode)); + LASSERT(list_empty(&lock->l_sl_policy)); - if (!lockp) - list_add_tail(&lock->l_res_link, &lock->l_resource->lr_granted); - else if ((join & LDLM_MODE_JOIN_LEFT) || (join & LDLM_POLICY_JOIN_LEFT)) - list_add(&lock->l_res_link, &lockp->l_res_link); - else - list_add_tail(&lock->l_res_link, &lockp->l_res_link); - - /* fix skip lists */ - if (join & LDLM_MODE_JOIN_RIGHT) { - LASSERT(! LDLM_SL_TAIL(&lockp->l_sl_mode)); - if (LDLM_SL_EMPTY(&lockp->l_sl_mode)) { - lock->l_sl_mode.next = &lockp->l_sl_mode; - lockp->l_sl_mode.prev = &lock->l_sl_mode; - } else if (LDLM_SL_HEAD(&lockp->l_sl_mode)) { - lock->l_sl_mode.next = lockp->l_sl_mode.next; - lockp->l_sl_mode.next = NULL; - lock->l_sl_mode.next->prev = &lock->l_sl_mode; - } - } else if (join & LDLM_MODE_JOIN_LEFT) { - LASSERT(! 
LDLM_SL_HEAD(&lockp->l_sl_mode)); - if (LDLM_SL_EMPTY(&lockp->l_sl_mode)) { - lock->l_sl_mode.prev = &lockp->l_sl_mode; - lockp->l_sl_mode.next = &lock->l_sl_mode; - } else if (LDLM_SL_TAIL(&lockp->l_sl_mode)) { - lock->l_sl_mode.prev = lockp->l_sl_mode.prev; - lockp->l_sl_mode.prev = NULL; - lock->l_sl_mode.prev->next = &lock->l_sl_mode; - } - } - - if (join & LDLM_POLICY_JOIN_RIGHT) { - LASSERT(! LDLM_SL_TAIL(&lockp->l_sl_policy)); - if (LDLM_SL_EMPTY(&lockp->l_sl_policy)) { - lock->l_sl_policy.next = &lockp->l_sl_policy; - lockp->l_sl_policy.prev = &lock->l_sl_policy; - } else if (LDLM_SL_HEAD(&lockp->l_sl_policy)) { - lock->l_sl_policy.next = lockp->l_sl_policy.next; - lockp->l_sl_policy.next = NULL; - lock->l_sl_policy.next->prev = &lock->l_sl_policy; - } - } else if (join & LDLM_POLICY_JOIN_LEFT) { - LASSERT(! LDLM_SL_HEAD(&lockp->l_sl_policy)); - if (LDLM_SL_EMPTY(&lockp->l_sl_policy)) { - lock->l_sl_policy.prev = &lockp->l_sl_policy; - lockp->l_sl_policy.next = &lock->l_sl_policy; - } else if (LDLM_SL_TAIL(&lockp->l_sl_policy)) { - lock->l_sl_policy.prev = lockp->l_sl_policy.prev; - lockp->l_sl_policy.prev = NULL; - lock->l_sl_policy.prev->next = &lock->l_sl_policy; - } - } + list_add(&lock->l_res_link, prev->res_link); + list_add(&lock->l_sl_mode, prev->mode_link); + list_add(&lock->l_sl_policy, prev->policy_link); EXIT; } static void ldlm_grant_lock_with_skiplist(struct ldlm_lock *lock) { - int join = LDLM_JOIN_NONE; - struct ldlm_lock *lockp = NULL; + struct sl_insert_point prev; ENTRY; LASSERT(lock->l_req_mode == lock->l_granted_mode); - join = search_granted_lock(&lock->l_resource->lr_granted, lock, &lockp); - ldlm_granted_list_add_lock(lock, lockp, join); + search_granted_lock(&lock->l_resource->lr_granted, lock, &prev); + ldlm_granted_list_add_lock(lock, &prev); EXIT; } @@ -923,7 +860,7 @@ static struct ldlm_lock *search_queue(struct list_head *queue, lock = list_entry(tmp, struct ldlm_lock, l_res_link); if (lock == old_lock) - continue; + break; /* 
llite sometimes wants to match locks that will be * canceled when their users drop, but we allow it to match @@ -989,6 +926,33 @@ void ldlm_lock_allow_match(struct ldlm_lock *lock) unlock_res_and_lock(lock); } +int ldlm_lock_fast_match(struct ldlm_lock *lock, int rw, + obd_off start, obd_off end, + void **cookie) +{ + LASSERT(rw == OBD_BRW_READ || rw == OBD_BRW_WRITE); + /* should LCK_GROUP be handled in a special way? */ + if (lock && (rw == OBD_BRW_READ || + (lock->l_granted_mode & (LCK_PW|LCK_GROUP))) && + (lock->l_policy_data.l_extent.start <= start) && + (lock->l_policy_data.l_extent.end >= end)) { + ldlm_lock_addref_internal(lock, rw == OBD_BRW_WRITE ? LCK_PW : LCK_PR); + *cookie = (void *)lock; + return 1; /* avoid using rc for stack relief */ + } + return 0; +} + +void ldlm_lock_fast_release(void *cookie, int rw) +{ + struct ldlm_lock *lock = (struct ldlm_lock *)cookie; + + LASSERT(lock != NULL); + LASSERT(rw == OBD_BRW_READ || rw == OBD_BRW_WRITE); + LASSERT(rw == OBD_BRW_READ || (lock->l_granted_mode & (LCK_PW | LCK_GROUP))); + ldlm_lock_decref_internal(lock, rw == OBD_BRW_WRITE ? 
LCK_PW : LCK_PR); +} + /* Can be called in two ways: * * If 'ns' is NULL, then lockh describes an existing lock that we want to look @@ -1528,55 +1492,12 @@ void ldlm_cancel_callback(struct ldlm_lock *lock) void ldlm_unlink_lock_skiplist(struct ldlm_lock *req) { - struct ldlm_lock *lock; - if (req->l_resource->lr_type != LDLM_PLAIN && req->l_resource->lr_type != LDLM_IBITS) return; - - if (LDLM_SL_HEAD(&req->l_sl_mode)) { - lock = list_entry(req->l_res_link.next, struct ldlm_lock, - l_res_link); - if (req->l_sl_mode.next == &lock->l_sl_mode) { - lock->l_sl_mode.prev = NULL; - } else { - lock->l_sl_mode.next = req->l_sl_mode.next; - lock->l_sl_mode.next->prev = &lock->l_sl_mode; - } - req->l_sl_mode.next = NULL; - } else if (LDLM_SL_TAIL(&req->l_sl_mode)) { - lock = list_entry(req->l_res_link.prev, struct ldlm_lock, - l_res_link); - if (req->l_sl_mode.prev == &lock->l_sl_mode) { - lock->l_sl_mode.next = NULL; - } else { - lock->l_sl_mode.prev = req->l_sl_mode.prev; - lock->l_sl_mode.prev->next = &lock->l_sl_mode; - } - req->l_sl_mode.prev = NULL; - } - if (LDLM_SL_HEAD(&req->l_sl_policy)) { - lock = list_entry(req->l_res_link.next, struct ldlm_lock, - l_res_link); - if (req->l_sl_policy.next == &lock->l_sl_policy) { - lock->l_sl_policy.prev = NULL; - } else { - lock->l_sl_policy.next = req->l_sl_policy.next; - lock->l_sl_policy.next->prev = &lock->l_sl_policy; - } - req->l_sl_policy.next = NULL; - } else if (LDLM_SL_TAIL(&req->l_sl_policy)) { - lock = list_entry(req->l_res_link.prev, struct ldlm_lock, - l_res_link); - if (req->l_sl_policy.prev == &lock->l_sl_policy) { - lock->l_sl_policy.next = NULL; - } else { - lock->l_sl_policy.prev = req->l_sl_policy.prev; - lock->l_sl_policy.prev->next = &lock->l_sl_policy; - } - req->l_sl_policy.prev = NULL; - } + list_del_init(&req->l_sl_policy); + list_del_init(&req->l_sl_mode); } void ldlm_lock_cancel(struct ldlm_lock *lock) @@ -1666,8 +1587,7 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, 
struct ldlm_namespace *ns; int granted = 0; int old_mode, rc; - struct ldlm_lock *mark_lock = NULL; - int join= LDLM_JOIN_NONE; + struct sl_insert_point prev; ldlm_error_t err; struct ldlm_interval *node; ENTRY; @@ -1697,27 +1617,10 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, /* remember the lock position where the lock might be * added back to the granted list later and also * remember the join mode for skiplist fixing. */ - if (LDLM_SL_HEAD(&lock->l_sl_mode)) - join = LDLM_MODE_JOIN_RIGHT; - else if (LDLM_SL_TAIL(&lock->l_sl_mode)) - join = LDLM_MODE_JOIN_LEFT; - if (LDLM_SL_HEAD(&lock->l_sl_policy)) - join |= LDLM_POLICY_JOIN_RIGHT; - else if (LDLM_SL_TAIL(&lock->l_sl_policy)) - join |= LDLM_POLICY_JOIN_LEFT; - - LASSERT(!((join & LDLM_MODE_JOIN_RIGHT) && - (join & LDLM_POLICY_JOIN_LEFT))); - LASSERT(!((join & LDLM_MODE_JOIN_LEFT) && - (join & LDLM_POLICY_JOIN_RIGHT))); - - if ((join & LDLM_MODE_JOIN_LEFT) || - (join & LDLM_POLICY_JOIN_LEFT)) - mark_lock = list_entry(lock->l_res_link.prev, - struct ldlm_lock, l_res_link); - else if (lock->l_res_link.next != &res->lr_granted) - mark_lock = list_entry(lock->l_res_link.next, - struct ldlm_lock, l_res_link); + prev.res_link = lock->l_res_link.prev; + prev.mode_link = lock->l_sl_mode.prev; + prev.policy_link = lock->l_sl_policy.prev; + ldlm_resource_unlink_lock(lock); } else { ldlm_resource_unlink_lock(lock); if (res->lr_type == LDLM_EXTENT) { @@ -1757,8 +1660,8 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, if (res->lr_type == LDLM_EXTENT) ldlm_extent_add_lock(res, lock); else - ldlm_granted_list_add_lock(lock, mark_lock, - join); + ldlm_granted_list_add_lock(lock, &prev); + res = NULL; } else { *flags |= LDLM_FL_BLOCK_GRANTED; diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 850f540f037ec125d5eb3cfe5cdfa53ae262ec9c..1caf259ee71380237f38e113c4ed56365455e6c5 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c 
@@ -799,9 +799,12 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) else if (rc != 0) rc = ldlm_handle_ast_error(lock, req, rc, "glimpse"); else - rc = ldlm_res_lvbo_update(res, req->rq_repmsg, + rc = ldlm_res_lvbo_update(res, req, REPLY_REC_OFF, 1); ptlrpc_req_finished(req); + if (rc == -ERESTART) + ldlm_reprocess_all(res); + RETURN(rc); } @@ -1353,8 +1356,13 @@ static void ldlm_handle_cp_callback(struct ptlrpc_request *req, &lock->l_resource->lr_name, sizeof(lock->l_resource->lr_name)) != 0) { unlock_res_and_lock(lock); - ldlm_lock_change_resource(ns, lock, - dlm_req->lock_desc.l_resource.lr_name); + if (ldlm_lock_change_resource(ns, lock, + dlm_req->lock_desc.l_resource.lr_name)) { + LDLM_ERROR(lock, "Failed to allocate resource"); + LDLM_LOCK_PUT(lock); + EXIT; + return; + } LDLM_DEBUG(lock, "completion AST, new resource"); CERROR("change resource!\n"); lock_res_and_lock(lock); @@ -1858,7 +1866,7 @@ static int ldlm_bl_thread_main(void *arg) #endif static int ldlm_setup(void); -static int ldlm_cleanup(int force); +static int ldlm_cleanup(void); int ldlm_get_ref(void) { @@ -1875,12 +1883,12 @@ int ldlm_get_ref(void) RETURN(rc); } -void ldlm_put_ref(int force) +void ldlm_put_ref(void) { ENTRY; mutex_down(&ldlm_ref_sem); if (ldlm_refcount == 1) { - int rc = ldlm_cleanup(force); + int rc = ldlm_cleanup(); if (rc) CERROR("ldlm_cleanup failed: %d\n", rc); else @@ -2027,7 +2035,7 @@ static int ldlm_setup(void) return rc; } -static int ldlm_cleanup(int force) +static int ldlm_cleanup(void) { #ifdef __KERNEL__ struct ldlm_bl_pool *blp = ldlm_state->ldlm_bl_pool; @@ -2137,6 +2145,8 @@ EXPORT_SYMBOL(ldlm_lock2handle); EXPORT_SYMBOL(__ldlm_handle2lock); EXPORT_SYMBOL(ldlm_lock_get); EXPORT_SYMBOL(ldlm_lock_put); +EXPORT_SYMBOL(ldlm_lock_fast_match); +EXPORT_SYMBOL(ldlm_lock_fast_release); EXPORT_SYMBOL(ldlm_lock_match); EXPORT_SYMBOL(ldlm_lock_cancel); EXPORT_SYMBOL(ldlm_lock_addref); diff --git a/lustre/ldlm/ldlm_plain.c b/lustre/ldlm/ldlm_plain.c index 
71778cd072c031b1d4088e327a02f7994c43616d..68b5bf338e97fa3b7e983528e639a24593e0e806 100644 --- a/lustre/ldlm/ldlm_plain.c +++ b/lustre/ldlm/ldlm_plain.c @@ -54,31 +54,32 @@ ldlm_plain_compat_queue(struct list_head *queue, struct ldlm_lock *req, if (req == lock) RETURN(compat); - if (lockmode_compat(lock->l_req_mode, req_mode)) { - /* jump to next mode group */ - if (LDLM_SL_HEAD(&lock->l_sl_mode)) - tmp = &list_entry(lock->l_sl_mode.next, - struct ldlm_lock, - l_sl_mode)->l_res_link; + /* last lock in mode group */ + tmp = &list_entry(lock->l_sl_mode.prev, + struct ldlm_lock, + l_sl_mode)->l_res_link; + + if (lockmode_compat(lock->l_req_mode, req_mode)) continue; - } if (!work_list) RETURN(0); compat = 0; + + /* add locks of the mode group to @work_list as + * blocking locks for @req */ if (lock->l_blocking_ast) ldlm_add_ast_work_item(lock, req, work_list); - if (LDLM_SL_HEAD(&lock->l_sl_mode)) { - /* add all members of the mode group */ - do { - tmp = lock->l_res_link.next; - lock = list_entry(tmp, struct ldlm_lock, - l_res_link); + + { + struct list_head *head; + + head = &lock->l_sl_mode; + list_for_each_entry(lock, head, l_sl_mode) if (lock->l_blocking_ast) - ldlm_add_ast_work_item( - lock, req, work_list); - } while (!LDLM_SL_TAIL(&lock->l_sl_mode)); + ldlm_add_ast_work_item(lock, req, + work_list); } } diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c index dcae25431d4861cae71dd511cd3d056c41917ebd..bd89cfaa3e6d966515be1e18f9b52682c827420e 100644 --- a/lustre/ldlm/ldlm_pool.c +++ b/lustre/ldlm/ldlm_pool.c @@ -23,7 +23,8 @@ * license text for more details. */ -/* Idea of this code is rather simple. Each second, for each server namespace +/* + * Idea of this code is rather simple. Each second, for each server namespace * we have SLV - server lock volume which is calculated on current number of * granted locks, grant speed for past period, etc - that is, locking load. * This SLV number may be thought as a flow definition for simplicity. 
It is @@ -98,16 +99,24 @@ #ifdef HAVE_LRU_RESIZE_SUPPORT -/* 50 ldlm locks for 1MB of RAM. */ -#define LDLM_POOL_HOST_L ((num_physpages >> (20 - PAGE_SHIFT)) * 50) +/* + * 50 ldlm locks for 1MB of RAM. + */ +#define LDLM_POOL_HOST_L ((num_physpages >> (20 - CFS_PAGE_SHIFT)) * 50) -/* Default step in % for grant plan. */ +/* + * Default step in % for grant plan. + */ #define LDLM_POOL_GSP (10) -/* LDLM_POOL_GSP% of all locks is default GP. */ +/* + * LDLM_POOL_GSP% of all locks is default GP. + */ #define LDLM_POOL_GP(L) (((L) * LDLM_POOL_GSP) / 100) -/* Max age for locks on clients. */ +/* + * Max age for locks on clients. + */ #define LDLM_POOL_MAX_AGE (36000) #ifdef __KERNEL__ @@ -126,8 +135,10 @@ static inline __u64 dru(__u64 val, __u32 div) static inline __u64 ldlm_pool_slv_max(__u32 L) { - /* Allow to have all locks for 1 client for 10 hrs. - * Formula is the following: limit * 10h / 1 client. */ + /* + * Allow to have all locks for 1 client for 10 hrs. + * Formula is the following: limit * 10h / 1 client. + */ __u64 lim = L * LDLM_POOL_MAX_AGE / 1; return lim; } @@ -158,7 +169,11 @@ static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl) return container_of(pl, struct ldlm_namespace, ns_pool); } -/* Should be called under ->pl_lock taken */ +/** + * Recalculates next grant limit on passed \a pl. + * + * \pre ->pl_lock is locked. + */ static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) { int granted, grant_step, limit; @@ -170,14 +185,18 @@ static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) pl->pl_grant_plan = granted + grant_step; } -/* Should be called under ->pl_lock taken */ +/** + * Recalculates next SLV on passed \a pl. + * + * \pre ->pl_lock is locked. 
+ */ static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl) { int grant_usage, granted, grant_plan; __u64 slv, slv_factor; __u32 limit; - slv = ldlm_pool_get_slv(pl); + slv = pl->pl_server_lock_volume; grant_plan = pl->pl_grant_plan; limit = ldlm_pool_get_limit(pl); granted = atomic_read(&pl->pl_granted); @@ -186,12 +205,14 @@ static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl) if (grant_usage <= 0) grant_usage = 1; - /* Find out SLV change factor which is the ratio of grant usage + /* + * Find out SLV change factor which is the ratio of grant usage * from limit. SLV changes as fast as the ratio of grant plan * consumtion. The more locks from grant plan are not consumed * by clients in last interval (idle time), the faster grows * SLV. And the opposite, the more grant plan is over-consumed - * (load time) the faster drops SLV. */ + * (load time) the faster drops SLV. + */ slv_factor = (grant_usage * 100) / limit; if (2 * abs(granted - limit) > limit) { slv_factor *= slv_factor; @@ -206,13 +227,18 @@ static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl) slv = ldlm_pool_slv_min(limit); } - ldlm_pool_set_slv(pl, slv); + pl->pl_server_lock_volume = slv; } +/** + * Recalculates next stats on passed \a pl. + * + * \pre ->pl_lock is locked. + */ static inline void ldlm_pool_recalc_stats(struct ldlm_pool *pl) { - __u64 slv = ldlm_pool_get_slv(pl); int grant_plan = pl->pl_grant_plan; + __u64 slv = pl->pl_server_lock_volume; int granted = atomic_read(&pl->pl_granted); int grant_rate = atomic_read(&pl->pl_grant_rate); int cancel_rate = atomic_read(&pl->pl_cancel_rate); @@ -229,6 +255,32 @@ static inline void ldlm_pool_recalc_stats(struct ldlm_pool *pl) cancel_rate); } +/** + * Sets current SLV into obd accessible via ldlm_pl2ns(pl)->ns_obd. + */ +static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Set new SLV in obd field for using it later without accessing the + * pool. 
This is required to avoid race between sending reply to client + * with new SLV and cleanup server stack in which we can't guarantee + * that namespace is still alive. We know only that obd is alive as + * long as valid export is alive. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_slv = pl->pl_server_lock_volume; + write_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates all pool fields on passed \a pl. + * + * \pre ->pl_lock is not locked. + */ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) { time_t recalc_interval_sec; @@ -237,17 +289,30 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) spin_lock(&pl->pl_lock); recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; if (recalc_interval_sec > 0) { - /* Update statistics */ + /* + * Update statistics. + */ ldlm_pool_recalc_stats(pl); - /* Recalc SLV after last period. This should be done - * _before_ recalculating new grant plan. */ + /* + * Recalc SLV after last period. This should be done + * _before_ recalculating new grant plan. + */ ldlm_pool_recalc_slv(pl); - - /* Update grant_plan for new period. */ + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); + + /* + * Update grant_plan for new period. + */ ldlm_pool_recalc_grant_plan(pl); - /* Zero out all rates and speed for the last period. */ + /* + * Zero out all rates and speed for the last period. + */ atomic_set(&pl->pl_grant_rate, 0); atomic_set(&pl->pl_cancel_rate, 0); atomic_set(&pl->pl_grant_speed, 0); @@ -259,26 +324,37 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) RETURN(0); } -/* Our goal here is to decrease SLV the way to make a client hold - * @nr locks smaller in next 10h. */ +/** + * This function is used on server side as main entry point for memory + * pressure handling. It decreases SLV on \a pl according to passed + * \a nr and \a gfp_mask. 
+ * + * Our goal here is to decrease SLV such a way that clients hold \a nr + * locks smaller in next 10h. + */ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { __u32 limit; ENTRY; - /* VM is asking how many entries may be potentially freed. */ + /* + * VM is asking how many entries may be potentially freed. + */ if (nr == 0) RETURN(atomic_read(&pl->pl_granted)); - /* Client already canceled locks but server is already in shrinker - * and can't cancel anything. Let's catch this race. */ + /* + * Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. + */ if (atomic_read(&pl->pl_granted) == 0) RETURN(0); spin_lock(&pl->pl_lock); - /* We want shrinker to possibly cause cancelation of @nr locks from + /* + * We want shrinker to possibly cause cancelation of @nr locks from * clients or grant approximately @nr locks smaller next intervals. * * This is why we decresed SLV by @nr. This effect will only be as @@ -287,27 +363,69 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, * interval pool will either increase SLV if locks load is not high * or will keep on same level or even decrease again, thus, shrinker * decreased SLV will affect next recalc intervals and this way will - * make locking load lower. */ - if (nr < ldlm_pool_get_slv(pl)) { - ldlm_pool_set_slv(pl, ldlm_pool_get_slv(pl) - nr); + * make locking load lower. + */ + if (nr < pl->pl_server_lock_volume) { + pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr; } else { limit = ldlm_pool_get_limit(pl); - ldlm_pool_set_slv(pl, ldlm_pool_slv_min(limit)); + pl->pl_server_lock_volume = ldlm_pool_slv_min(limit); } + + /* + * Make sure that pool informed obd of last SLV changes. + */ + ldlm_srv_pool_push_slv(pl); spin_unlock(&pl->pl_lock); - /* We did not really free any memory here so far, it only will be - * freed later may be, so that we return 0 to not confuse VM. 
*/ + /* + * We did not really free any memory here so far, it only will be + * freed later may be, so that we return 0 to not confuse VM. + */ RETURN(0); } +/** + * Setup server side pool \a pl with passed \a limit. + */ static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) { + struct obd_device *obd; ENTRY; + + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL && obd != LP_POISON); + LASSERT(obd->obd_type != LP_POISON); + write_lock(&obd->obd_pool_lock); + obd->obd_pool_limit = limit; + write_unlock(&obd->obd_pool_lock); + ldlm_pool_set_limit(pl, limit); RETURN(0); } +/** + * Sets SLV and Limit from ldlm_pl2ns(pl)->ns_obd to passed \a pl. + */ +static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) +{ + struct obd_device *obd; + + /* + * Get new SLV and Limit from obd which is updated with coming + * RPCs. + */ + obd = ldlm_pl2ns(pl)->ns_obd; + LASSERT(obd != NULL); + read_lock(&obd->obd_pool_lock); + pl->pl_server_lock_volume = obd->obd_pool_slv; + ldlm_pool_set_limit(pl, obd->obd_pool_limit); + read_unlock(&obd->obd_pool_lock); +} + +/** + * Recalculates client side pool \a pl according to current SLV and Limit. + */ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) { time_t recalc_interval_sec; @@ -315,12 +433,21 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) spin_lock(&pl->pl_lock); + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + recalc_interval_sec = cfs_time_current_sec() - pl->pl_recalc_time; if (recalc_interval_sec > 0) { - /* Update statistics only every T */ + /* + * Update statistics only every T. + */ ldlm_pool_recalc_stats(pl); - /* Zero out grant/cancel rates and speed for last period. */ + /* + * Zero out grant/cancel rates and speed for last period. 
+ */ atomic_set(&pl->pl_grant_rate, 0); atomic_set(&pl->pl_cancel_rate, 0); atomic_set(&pl->pl_grant_speed, 0); @@ -330,34 +457,54 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) } spin_unlock(&pl->pl_lock); - /* Do not cancel locks in case lru resize is disabled for this ns */ + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) RETURN(0); - /* In the time of canceling locks on client we do not need to maintain + /* + * In the time of canceling locks on client we do not need to maintain * sharp timing, we only want to cancel locks asap according to new SLV. - * This may be called when SLV has changed much, this is why we do not - * take into account pl->pl_recalc_time here. */ + * It may be called when SLV has changed much, this is why we do not + * take into account pl->pl_recalc_time here. + */ RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC, LDLM_CANCEL_LRUR)); } +/** + * This function is main entry point for memory pressure handling on client side. + * Main goal of this function is to cancel some number of locks on passed \a pl + * according to \a nr and \a gfp_mask. + */ static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { ENTRY; - /* Do not cancel locks in case lru resize is disabled for this ns */ + /* + * Do not cancel locks in case lru resize is disabled for this ns. + */ if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) RETURN(0); - /* Find out how many locks may be released according to shrink - * policy. */ + /* + * Make sure that pool knows last SLV and Limit from obd. + */ + ldlm_cli_pool_pop_slv(pl); + + /* + * Find out how many locks may be released according to shrink + * policy. + */ if (nr == 0) RETURN(ldlm_cancel_lru_estimate(ldlm_pl2ns(pl), 0, 0, LDLM_CANCEL_SHRINK)); - /* Cancel @nr locks accoding to shrink policy */ + /* + * Cancel @nr locks according to shrink policy. 
+ */ RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC, LDLM_CANCEL_SHRINK)); } @@ -373,6 +520,10 @@ struct ldlm_pool_ops ldlm_cli_pool_ops = { .po_shrink = ldlm_cli_pool_shrink }; +/** + * Pool recalc wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ int ldlm_pool_recalc(struct ldlm_pool *pl) { int count; @@ -387,6 +538,10 @@ int ldlm_pool_recalc(struct ldlm_pool *pl) } EXPORT_SYMBOL(ldlm_pool_recalc); +/** + * Pool shrink wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + */ int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { @@ -409,8 +564,12 @@ int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, } EXPORT_SYMBOL(ldlm_pool_shrink); -/* The purpose of this function is to re-setup limit and maximal allowed - * slv according to the passed limit. */ +/** + * Pool setup wrapper. Will call either client or server pool recalc callback + * depending what pool \a pl is used. + * + * Sets passed \a limit into pool \a pl. 
+ */ int ldlm_pool_setup(struct ldlm_pool *pl, int limit) { ENTRY; @@ -427,11 +586,12 @@ static int lprocfs_rd_pool_state(char *page, char **start, off_t off, int granted, grant_rate, cancel_rate, grant_step; int nr = 0, grant_speed, grant_plan; struct ldlm_pool *pl = data; + __u64 slv, clv; __u32 limit; - __u64 slv; spin_lock(&pl->pl_lock); - slv = ldlm_pool_get_slv(pl); + slv = pl->pl_server_lock_volume; + clv = pl->pl_client_lock_volume; limit = ldlm_pool_get_limit(pl); grant_plan = pl->pl_grant_plan; grant_step = pl->pl_grant_step; @@ -444,6 +604,7 @@ static int lprocfs_rd_pool_state(char *page, char **start, off_t off, nr += snprintf(page + nr, count - nr, "LDLM pool state (%s):\n", pl->pl_name); nr += snprintf(page + nr, count - nr, " SLV: "LPU64"\n", slv); + nr += snprintf(page + nr, count - nr, " CLV: "LPU64"\n", clv); nr += snprintf(page + nr, count - nr, " LVF: %d\n", atomic_read(&pl->pl_lock_volume_factor)); @@ -639,13 +800,13 @@ int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, if (client == LDLM_NAMESPACE_SERVER) { pl->pl_ops = &ldlm_srv_pool_ops; ldlm_pool_set_limit(pl, LDLM_POOL_HOST_L); - ldlm_pool_set_slv(pl, ldlm_pool_slv_max(LDLM_POOL_HOST_L)); + pl->pl_server_lock_volume = ldlm_pool_slv_max(LDLM_POOL_HOST_L); } else { - ldlm_pool_set_slv(pl, 1); + pl->pl_server_lock_volume = 1; ldlm_pool_set_limit(pl, 1); pl->pl_ops = &ldlm_cli_pool_ops; } - + pl->pl_client_lock_volume = 0; rc = ldlm_pool_proc_init(pl); if (rc) RETURN(rc); @@ -660,17 +821,28 @@ void ldlm_pool_fini(struct ldlm_pool *pl) { ENTRY; ldlm_pool_proc_fini(pl); - pl->pl_ops = NULL; + + /* + * Pool should not be used after this point. We can't free it here as + * it lives in struct ldlm_namespace, but still interested in catching + * any abnormal using cases. + */ + POISON(pl, 0x5a, sizeof(*pl)); EXIT; } EXPORT_SYMBOL(ldlm_pool_fini); +/** + * Add new taken ldlm lock \a lock into pool \a pl accounting. 
+ */ void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) { - /* FLOCK locks are special in a sense that they are almost never + /* + * FLOCK locks are special in a sense that they are almost never * cancelled, instead special kind of lock is used to drop them. * also there is no LRU for flock locks, so no point in tracking - * them anyway */ + * them anyway. + */ if (lock->l_resource->lr_type == LDLM_FLOCK) return; @@ -682,18 +854,26 @@ void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); - /* Do not do pool recalc for client side as all locks which + /* + * Do not do pool recalc for client side as all locks which * potentially may be canceled has already been packed into * enqueue/cancel rpc. Also we do not want to run out of stack - * with too long call paths. */ + * with too long call paths. + */ if (ns_is_server(ldlm_pl2ns(pl))) ldlm_pool_recalc(pl); EXIT; } EXPORT_SYMBOL(ldlm_pool_add); +/** + * Remove ldlm lock \a lock from pool \a pl accounting. + */ void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) { + /* + * Filter out FLOCK locks. Read above comment in ldlm_pool_add(). + */ if (lock->l_resource->lr_type == LDLM_FLOCK) return; ENTRY; @@ -710,33 +890,89 @@ void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) } EXPORT_SYMBOL(ldlm_pool_del); -/* ->pl_lock should be taken. */ +/** + * Returns current \a pl SLV. + * + * \pre ->pl_lock is not locked. + */ __u64 ldlm_pool_get_slv(struct ldlm_pool *pl) { - return pl->pl_server_lock_volume; + __u64 slv; + spin_lock(&pl->pl_lock); + slv = pl->pl_server_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; } EXPORT_SYMBOL(ldlm_pool_get_slv); -/* ->pl_lock should be taken. */ +/** + * Sets passed \a slv to \a pl. + * + * \pre ->pl_lock is not locked. 
+ */ void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) { + spin_lock(&pl->pl_lock); pl->pl_server_lock_volume = slv; + spin_unlock(&pl->pl_lock); } EXPORT_SYMBOL(ldlm_pool_set_slv); +/** + * Returns current \a pl CLV. + * + * \pre ->pl_lock is not locked. + */ +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + __u64 slv; + spin_lock(&pl->pl_lock); + slv = pl->pl_client_lock_volume; + spin_unlock(&pl->pl_lock); + return slv; +} +EXPORT_SYMBOL(ldlm_pool_get_clv); + +/** + * Sets passed \a clv to \a pl. + * + * \pre ->pl_lock is not locked. + */ +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + spin_lock(&pl->pl_lock); + pl->pl_client_lock_volume = clv; + spin_unlock(&pl->pl_lock); +} +EXPORT_SYMBOL(ldlm_pool_set_clv); + +/** + * Returns current \a pl limit. + */ __u32 ldlm_pool_get_limit(struct ldlm_pool *pl) { return atomic_read(&pl->pl_limit); } EXPORT_SYMBOL(ldlm_pool_get_limit); +/** + * Sets passed \a limit to \a pl. + */ void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) { atomic_set(&pl->pl_limit, limit); } EXPORT_SYMBOL(ldlm_pool_set_limit); -/* Server side is only enabled for kernel space for now. */ +/** + * Returns current LVF from \a pl. + */ +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return atomic_read(&pl->pl_lock_volume_factor); +} +EXPORT_SYMBOL(ldlm_pool_get_lvf); + #ifdef __KERNEL__ static int ldlm_pool_granted(struct ldlm_pool *pl) { @@ -759,9 +995,11 @@ void ldlm_pools_wakeup(void) } EXPORT_SYMBOL(ldlm_pools_wakeup); -/* Cancel @nr locks from all namespaces (if possible). Returns number of +/* + * Cancel \a nr locks from all namespaces (if possible). Returns number of * cached locks after shrink is finished. All namespaces are asked to - * cancel approximately equal amount of locks. */ + * cancel approximately equal amount of locks to keep balancing. 
+ */ static int ldlm_pools_shrink(ldlm_side_t client, int nr, unsigned int gfp_mask) { @@ -771,10 +1009,12 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr, if (nr != 0 && !(gfp_mask & __GFP_FS)) return -1; - CDEBUG(D_DLMTRACE, "request to shrink %d %s locks from all pools\n", + CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n", nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server"); - /* Find out how many resources we may release. */ + /* + * Find out how many resources we may release. + */ for (nr_ns = atomic_read(ldlm_namespace_nr(client)); nr_ns > 0; nr_ns--) { @@ -783,9 +1023,9 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr, mutex_up(ldlm_namespace_lock(client)); return 0; } - ns = ldlm_namespace_first(client); + ns = ldlm_namespace_first_locked(client); ldlm_namespace_get(ns); - ldlm_namespace_move(ns, client); + ldlm_namespace_move_locked(ns, client); mutex_up(ldlm_namespace_lock(client)); total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask); ldlm_namespace_put(ns, 1); @@ -794,25 +1034,31 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr, if (nr == 0 || total == 0) return total; - /* Shrink at least ldlm_namespace_nr(client) namespaces. */ + /* + * Shrink at least ldlm_namespace_nr(client) namespaces. + */ for (nr_ns = atomic_read(ldlm_namespace_nr(client)); nr_ns > 0; nr_ns--) { int cancel, nr_locks; - /* Do not call shrink under ldlm_namespace_lock(client) */ + /* + * Do not call shrink under ldlm_namespace_lock(client) + */ mutex_down(ldlm_namespace_lock(client)); if (list_empty(ldlm_namespace_list(client))) { mutex_up(ldlm_namespace_lock(client)); - /* If list is empty, we can't return any @cached > 0, + /* + * If list is empty, we can't return any @cached > 0, * that probably would cause needless shrinker - * call. */ + * call. 
+ */ cached = 0; break; } - ns = ldlm_namespace_first(client); + ns = ldlm_namespace_first_locked(client); ldlm_namespace_get(ns); - ldlm_namespace_move(ns, client); + ldlm_namespace_move_locked(ns, client); mutex_up(ldlm_namespace_lock(client)); nr_locks = ldlm_pool_granted(&ns->ns_pool); @@ -840,9 +1086,13 @@ void ldlm_pools_recalc(ldlm_side_t client) struct ldlm_namespace *ns; int nr, equal = 0; - /* No need to setup pool limit for client pools. */ + /* + * No need to setup pool limit for client pools. + */ if (client == LDLM_NAMESPACE_SERVER) { - /* Check all modest namespaces first. */ + /* + * Check all modest namespaces first. + */ mutex_down(ldlm_namespace_lock(client)); list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) @@ -854,16 +1104,20 @@ void ldlm_pools_recalc(ldlm_side_t client) if (l == 0) l = 1; - /* Set the modest pools limit equal to their avg granted - * locks + 5%. */ + /* + * Set the modest pools limit equal to their avg granted + * locks + 5%. + */ l += dru(l * LDLM_POOLS_MODEST_MARGIN, 100); ldlm_pool_setup(&ns->ns_pool, l); nr_l += l; nr_p++; } - /* Make sure that modest namespaces did not eat more that 2/3 - * of limit */ + /* + * Make sure that modest namespaces did not eat more that 2/3 + * of limit. + */ if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { CWARN("\"Modest\" pools eat out 2/3 of server locks " "limit (%d of %lu). This means that you have too " @@ -872,7 +1126,9 @@ void ldlm_pools_recalc(ldlm_side_t client) equal = 1; } - /* The rest is given to greedy namespaces. */ + /* + * The rest is given to greedy namespaces. + */ list_for_each_entry(ns, ldlm_namespace_list(client), ns_list_chain) { @@ -880,14 +1136,18 @@ void ldlm_pools_recalc(ldlm_side_t client) continue; if (equal) { - /* In the case 2/3 locks are eaten out by + /* + * In the case 2/3 locks are eaten out by * modest pools, we re-setup equal limit - * for _all_ pools. */ + * for _all_ pools. 
+ */ l = LDLM_POOL_HOST_L / atomic_read(ldlm_namespace_nr(client)); } else { - /* All the rest of greedy pools will have - * all locks in equal parts.*/ + /* + * All the rest of greedy pools will have + * all locks in equal parts. + */ l = (LDLM_POOL_HOST_L - nr_l) / (atomic_read(ldlm_namespace_nr(client)) - nr_p); @@ -897,24 +1157,30 @@ void ldlm_pools_recalc(ldlm_side_t client) mutex_up(ldlm_namespace_lock(client)); } - /* Recalc at least ldlm_namespace_nr(client) namespaces. */ + /* + * Recalc at least ldlm_namespace_nr(client) namespaces. + */ for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) { - /* Lock the list, get first @ns in the list, getref, move it + /* + * Lock the list, get first @ns in the list, getref, move it * to the tail, unlock and call pool recalc. This way we avoid * calling recalc under @ns lock what is really good as we get * rid of potential deadlock on client nodes when canceling - * locks synchronously. */ + * locks synchronously. + */ mutex_down(ldlm_namespace_lock(client)); if (list_empty(ldlm_namespace_list(client))) { mutex_up(ldlm_namespace_lock(client)); break; } - ns = ldlm_namespace_first(client); + ns = ldlm_namespace_first_locked(client); ldlm_namespace_get(ns); - ldlm_namespace_move(ns, client); + ldlm_namespace_move_locked(ns, client); mutex_up(ldlm_namespace_lock(client)); - /* After setup is done - recalc the pool. */ + /* + * After setup is done - recalc the pool. + */ ldlm_pool_recalc(&ns->ns_pool); ldlm_namespace_put(ns, 1); } @@ -937,12 +1203,16 @@ static int ldlm_pools_thread_main(void *arg) while (1) { struct l_wait_info lwi; - /* Recal all pools on this tick. */ + /* + * Recal all pools on this tick. + */ ldlm_pools_recalc(LDLM_NAMESPACE_SERVER); ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT); - /* Wait until the next check time, or until we're - * stopped. */ + /* + * Wait until the next check time, or until we're + * stopped. 
+ */ lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD), NULL, NULL); l_wait_event(thread->t_ctl_waitq, (thread->t_flags & @@ -982,8 +1252,10 @@ static int ldlm_pools_thread_start(void) init_completion(&ldlm_pools_comp); cfs_waitq_init(&ldlm_pools_thread->t_ctl_waitq); - /* CLONE_VM and CLONE_FILES just avoid a needless copy, because we - * just drop the VM and FILES in ptlrpc_daemonize() right away. */ + /* + * CLONE_VM and CLONE_FILES just avoid a needless copy, because we + * just drop the VM and FILES in ptlrpc_daemonize() right away. + */ rc = cfs_kernel_thread(ldlm_pools_thread_main, ldlm_pools_thread, CLONE_VM | CLONE_FILES); if (rc < 0) { @@ -1010,9 +1282,11 @@ static void ldlm_pools_thread_stop(void) ldlm_pools_thread->t_flags = SVC_STOPPING; cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq); - /* Make sure that pools thread is finished before freeing @thread. + /* + * Make sure that pools thread is finished before freeing @thread. * This fixes possible race and oops due to accessing freed memory - * in pools thread. */ + * in pools thread. 
+ */ wait_for_completion(&ldlm_pools_comp); OBD_FREE_PTR(ldlm_pools_thread); ldlm_pools_thread = NULL; @@ -1107,6 +1381,18 @@ void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) } EXPORT_SYMBOL(ldlm_pool_set_slv); +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl) +{ + return 1; +} +EXPORT_SYMBOL(ldlm_pool_get_clv); + +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) +{ + return; +} +EXPORT_SYMBOL(ldlm_pool_set_clv); + __u32 ldlm_pool_get_limit(struct ldlm_pool *pl) { return 0; @@ -1119,6 +1405,12 @@ void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit) } EXPORT_SYMBOL(ldlm_pool_set_limit); +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl) +{ + return 0; +} +EXPORT_SYMBOL(ldlm_pool_get_lvf); + int ldlm_pools_init(void) { return 0; diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index aa3d1ae7b2da664e712ce672f94d87fde6774ce4..7a8bcf9dc2f8104e727a201aeb44e55e3915c6ff 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -433,19 +433,23 @@ int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, lock->l_req_mode = newmode; } - if (reply->lock_desc.l_resource.lr_name.name[0] != - lock->l_resource->lr_name.name[0]) { - CDEBUG(D_INFO, "remote intent success, locking %ld " - "instead of %ld\n", - (long)reply->lock_desc.l_resource.lr_name.name[0], - (long)lock->l_resource->lr_name.name[0]); - - ldlm_lock_change_resource(ns, lock, + if (memcmp(reply->lock_desc.l_resource.lr_name.name, + lock->l_resource->lr_name.name, + sizeof(struct ldlm_res_id))) { + CDEBUG(D_INFO, "remote intent success, locking " + "("LPU64"/"LPU64"/"LPU64") instead of " + "("LPU64"/"LPU64"/"LPU64")\n", + reply->lock_desc.l_resource.lr_name.name[0], + reply->lock_desc.l_resource.lr_name.name[1], + reply->lock_desc.l_resource.lr_name.name[2], + lock->l_resource->lr_name.name[0], + lock->l_resource->lr_name.name[1], + lock->l_resource->lr_name.name[2]); + + rc = ldlm_lock_change_resource(ns, lock, 
reply->lock_desc.l_resource.lr_name); - if (lock->l_resource == NULL) { - LBUG(); + if (rc || lock->l_resource == NULL) GOTO(cleanup, rc = -ENOMEM); - } LDLM_DEBUG(lock, "client-side enqueue, new resource"); } if (with_policy) @@ -513,7 +517,7 @@ cleanup: static inline int ldlm_req_handles_avail(struct obd_export *exp, int *size, int bufcount, int off) { - int avail = min_t(int, LDLM_MAXREQSIZE, PAGE_SIZE - 512); + int avail = min_t(int, LDLM_MAXREQSIZE, CFS_PAGE_SIZE - 512); avail -= lustre_msg_size(class_exp2cliimp(exp)->imp_msg_magic, bufcount, size); @@ -568,8 +572,10 @@ struct ptlrpc_request *ldlm_prep_elc_req(struct obd_export *exp, int version, pack = avail; size[bufoff] = ldlm_request_bufsize(pack, opc); } + req = ptlrpc_prep_req(class_exp2cliimp(exp), version, opc, bufcount, size, NULL); + req->rq_export = class_export_get(exp); if (exp_connect_cancelset(exp) && req) { if (canceloff) { dlm = lustre_msg_buf(req->rq_reqmsg, bufoff, @@ -618,7 +624,8 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, struct ldlm_reply *reply; int size[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREQ_OFF] = sizeof(*body), - [DLM_REPLY_REC_OFF] = lvb_len }; + [DLM_REPLY_REC_OFF] = lvb_len ? lvb_len : + sizeof(struct ost_lvb) }; int is_replay = *flags & LDLM_FL_REPLAY; int req_passed_in = 1, rc, err; struct ptlrpc_request *req; @@ -679,10 +686,10 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, } else { req = *reqp; LASSERTF(lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF) >= - sizeof(*body), "buflen[%d] = %d, not "LPSZ"\n", + sizeof(*body), "buflen[%d] = %d, not %d\n", DLM_LOCKREQ_OFF, lustre_msg_buflen(req->rq_reqmsg, DLM_LOCKREQ_OFF), - sizeof(*body)); + (int)sizeof(*body)); } lock->l_conn_export = exp; @@ -698,7 +705,7 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, /* Continue as normal. 
*/ if (!req_passed_in) { size[DLM_LOCKREPLY_OFF] = sizeof(*reply); - ptlrpc_req_set_repsize(req, 2 + (lvb_len > 0), size); + ptlrpc_req_set_repsize(req, 3, size); } /* @@ -1004,22 +1011,30 @@ static inline struct ldlm_pool *ldlm_imp2pl(struct obd_import *imp) int ldlm_cli_update_pool(struct ptlrpc_request *req) { + struct obd_device *obd; __u64 old_slv, new_slv; - struct ldlm_pool *pl; __u32 new_limit; ENTRY; - if (!imp_connect_lru_resize(req->rq_import)) + if (unlikely(!req->rq_import || !req->rq_import->imp_obd || + !imp_connect_lru_resize(req->rq_import))) + { + /* + * Do nothing for corner cases. + */ RETURN(0); + } - /* In some cases RPC may contain slv and limit zeroed out. This is + /* + * In some cases RPC may contain slv and limit zeroed out. This is * the case when server does not support lru resize feature. This is * also possible in some recovery cases when server side reqs have no * ref to obd export and thus access to server side namespace is no - * possible. */ + * possible. + */ if (lustre_msg_get_slv(req->rq_repmsg) == 0 || lustre_msg_get_limit(req->rq_repmsg) == 0) { - DEBUG_REQ(D_HA, req, "zero SLV or Limit found " + DEBUG_REQ(D_HA, req, "Zero SLV or Limit found " "(SLV: "LPU64", Limit: %u)", lustre_msg_get_slv(req->rq_repmsg), lustre_msg_get_limit(req->rq_repmsg)); @@ -1028,30 +1043,41 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req) new_limit = lustre_msg_get_limit(req->rq_repmsg); new_slv = lustre_msg_get_slv(req->rq_repmsg); - pl = ldlm_imp2pl(req->rq_import); - - spin_lock(&pl->pl_lock); - old_slv = ldlm_pool_get_slv(pl); - ldlm_pool_set_slv(pl, new_slv); - ldlm_pool_set_limit(pl, new_limit); - - /* Check if we need to wakeup pools thread for fast SLV change. + obd = req->rq_import->imp_obd; + + /* + * Set new SLV and Limit to obd fields to make accessible for pool + * thread. We do not access obd_namespace and pool directly here + * as there is no reliable way to make sure that they are still + * alive in cleanup time. 
Evil races are possible which may cause + * oops in that time. + */ + write_lock(&obd->obd_pool_lock); + old_slv = obd->obd_pool_slv; + obd->obd_pool_slv = new_slv; + obd->obd_pool_limit = new_limit; + write_unlock(&obd->obd_pool_lock); + + /* + * Check if we need to wakeup pools thread for fast SLV change. * This is only done when threads period is noticably long like - * 10s or more. */ + * 10s or more. + */ #if defined(__KERNEL__) && (LDLM_POOLS_THREAD_PERIOD >= 10) - { + if (old_slv > 0) { __u64 fast_change = old_slv * LDLM_POOLS_FAST_SLV_CHANGE; do_div(fast_change, 100); - /* Wake up pools thread only if SLV has changed more than + /* + * Wake up pools thread only if SLV has changed more than * 50% since last update. In this case we want to react asap. * Otherwise it is no sense to wake up pools as they are - * re-calculated every LDLM_POOLS_THREAD_PERIOD anyways. */ + * re-calculated every LDLM_POOLS_THREAD_PERIOD anyways. + */ if (old_slv > new_slv && old_slv - new_slv > fast_change) ldlm_pools_wakeup(); } #endif - spin_unlock(&pl->pl_lock); RETURN(0); } EXPORT_SYMBOL(ldlm_cli_update_pool); @@ -1203,10 +1229,8 @@ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, if (count && added >= count) return LDLM_POLICY_KEEP_LOCK; - spin_lock(&pl->pl_lock); slv = ldlm_pool_get_slv(pl); - lvf = atomic_read(&pl->pl_lock_volume_factor); - spin_unlock(&pl->pl_lock); + lvf = ldlm_pool_get_lvf(pl); la = cfs_duration_sec(cfs_time_sub(cur, lock->l_last_used)); @@ -1214,6 +1238,9 @@ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, /* Stop when slv is not yet come from server or * lv is smaller than it is. */ lv = lvf * la * unused; + + /* Inform pool about current CLV to see it via proc. */ + ldlm_pool_set_clv(pl, lv); return (slv == 1 || lv < slv) ? 
LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index df5bc2cbd47fe8ce6a9fdeaebc5f714b851ddf1b..3649a93e154bc518d06c808fd3c09cf5580da730 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -40,11 +40,11 @@ atomic_t ldlm_srv_namespace_nr = ATOMIC_INIT(0); atomic_t ldlm_cli_namespace_nr = ATOMIC_INIT(0); struct semaphore ldlm_srv_namespace_lock; -struct list_head ldlm_srv_namespace_list = +struct list_head ldlm_srv_namespace_list = CFS_LIST_HEAD_INIT(ldlm_srv_namespace_list); struct semaphore ldlm_cli_namespace_lock; -struct list_head ldlm_cli_namespace_list = +struct list_head ldlm_cli_namespace_list = CFS_LIST_HEAD_INIT(ldlm_cli_namespace_list); cfs_proc_dir_entry_t *ldlm_type_proc_dir = NULL; @@ -111,10 +111,10 @@ err: void ldlm_proc_cleanup(void) { - if (ldlm_svc_proc_dir) + if (ldlm_svc_proc_dir) lprocfs_remove(&ldlm_svc_proc_dir); - if (ldlm_ns_proc_dir) + if (ldlm_ns_proc_dir) lprocfs_remove(&ldlm_ns_proc_dir); if (ldlm_type_proc_dir) @@ -136,27 +136,29 @@ static int lprocfs_wr_lru_size(struct file *file, const char *buffer, unsigned long count, void *data) { struct ldlm_namespace *ns = data; - char dummy[MAX_STRING_SIZE + 1], *end; + char dummy[MAX_STRING_SIZE + 1] = { '\0' }, *end; unsigned long tmp; int lru_resize; - dummy[MAX_STRING_SIZE] = '\0'; - if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + if (count >= sizeof(dummy) || count == 0) + return -EINVAL; + + if (copy_from_user(dummy, buffer, count)) return -EFAULT; - if (count == 6 && memcmp(dummy, "clear", 5) == 0) { + if (strncmp(dummy, "clear", 5) == 0) { CDEBUG(D_DLMTRACE, "dropping all unused locks from namespace %s\n", ns->ns_name); if (ns_connect_lru_resize(ns)) { int canceled, unused = ns->ns_nr_unused; - + /* Try to cancel all @ns_nr_unused locks. 
*/ - canceled = ldlm_cancel_lru(ns, unused, LDLM_SYNC, + canceled = ldlm_cancel_lru(ns, unused, LDLM_SYNC, LDLM_CANCEL_PASSED); if (canceled < unused) { CERROR("not all requested locks are canceled, " - "requested: %d, canceled: %d\n", unused, + "requested: %d, canceled: %d\n", unused, canceled); return -EINVAL; } @@ -175,35 +177,39 @@ static int lprocfs_wr_lru_size(struct file *file, const char *buffer, return -EINVAL; } lru_resize = (tmp == 0); - + if (ns_connect_lru_resize(ns)) { if (!lru_resize) ns->ns_max_unused = (unsigned int)tmp; - + if (tmp > ns->ns_nr_unused) tmp = ns->ns_nr_unused; tmp = ns->ns_nr_unused - tmp; - - CDEBUG(D_DLMTRACE, "changing namespace %s unused locks from %u to %u\n", + + CDEBUG(D_DLMTRACE, + "changing namespace %s unused locks from %u to %u\n", ns->ns_name, ns->ns_nr_unused, (unsigned int)tmp); - ldlm_cancel_lru(ns, (unsigned int)tmp, LDLM_ASYNC, LDLM_CANCEL_PASSED); - + ldlm_cancel_lru(ns, tmp, LDLM_ASYNC, LDLM_CANCEL_PASSED); + if (!lru_resize) { - CDEBUG(D_DLMTRACE, "disable lru_resize for namespace %s\n", + CDEBUG(D_DLMTRACE, + "disable lru_resize for namespace %s\n", ns->ns_name); ns->ns_connect_flags &= ~OBD_CONNECT_LRU_RESIZE; } } else { - CDEBUG(D_DLMTRACE, "changing namespace %s max_unused from %u to %u\n", + CDEBUG(D_DLMTRACE, + "changing namespace %s max_unused from %u to %u\n", ns->ns_name, ns->ns_max_unused, (unsigned int)tmp); ns->ns_max_unused = (unsigned int)tmp; ldlm_cancel_lru(ns, 0, LDLM_ASYNC, LDLM_CANCEL_PASSED); - - /* Make sure that originally lru resize was supported before + + /* Make sure that originally lru resize was supported before * turning it on here. 
*/ - if (lru_resize && + if (lru_resize && (ns->ns_orig_connect_flags & OBD_CONNECT_LRU_RESIZE)) { - CDEBUG(D_DLMTRACE, "enable lru_resize for namespace %s\n", + CDEBUG(D_DLMTRACE, + "enable lru_resize for namespace %s\n", ns->ns_name); ns->ns_connect_flags |= OBD_CONNECT_LRU_RESIZE; } @@ -247,14 +253,14 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns) lock_vars[0].read_fptr = lprocfs_rd_lru_size; lock_vars[0].write_fptr = lprocfs_wr_lru_size; lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); - + snprintf(lock_name, MAX_STRING_SIZE, "%s/shrink_thumb", ns->ns_name); lock_vars[0].data = ns; lock_vars[0].read_fptr = lprocfs_rd_uint; lock_vars[0].write_fptr = lprocfs_wr_uint; lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); - + snprintf(lock_name, MAX_STRING_SIZE, "%s/lru_max_age", ns->ns_name); lock_vars[0].data = &ns->ns_max_age; @@ -289,8 +295,9 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns) #define ldlm_proc_namespace(ns) do {} while (0) #endif /* LPROCFS */ -struct ldlm_namespace *ldlm_namespace_new(char *name, ldlm_side_t client, - ldlm_appetite_t apt) +struct ldlm_namespace * +ldlm_namespace_new(struct obd_device *obd, char *name, + ldlm_side_t client, ldlm_appetite_t apt) { struct ldlm_namespace *ns = NULL; struct list_head *bucket; @@ -318,9 +325,14 @@ struct ldlm_namespace *ldlm_namespace_new(char *name, ldlm_side_t client, ns->ns_shrink_thumb = LDLM_LOCK_SHRINK_THUMB; ns->ns_appetite = apt; + + LASSERT(obd != NULL); + ns->ns_obd = obd; + strcpy(ns->ns_name, name); CFS_INIT_LIST_HEAD(&ns->ns_root_list); + CFS_INIT_LIST_HEAD(&ns->ns_list_chain); ns->ns_refcount = 0; ns->ns_client = client; spin_lock_init(&ns->ns_hash_lock); @@ -336,6 +348,7 @@ struct ldlm_namespace *ldlm_namespace_new(char *name, ldlm_side_t client, CFS_INIT_LIST_HEAD(bucket); CFS_INIT_LIST_HEAD(&ns->ns_unused_list); + CFS_INIT_LIST_HEAD(&ns->ns_list_chain); ns->ns_nr_unused = 0; ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; ns->ns_max_age = LDLM_DEFAULT_MAX_ALIVE; @@ -346,7 +359,7 
@@ struct ldlm_namespace *ldlm_namespace_new(char *name, ldlm_side_t client, ldlm_proc_namespace(ns); idx = atomic_read(ldlm_namespace_nr(client)); - + rc = ldlm_pool_init(&ns->ns_pool, ns, idx, client); if (rc) { CERROR("Can't initialize lock pool, rc %d\n", rc); @@ -354,12 +367,7 @@ struct ldlm_namespace *ldlm_namespace_new(char *name, ldlm_side_t client, } at_init(&ns->ns_at_estimate, ldlm_enqueue_min, 0); - - mutex_down(ldlm_namespace_lock(client)); - list_add(&ns->ns_list_chain, ldlm_namespace_list(client)); - atomic_inc(ldlm_namespace_nr(client)); - mutex_up(ldlm_namespace_lock(client)); - + ldlm_namespace_register(ns, client); RETURN(ns); out_proc: ldlm_namespace_cleanup(ns, 0); @@ -369,7 +377,7 @@ out_hash: out_ns: OBD_FREE_PTR(ns); out_ref: - ldlm_put_ref(0); + ldlm_put_ref(); RETURN(NULL); } @@ -388,10 +396,9 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, int local_only = (flags & LDLM_FL_LOCAL_ONLY); ENTRY; - do { struct ldlm_lock *lock = NULL; - + /* first, we look for non-cleaned-yet lock * all cleaned locks are marked by CLEANED flag */ lock_res(res); @@ -405,7 +412,7 @@ static void cleanup_resource(struct ldlm_resource *res, struct list_head *q, lock->l_flags |= LDLM_FL_CLEANED; break; } - + if (lock == NULL) { unlock_res(res); break; @@ -498,25 +505,11 @@ int ldlm_namespace_cleanup(struct ldlm_namespace *ns, int flags) return ELDLM_OK; } -int ldlm_namespace_free_prior(struct ldlm_namespace *ns) +static int __ldlm_namespace_free(struct ldlm_namespace *ns, int force) { ENTRY; - if (!ns) - RETURN(ELDLM_OK); - - mutex_down(ldlm_namespace_lock(ns->ns_client)); - /* - * Some asserts and possibly other parts of code still using - * list_empty(&ns->ns_list_chain). This is why it is important - * to use list_del_init() here. 
- */ - list_del_init(&ns->ns_list_chain); - atomic_dec(ldlm_namespace_nr(ns->ns_client)); - ldlm_pool_fini(&ns->ns_pool); - mutex_up(ldlm_namespace_lock(ns->ns_client)); - /* At shutdown time, don't call the cancellation callback */ - ldlm_namespace_cleanup(ns, 0); + ldlm_namespace_cleanup(ns, force ? LDLM_FL_LOCAL_ONLY : 0); if (ns->ns_refcount > 0) { struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); @@ -524,16 +517,30 @@ int ldlm_namespace_free_prior(struct ldlm_namespace *ns) CDEBUG(D_DLMTRACE, "dlm namespace %s free waiting on refcount %d\n", ns->ns_name, ns->ns_refcount); +force_wait: + if (force) + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL); + rc = l_wait_event(ns->ns_waitq, ns->ns_refcount == 0, &lwi); - if (ns->ns_refcount) - LCONSOLE_ERROR_MSG(0x139, "Lock manager: wait for %s " - "namespace cleanup aborted with %d " - "resources in use. (%d)\nI'm going " - "to try to clean up anyway, but I " - "might need a reboot of this node.\n", - ns->ns_name, (int) ns->ns_refcount, - rc); + + /* Forced cleanups should be able to reclaim all references, + * so it's safe to wait forever... we can't leak locks... 
*/ + if (force && rc == -ETIMEDOUT) { + LCONSOLE_ERROR("Forced cleanup waiting for %s " + "namespace with %d resources in use, " + "(rc=%d)\n", ns->ns_name, + ns->ns_refcount, rc); + GOTO(force_wait, rc); + } + + if (ns->ns_refcount) { + LCONSOLE_ERROR("Cleanup waiting for %s namespace " + "with %d resources in use, (rc=%d)\n", + ns->ns_name, + ns->ns_refcount, rc); + RETURN(ELDLM_NAMESPACE_EXISTS); + } CDEBUG(D_DLMTRACE, "dlm namespace %s free done waiting\n", ns->ns_name); } @@ -541,11 +548,48 @@ int ldlm_namespace_free_prior(struct ldlm_namespace *ns) RETURN(ELDLM_OK); } -int ldlm_namespace_free_post(struct ldlm_namespace *ns, int force) +void ldlm_namespace_free_prior(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) { + int rc; ENTRY; - if (!ns) - RETURN(ELDLM_OK); + if (!ns) { + EXIT; + return; + } + + /* Make sure that nobody can find this ns in its list. */ + ldlm_namespace_unregister(ns, ns->ns_client); + + /* Can fail with -EINTR when force == 0 in which case try harder */ + rc = __ldlm_namespace_free(ns, force); + if (rc != ELDLM_OK) { + if (imp) { + ptlrpc_disconnect_import(imp, 0); + ptlrpc_invalidate_import(imp); + } + + /* With all requests dropped and the import inactive + * we are guaranteed all references will be dropped. */ + rc = __ldlm_namespace_free(ns, 1); + LASSERT(rc == 0); + } + EXIT; +} + +void ldlm_namespace_free_post(struct ldlm_namespace *ns) +{ + ENTRY; + if (!ns) { + EXIT; + return; + } + + /* Fini pool _before_ parent proc dir is removed. This is important + * as ldlm_pool_fini() removes own proc dir which is child to @dir. + * Removing it after @dir may cause oops. 
*/ + ldlm_pool_fini(&ns->ns_pool); #ifdef LPROCFS { @@ -561,22 +605,21 @@ int ldlm_namespace_free_post(struct ldlm_namespace *ns, int force) #endif OBD_VFREE(ns->ns_hash, sizeof(*ns->ns_hash) * RES_HASH_SIZE); OBD_FREE(ns->ns_name, strlen(ns->ns_name) + 1); - /* - * @ns should be not on list in this time, otherwise this will cause - * issues realted to using freed @ns in pools thread. - */ + + /* @ns should be not on list in this time, otherwise this will cause + * issues realted to using freed @ns in pools thread. */ LASSERT(list_empty(&ns->ns_list_chain)); OBD_FREE_PTR(ns); - ldlm_put_ref(force); - RETURN(ELDLM_OK); + ldlm_put_ref(); + EXIT; } /* Cleanup the resource, and free namespace. * bug 12864: - * Deadlock issue: - * proc1: destroy import - * class_disconnect_export(grab cl_sem) -> - * -> ldlm_namespace_free -> + * Deadlock issue: + * proc1: destroy import + * class_disconnect_export(grab cl_sem) -> + * -> ldlm_namespace_free -> * -> lprocfs_remove(grab _lprocfs_lock). * proc2: read proc info * lprocfs_fops_read(grab _lprocfs_lock) -> @@ -585,17 +628,18 @@ int ldlm_namespace_free_post(struct ldlm_namespace *ns, int force) * So that I have to split the ldlm_namespace_free into two parts - the first * part ldlm_namespace_free_prior is used to cleanup the resource which is * being used; the 2nd part ldlm_namespace_free_post is used to unregister the - * lprocfs entries, and then free memory. It will be called w/o cli->cl_sem + * lprocfs entries, and then free memory. It will be called w/o cli->cl_sem * held. 
*/ -int ldlm_namespace_free(struct ldlm_namespace *ns, int force) +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, + int force) { - ldlm_namespace_free_prior(ns); - ldlm_namespace_free_post(ns, force); - return ELDLM_OK; + ldlm_namespace_free_prior(ns, imp, force); + ldlm_namespace_free_post(ns); } -void ldlm_namespace_get_nolock(struct ldlm_namespace *ns) +void ldlm_namespace_get_locked(struct ldlm_namespace *ns) { LASSERT(ns->ns_refcount >= 0); ns->ns_refcount++; @@ -604,11 +648,11 @@ void ldlm_namespace_get_nolock(struct ldlm_namespace *ns) void ldlm_namespace_get(struct ldlm_namespace *ns) { spin_lock(&ns->ns_hash_lock); - ldlm_namespace_get_nolock(ns); + ldlm_namespace_get_locked(ns); spin_unlock(&ns->ns_hash_lock); } -void ldlm_namespace_put_nolock(struct ldlm_namespace *ns, int wakeup) +void ldlm_namespace_put_locked(struct ldlm_namespace *ns, int wakeup) { LASSERT(ns->ns_refcount > 0); ns->ns_refcount--; @@ -619,12 +663,37 @@ void ldlm_namespace_put_nolock(struct ldlm_namespace *ns, int wakeup) void ldlm_namespace_put(struct ldlm_namespace *ns, int wakeup) { spin_lock(&ns->ns_hash_lock); - ldlm_namespace_put_nolock(ns, wakeup); + ldlm_namespace_put_locked(ns, wakeup); spin_unlock(&ns->ns_hash_lock); } +/* Register @ns in the list of namespaces */ +void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client) +{ + mutex_down(ldlm_namespace_lock(client)); + LASSERT(list_empty(&ns->ns_list_chain)); + list_add(&ns->ns_list_chain, ldlm_namespace_list(client)); + atomic_inc(ldlm_namespace_nr(client)); + mutex_up(ldlm_namespace_lock(client)); +} + +/* Unregister @ns from the list of namespaces */ +void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client) +{ + mutex_down(ldlm_namespace_lock(client)); + LASSERT(!list_empty(&ns->ns_list_chain)); + /* + * Some asserts and possibly other parts of code still using + * list_empty(&ns->ns_list_chain). 
This is why it is important + * to use list_del_init() here. + */ + list_del_init(&ns->ns_list_chain); + atomic_dec(ldlm_namespace_nr(client)); + mutex_up(ldlm_namespace_lock(client)); +} + /* Should be called under ldlm_namespace_lock(client) taken */ -void ldlm_namespace_move(struct ldlm_namespace *ns, ldlm_side_t client) +void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client) { LASSERT(!list_empty(&ns->ns_list_chain)); LASSERT_SEM_LOCKED(ldlm_namespace_lock(client)); @@ -632,11 +701,11 @@ void ldlm_namespace_move(struct ldlm_namespace *ns, ldlm_side_t client) } /* Should be called under ldlm_namespace_lock(client) taken */ -struct ldlm_namespace *ldlm_namespace_first(ldlm_side_t client) +struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client) { LASSERT_SEM_LOCKED(ldlm_namespace_lock(client)); LASSERT(!list_empty(ldlm_namespace_list(client))); - return container_of(ldlm_namespace_list(client)->next, + return container_of(ldlm_namespace_list(client)->next, struct ldlm_namespace, ns_list_chain); } @@ -747,7 +816,7 @@ ldlm_resource_add(struct ldlm_namespace *ns, struct ldlm_resource *parent, bucket = ns->ns_hash + hash; list_add(&res->lr_hash, bucket); ns->ns_resources++; - ldlm_namespace_get_nolock(ns); + ldlm_namespace_get_locked(ns); if (parent == NULL) { list_add(&res->lr_childof, &ns->ns_root_list); @@ -846,7 +915,7 @@ void __ldlm_resource_putref_final(struct ldlm_resource *res) /* Pass 0 as second argument to not wake up ->ns_waitq yet, will do it * later. 
*/ - ldlm_namespace_put_nolock(ns, 0); + ldlm_namespace_put_locked(ns, 0); list_del_init(&res->lr_hash); list_del_init(&res->lr_childof); @@ -909,9 +978,9 @@ void ldlm_resource_add_lock(struct ldlm_resource *res, struct list_head *head, { check_res_locked(res); - ldlm_resource_dump(D_OTHER, res); - CDEBUG(D_OTHER, "About to add this lock:\n"); - ldlm_lock_dump(D_OTHER, lock, 0); + ldlm_resource_dump(D_INFO, res); + CDEBUG(D_INFO, "About to add this lock:\n"); + ldlm_lock_dump(D_INFO, lock, 0); if (lock->l_destroyed) { CDEBUG(D_OTHER, "Lock destroyed, not adding to resource\n"); @@ -988,8 +1057,8 @@ void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) if (!((libcfs_debug | D_ERROR) & level)) return; - CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n", - ns->ns_name, ns->ns_refcount, + CDEBUG(level, "--- Namespace: %s (rc: %d, side: %s)\n", + ns->ns_name, ns->ns_refcount, ns_is_client(ns) ? "client" : "server"); if (cfs_time_before(cfs_time_current(), ns->ns_next_dump)) @@ -1007,7 +1076,7 @@ void ldlm_namespace_dump(int level, struct ldlm_namespace *ns) lock_res(res); ldlm_resource_dump(level, res); unlock_res(res); - + spin_lock(&ns->ns_hash_lock); tmp = tmp->next; ldlm_resource_putref_locked(res); diff --git a/lustre/liblustre/dir.c b/lustre/liblustre/dir.c index e6788bfe607e69380147c3f84446a1f7c6cfdec1..2aa932d9f31f75fde07b2739995bdb3411f7534a 100644 --- a/lustre/liblustre/dir.c +++ b/lustre/liblustre/dir.c @@ -102,7 +102,7 @@ static int llu_dir_do_readpage(struct inode *inode, struct page *page) } ldlm_lock_dump_handle(D_OTHER, &lockh); - mdc_pack_fid(&mdc_fid, st->st_ino, lli->lli_st_generation, S_IFDIR); + ll_pack_fid(&mdc_fid, st->st_ino, lli->lli_st_generation, S_IFDIR); offset = (__u64)page->index << CFS_PAGE_SHIFT; rc = mdc_readpage(sbi->ll_mdc_exp, &mdc_fid, diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index bd67c0f87cf7b5ae2b6dbd8efd821b4028f2d7d6..db21132f51a26c67968885e7e601b6236ceabb14 100644 --- 
a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -82,14 +82,14 @@ void llu_prepare_mdc_op_data(struct mdc_op_data *data, if (i1) { ll_i2gids(data->suppgids, i1, i2); - ll_inode2fid(&data->fid1, i1); + llu_inode2fid(&data->fid1, i1); }else { ll_i2gids(data->suppgids, i2, i1); - ll_inode2fid(&data->fid1, i2); + llu_inode2fid(&data->fid1, i2); } if (i2) - ll_inode2fid(&data->fid2, i2); + llu_inode2fid(&data->fid2, i2); else memset(&data->fid2, 0, sizeof(data->fid2)); @@ -319,6 +319,7 @@ int llu_mdc_close(struct obd_export *mdc_exp, struct inode *inode) struct ptlrpc_request *req = NULL; struct obd_client_handle *och = &fd->fd_mds_och; struct obdo obdo; + struct mdc_op_data data = { { 0 } }; int rc, valid; ENTRY; @@ -343,7 +344,8 @@ int llu_mdc_close(struct obd_export *mdc_exp, struct inode *inode) obdo.o_flags = MDS_BFLAG_UNCOMMITTED_WRITES; obdo.o_valid |= OBD_MD_FLFLAGS; } - rc = mdc_close(mdc_exp, &obdo, och, &req); + data.fid1 = lli->lli_fid; + rc = mdc_close(mdc_exp, &data, &obdo, och, &req); if (rc == EAGAIN) { /* We are the last writer, so the MDS has instructed us to get * the file size and any write cookies, then close again. 
*/ diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h index e943aa46eca2711a1fb231d37189c423e7d693b9..6bcf4a7470311e35a021e03601fadff118ae8071 100644 --- a/lustre/liblustre/llite_lib.h +++ b/lustre/liblustre/llite_lib.h @@ -139,7 +139,7 @@ do { \ #define LL_LOOKUP_POSITIVE 1 #define LL_LOOKUP_NEGATIVE 2 -static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode) +static inline void llu_inode2fid(struct ll_fid *fid, struct inode *inode) { *fid = llu_i2info(inode)->lli_fid; } diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index e096a0e84afe55217b5e66ac0fcfc9cefbb9f4bd..e49d5181d89ce88daf0aba632b1d2454791008f0 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -94,7 +94,7 @@ static int llu_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock char name[16]; struct ldlm_lock *lock; struct lov_stripe_md *lsm; - } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm }; + } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock, .lsm = lsm }; __u32 stripe, vallen = sizeof(stripe); int rc; ENTRY; diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 24b40580ad409a30afb359d5af9aebe2cfc455ba..adfac108f6dc7db81ec4856ee2d87936ea0aeb00 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -135,6 +135,8 @@ void llu_update_inode(struct inode *inode, struct mds_body *body, if (body->valid & OBD_MD_FLID) st->st_ino = body->ino; + if (body->valid & OBD_MD_FLGENER) + lli->lli_st_generation = body->generation; if (body->valid & OBD_MD_FLATIME && body->atime > LTIME_S(st->st_atime)) LTIME_S(st->st_atime) = body->atime; @@ -171,16 +173,8 @@ void llu_update_inode(struct inode *inode, struct mds_body *body, st->st_blocks = body->blocks; if (body->valid & OBD_MD_FLFLAGS) lli->lli_st_flags = body->flags; - if (body->valid & OBD_MD_FLGENER) - lli->lli_st_generation = body->generation; - /* fillin fid */ - if (body->valid & OBD_MD_FLID) - lli->lli_fid.id = body->ino; - if (body->valid 
& OBD_MD_FLGENER) - lli->lli_fid.generation = body->generation; - if (body->valid & OBD_MD_FLTYPE) - lli->lli_fid.f_type = body->mode & S_IFMT; + lli->lli_fid = body->fid1; } void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid) @@ -431,7 +425,7 @@ static int llu_inode_revalidate(struct inode *inode) ealen = obd_size_diskmd(sbi->ll_osc_exp, NULL); valid |= OBD_MD_FLEASIZE; } - ll_inode2fid(&fid, inode); + llu_inode2fid(&fid, inode); rc = mdc_getattr(sbi->ll_mdc_exp, &fid, valid, ealen, &req); if (rc) { CERROR("failure %d inode %llu\n", rc, @@ -532,7 +526,7 @@ void llu_clear_inode(struct inode *inode) (long long)llu_i2stat(inode)->st_ino, lli->lli_st_generation, inode); - ll_inode2fid(&fid, inode); + llu_inode2fid(&fid, inode); clear_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &(lli->lli_flags)); mdc_change_cbdata(sbi->ll_mdc_exp, &fid, null_if_equal, inode); @@ -898,7 +892,7 @@ static int llu_readlink_internal(struct inode *inode, RETURN(0); } - ll_inode2fid(&fid, inode); + llu_inode2fid(&fid, inode); rc = mdc_getattr(sbi->ll_mdc_exp, &fid, OBD_MD_LINKNAME, symlen, request); if (rc) { @@ -1814,9 +1808,7 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md) } /* try to find existing inode */ - fid.id = md->body->ino; - fid.generation = md->body->generation; - fid.f_type = md->body->mode & S_IFMT; + fid = md->body->fid1; inode = _sysio_i_find(fs, &fileid); if (inode) { @@ -1934,7 +1926,7 @@ llu_fsswop_mount(const char *source, CERROR("MDC %s: not setup or attached\n", mdc); GOTO(out_free, err = -EINVAL); } - obd_set_info_async(obd->obd_self_export, strlen("async"), "async", + obd_set_info_async(obd->obd_self_export, sizeof(KEY_ASYNC), KEY_ASYNC, sizeof(async), &async, NULL); ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION | @@ -1967,7 +1959,7 @@ llu_fsswop_mount(const char *source, CERROR("OSC %s: not setup or attached\n", osc); GOTO(out_mdc, err = -EINVAL); } - obd_set_info_async(obd->obd_self_export, strlen("async"), "async", + 
obd_set_info_async(obd->obd_self_export, sizeof(KEY_ASYNC), KEY_ASYNC, sizeof(async), &async, NULL); obd->obd_upcall.onu_owner = &sbi->ll_lco; diff --git a/lustre/liblustre/tests/sanity.c b/lustre/liblustre/tests/sanity.c index 27f7211d4359c75fbb682e17045d402ef8ff9fd7..230990731f20a009cef0070b99cebc78f7868a92 100644 --- a/lustre/liblustre/tests/sanity.c +++ b/lustre/liblustre/tests/sanity.c @@ -52,7 +52,7 @@ struct timeval start; extern char *lustre_path; -#define ENTRY(str) \ +#define ENTER(str) \ do { \ char buf[100]; \ int len; \ @@ -94,6 +94,7 @@ int t1(char *name) { char path[MAX_PATH_LENGTH] = ""; + ENTER("touch+unlink"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t1", lustre_path); if (opt_verbose) @@ -108,7 +109,7 @@ int t2(char *name) { char path[MAX_PATH_LENGTH] = ""; - ENTRY("mkdir/rmdir"); + ENTER("mkdir/rmdir"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t2", lustre_path); t_mkdir(path); @@ -120,7 +121,7 @@ int t3(char *name) { char path[MAX_PATH_LENGTH] = ""; - ENTRY("regular stat"); + ENTER("regular stat"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t3", lustre_path); t_touch(path); @@ -133,7 +134,7 @@ int t4(char *name) { char path[MAX_PATH_LENGTH] = ""; - ENTRY("dir stat"); + ENTER("dir stat"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t4", lustre_path); t_mkdir(path); @@ -147,7 +148,7 @@ int t6(char *name) char path[MAX_PATH_LENGTH] = ""; char path2[MAX_PATH_LENGTH] = ""; - ENTRY("symlink"); + ENTER("symlink"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t6", lustre_path); snprintf(path2, MAX_PATH_LENGTH, "%s/test_t6_link", lustre_path); @@ -167,7 +168,7 @@ int t6b(char *name) char *tmp; int fd; - ENTRY("symlink + chdir and open"); + ENTER("symlink + chdir and open"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t6b", lustre_path); snprintf(path2, MAX_PATH_LENGTH, "%s/test_t6b_link", lustre_path); @@ -199,7 +200,7 @@ int t7(char *name) char path[MAX_PATH_LENGTH] = ""; int rc; - ENTRY("mknod"); + ENTER("mknod"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t7", 
lustre_path); if (geteuid() != 0) { @@ -220,7 +221,7 @@ int t8(char *name) { char path[MAX_PATH_LENGTH] = ""; - ENTRY("chmod"); + ENTER("chmod"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t8", lustre_path); /* Check file. */ @@ -243,7 +244,7 @@ int t9(char *name) char path[MAX_PATH_LENGTH] = ""; char path2[MAX_PATH_LENGTH] = ""; - ENTRY("hard link"); + ENTER("hard link"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t9", lustre_path); snprintf(path2, MAX_PATH_LENGTH, "%s/test_t9_link", lustre_path); @@ -266,7 +267,7 @@ int t10(char *name) char rename2[MAX_PATH_LENGTH] = ""; char rename3[MAX_PATH_LENGTH] = ""; - ENTRY("rename"); + ENTER("rename"); snprintf(dir1, MAX_PATH_LENGTH, "%s/test_t10_dir1", lustre_path); snprintf(dir2, MAX_PATH_LENGTH, "%s/test_t10_dir2", lustre_path); snprintf(path1, MAX_PATH_LENGTH, "%s/test_t10_reg1", lustre_path); @@ -294,7 +295,7 @@ int t11(char *name) char *base=lustre_path; char path[MAX_PATH_LENGTH], path2[MAX_PATH_LENGTH]; int i, j, level = 5, nreg = 5; - ENTRY("deep tree"); + ENTER("deep tree"); safe_strncpy(path, base, MAX_PATH_LENGTH); @@ -330,7 +331,7 @@ int t12(char *name) char dir[MAX_PATH_LENGTH] = ""; char buf[1024*128]; int fd; - ENTRY("empty directory readdir"); + ENTER("empty directory readdir"); snprintf(dir, MAX_PATH_LENGTH, "%s/test_t12_dir", lustre_path); t_mkdir(dir); @@ -349,7 +350,7 @@ int t13(char *name) const int nfiles = 20; char *prefix = "test13_filename_prefix_"; int fd, i; - ENTRY("multiple entries directory readdir"); + ENTER("multiple entries directory readdir"); snprintf(dir, MAX_PATH_LENGTH, "%s/test_t13_dir/", lustre_path); t_mkdir(dir); @@ -380,7 +381,7 @@ int t14(char *name) struct dirent64 *ent; int fd, i, rc, pos, index; loff_t base = 0; - ENTRY(">1 block(4k) directory readdir"); + ENTER(">1 block(4k) directory readdir"); snprintf(dir, MAX_PATH_LENGTH, "%s/test_t14_dir/", lustre_path); rc = mkdir(dir, 0755); @@ -437,7 +438,7 @@ int t15(char *name) { char file[MAX_PATH_LENGTH] = ""; int fd; - 
ENTRY("open-stat-close"); + ENTER("open-stat-close"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t15_file", lustre_path); t_touch(file); @@ -451,7 +452,7 @@ int t15(char *name) int t16(char *name) { char file[MAX_PATH_LENGTH] = ""; - ENTRY("small-write-read"); + ENTER("small-write-read"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t16_file", lustre_path); t_echo_create(file, "aaaaaaaaaaaaaaaaaaaaaa"); @@ -464,7 +465,7 @@ int t17(char *name) { char file[MAX_PATH_LENGTH] = ""; int fd; - ENTRY("open-unlink without close"); + ENTER("open-unlink without close"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t17_file", lustre_path); fd = open(file, O_WRONLY | O_CREAT, 0666); @@ -482,7 +483,7 @@ int t18(char *name) char buf[128]; int fd, i; struct stat statbuf[3]; - ENTRY("write should change mtime/ctime"); + ENTER("write should change mtime/ctime"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t18_file", lustre_path); for (i = 0; i < 3; i++) { @@ -521,7 +522,7 @@ int t18b(char *name) char file[MAX_PATH_LENGTH] = ""; int i; struct stat statbuf[3]; - ENTRY("utime should change mtime/atime/ctime"); + ENTER("utime should change mtime/atime/ctime"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t18b_file", lustre_path); t_touch(file); @@ -569,7 +570,7 @@ int t19(char *name) char file[MAX_PATH_LENGTH] = ""; int fd; int result; - ENTRY("open(O_TRUNC) should truncate file to 0-length"); + ENTER("open(O_TRUNC) should truncate file to 0-length"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t19_file", lustre_path); t_echo_create(file, "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); @@ -594,7 +595,7 @@ int t20(char *name) struct iovec iov[2]; char buf[100]; ssize_t ret; - ENTRY("trap app's general bad pointer for file i/o"); + ENTER("trap app's general bad pointer for file i/o"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t20_file", lustre_path); fd = open(file, O_RDWR|O_CREAT, (mode_t)0666); @@ -676,7 +677,7 @@ int t21(char *name) .l_whence = SEEK_SET, }; - ENTRY("basic fcntl support"); + ENTER("basic fcntl 
support"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t21_file", lustre_path); fd = open(file, O_RDWR|O_CREAT, (mode_t)0666); @@ -711,7 +712,7 @@ int t22(char *name) char *str = "1234567890"; char buf[100]; ssize_t ret; - ENTRY("make sure O_APPEND take effect"); + ENTER("make sure O_APPEND take effect"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t22_file", lustre_path); fd = open(file, O_RDWR|O_CREAT|O_APPEND, (mode_t)0666); @@ -775,7 +776,7 @@ int t23(char *name) long long ret; loff_t off; - ENTRY("handle seek > 2GB"); + ENTER("handle seek > 2GB"); snprintf(path, MAX_PATH_LENGTH, "%s/f%s", lustre_path, name); fd = open(path, O_WRONLY | O_CREAT | O_LARGEFILE, 0666); @@ -878,14 +879,14 @@ static int pages_io(int xfer, loff_t pos) /* create sample data */ for (i = 0, buf = buf_alloc; i < _npages; i++) { - for (j = 0; j < PAGE_SIZE/sizeof(int); j++, buf++) { + for (j = 0; j < CFS_PAGE_SIZE/sizeof(int); j++, buf++) { *buf = rand(); } } /* compute checksum */ for (i = 0, buf = buf_alloc; i < _npages; i++) { - for (j = 0; j < PAGE_SIZE/sizeof(int); j++, buf++) { + for (j = 0; j < CFS_PAGE_SIZE/sizeof(int); j++, buf++) { check_sum[i] += *buf; } } @@ -903,9 +904,9 @@ static int pages_io(int xfer, loff_t pos) } gettimeofday(&tw1, NULL); for (i = 0, buf = buf_alloc; i < _npages; - i += xfer, buf += xfer * PAGE_SIZE / sizeof(int)) { - rc = write(fd, buf, PAGE_SIZE * xfer); - if (rc != PAGE_SIZE * xfer) { + i += xfer, buf += xfer * CFS_PAGE_SIZE / sizeof(int)) { + rc = write(fd, buf, CFS_PAGE_SIZE * xfer); + if (rc != CFS_PAGE_SIZE * xfer) { printf("write error (i %d, rc %d): %s\n", i, rc, strerror(errno)); return(1); @@ -923,9 +924,9 @@ static int pages_io(int xfer, loff_t pos) } gettimeofday(&tr1, NULL); for (i = 0, buf = buf_alloc; i < _npages; - i += xfer, buf += xfer * PAGE_SIZE / sizeof(int)) { - rc = read(fd, buf, PAGE_SIZE * xfer); - if (rc != PAGE_SIZE * xfer) { + i += xfer, buf += xfer * CFS_PAGE_SIZE / sizeof(int)) { + rc = read(fd, buf, CFS_PAGE_SIZE * xfer); + if (rc 
!= CFS_PAGE_SIZE * xfer) { printf("read error (i %d, rc %d): %s\n", i, rc, strerror(errno)); return(1); @@ -936,7 +937,7 @@ static int pages_io(int xfer, loff_t pos) /* compute checksum */ for (i = 0, buf = buf_alloc; i < _npages; i++) { int sum = 0; - for (j = 0; j < PAGE_SIZE/sizeof(int); j++, buf++) { + for (j = 0; j < CFS_PAGE_SIZE/sizeof(int); j++, buf++) { sum += *buf; } if (sum != check_sum[i]) { @@ -951,8 +952,8 @@ static int pages_io(int xfer, loff_t pos) tw = (tw2.tv_sec - tw1.tv_sec) * 1000000 + (tw2.tv_usec - tw1.tv_usec); tr = (tr2.tv_sec - tr1.tv_sec) * 1000000 + (tr2.tv_usec - tr1.tv_usec); printf(" (R:%.3fM/s, W:%.3fM/s)\n", - (_npages * PAGE_SIZE) / (tw / 1000000.0) / (1024 * 1024), - (_npages * PAGE_SIZE) / (tr / 1000000.0) / (1024 * 1024)); + (_npages * CFS_PAGE_SIZE) / (tw / 1000000.0) / (1024 * 1024), + (_npages * CFS_PAGE_SIZE) / (tr / 1000000.0) / (1024 * 1024)); if (data_error) return 1; @@ -965,7 +966,7 @@ int t50(char *name) int np = 1; loff_t offset = 0; - ENTRY("4k aligned i/o sanity"); + ENTER("4k aligned i/o sanity"); while (np <= _npages) { printf("%3d per xfer(total %d)...\t", np, _npages); fflush(stdout); @@ -983,7 +984,7 @@ int t50b(char *name) int i; loff_t offset; - ENTRY("4k un-aligned i/o sanity"); + ENTER("4k un-aligned i/o sanity"); for (i = 0; i < sizeof(off_array)/sizeof(loff_t); i++) { offset = off_array[i]; printf("16 per xfer(total %d), offset %10lld...\t", @@ -1010,7 +1011,7 @@ int t51(char *name) off_t size; int result; - ENTRY("truncate() should truncate file to proper length"); + ENTER("truncate() should truncate file to proper length"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t51_file", lustre_path); for (size = 0; size < T51_NR * T51_STEP; size += T51_STEP) { @@ -1059,7 +1060,7 @@ int t52(char *name) time_t diff; int fd, i; - ENTRY("atime should be updated during read"); + ENTER("atime should be updated during read"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t52_file", lustre_path); t_echo_create(file, "check 
atime update during read"); @@ -1098,7 +1099,7 @@ int t53(char *name) struct stat stat_buf; /* struct buffer to hold file info. */ time_t mtime, atime; - ENTRY("mtime/atime should be updated by utime() call"); + ENTER("mtime/atime should be updated by utime() call"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t53_file", lustre_path); t_echo_create(file, "check mtime/atime update by utime() call"); @@ -1135,7 +1136,7 @@ int t54(char *name) struct flock lock; int fd, err; - ENTRY("fcntl should return 0 when succeed in getting flock"); + ENTER("fcntl should return 0 when succeed in getting flock"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t54_file", lustre_path); t_echo_create(file, "fcntl should return 0 when succeed"); @@ -1180,7 +1181,7 @@ int t55(char *name) struct lov_user_ost_data *lo = NULL; int index, fd, buflen, rc; - ENTRY("setstripe/getstripe"); + ENTER("setstripe/getstripe"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t55", lustre_path); snprintf(file, MAX_PATH_LENGTH, "%s/test_t55/file_t55", lustre_path); @@ -1343,7 +1344,7 @@ int t56(char *name) ssize_t rc = 0; struct dirent dir; - ENTRY("getdirentries should fail if nbytes is too small"); + ENTER("getdirentries should fail if nbytes is too small"); /* Set count to be very small. 
The result should be EINVAL */ nbytes = 8; @@ -1377,8 +1378,8 @@ extern void __liblustre_cleanup_(void); void usage(char *cmd) { printf("\n" - "usage: %s [--only {test}] --target mgsnid:/fsname\n", - cmd); + "usage: %s [-o test][-e test][-v] --target mgsnid:/fsname\n", + cmd); printf(" %s --dumpfile dumpfile\n", cmd); exit(-1); } @@ -1424,21 +1425,28 @@ struct testlist { int main(int argc, char * const argv[]) { struct testlist *test; - int opt_index, c, rc = 0, numonly = 0; - char *only[100]; + int opt_index, c, rc = 0, numonly = 0, numexcept = 0; + char *only[100], *except[100]; static struct option long_opts[] = { {"dumpfile", 1, 0, 'd'}, {"only", 1, 0, 'o'}, + {"except", 1, 0, 'e'}, {"target", 1, 0, 't'}, {"verbose", 1, 0, 'v'}, {0, 0, 0, 0} }; - while ((c = getopt_long(argc, argv, "d:o:t:v", long_opts, &opt_index)) != -1) { + while ((c = getopt_long(argc, argv, "d:e:o:t:v", long_opts, &opt_index)) != -1) { switch (c) { case 'd': setenv(ENV_LUSTRE_DUMPFILE, optarg, 1); break; + case 'e': + if (numexcept == 0) + printf("Not running test(s): "); + printf("%s ", optarg); + except[numexcept++] = optarg; + break; case 'o': if (numonly == 0) printf("Only running test(s): "); @@ -1468,7 +1476,7 @@ int main(int argc, char * const argv[]) __liblustre_setup_(); - buf_size = _npages * PAGE_SIZE; + buf_size = _npages * CFS_PAGE_SIZE; if (opt_verbose) printf("allocating %d bytes buffer\n", buf_size); buf_alloc = calloc(1, buf_size); @@ -1479,14 +1487,35 @@ int main(int argc, char * const argv[]) for (test = testlist; test->test != NULL; test++) { int run = 1, i; + int len, olen; - if (numonly > 0) { - int len; + if (numexcept > 0) { + len = strlen(test->name); + for (i = 0; i < numexcept; i++) { + olen = strlen(except[i]); + + if (len < olen) + continue; + + if (strncmp(except[i], test->name, olen) == 0) { + switch(test->name[olen]) { + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + case '8': case '9': + break; + default: + run = 0; + 
break; + } + } + } + } + if (numonly > 0) { run = 0; len = strlen(test->name); for (i = 0; i < numonly; i++) { - int olen = strlen(only[i]); + olen = strlen(only[i]); if (len < olen) continue; diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index 96e01b117c8bfdf0d7da45aa094ab01044df1581..4de18aefce5cbc05af237ef5fe62a6f9498da7ae 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -343,7 +343,7 @@ void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft) int ll_revalidate_it(struct dentry *de, int lookup_flags, struct lookup_intent *it) { - struct mdc_op_data op_data; + struct mdc_op_data op_data = { { 0 } }; struct ptlrpc_request *req = NULL; struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; struct obd_export *exp; @@ -440,9 +440,10 @@ int ll_revalidate_it(struct dentry *de, int lookup_flags, do_lock: it->it_create_mode &= ~current->fs->umask; - + it->it_flags |= O_CHECK_STALE; rc = mdc_intent_lock(exp, &op_data, NULL, 0, it, lookup_flags, &req, ll_mdc_blocking_ast, 0); + it->it_flags &= ~O_CHECK_STALE; if (it->it_op == IT_GETATTR && !first) ll_statahead_exit(de, rc); /* If req is NULL, then mdc_intent_lock only tried to do a lock match; @@ -593,8 +594,8 @@ out_sa: unlock_kernel(); handle = (flag) ? 
&ldd->lld_mnt_och : &ldd->lld_cwd_och; - rc = obd_pin(sbi->ll_mdc_exp, inode->i_ino, inode->i_generation, - inode->i_mode & S_IFMT, handle, flag); + rc = obd_pin(sbi->ll_mdc_exp, ll_inode_ll_fid(inode), + handle, flag); if (rc) { lock_kernel(); diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 5b9955ee5e084513ae9860152c60218d3ad27f88..5791e41dae16eb1667ade8cbadc6ae7ff6139830 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -47,32 +47,6 @@ #include <lustre_dlm.h> #include "llite_internal.h" -/* - * Directory entries are currently in the same format as ext2/ext3, but will - * be changed in the future to accomodate FIDs - */ -#define LL_DIR_NAME_LEN (255) - -static const int LL_DIR_PAD = 4; - -struct ll_dir_entry { - /* number of inode, referenced by this entry */ - __le32 lde_inode; - /* total record length, multiple of LL_DIR_PAD */ - __le16 lde_rec_len; - /* length of name */ - __u8 lde_name_len; - /* file type: regular, directory, device, etc. */ - __u8 lde_file_type; - /* name. 
NOT NUL-terminated */ - char lde_name[LL_DIR_NAME_LEN]; -}; - -static inline unsigned ll_dir_rec_len(unsigned name_len) -{ - return (name_len + 8 + LL_DIR_PAD - 1) & ~(LL_DIR_PAD - 1); -} - #ifndef HAVE_PAGE_CHECKED #ifdef HAVE_PG_FS_MISC #define PageChecked(page) test_bit(PG_fs_misc, &(page)->flags) @@ -97,7 +71,7 @@ static int ll_dir_readpage(struct file *file, struct page *page) CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off "LPU64"\n", inode->i_ino, inode->i_generation, inode, offset); - mdc_pack_fid(&mdc_fid, inode->i_ino, inode->i_generation, S_IFDIR); + ll_pack_fid(&mdc_fid, inode->i_ino, inode->i_generation, S_IFDIR); rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &mdc_fid, offset, page, &request); @@ -165,11 +139,6 @@ static int ll_dir_check_entry(struct inode *dir, struct ll_dir_entry *ent, return -EIO; } -static inline struct ll_dir_entry *ll_entry_at(void *base, unsigned offset) -{ - return (struct ll_dir_entry *)(base + offset); -} - static void ll_dir_check_page(struct inode *dir, struct page *page) { int err; @@ -231,8 +200,7 @@ static void ll_dir_check_page(struct inode *dir, struct page *page) struct page *ll_get_dir_page(struct inode *dir, unsigned long n) { - struct ldlm_res_id res_id = - { .name = { dir->i_ino, (__u64)dir->i_generation} }; + struct ldlm_res_id res_id; struct lustre_handle lockh; struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp); struct address_space *mapping = dir->i_mapping; @@ -240,6 +208,7 @@ struct page *ll_get_dir_page(struct inode *dir, unsigned long n) ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} }; int rc; + fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id); rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED, &res_id, LDLM_IBITS, &policy, LCK_CR, &lockh); if (!rc) { @@ -247,7 +216,7 @@ struct page *ll_get_dir_page(struct inode *dir, unsigned long n) struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_CR, ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir }; 
struct ptlrpc_request *request; - struct mdc_op_data data; + struct mdc_op_data data = { { 0 } }; ll_prepare_mdc_op_data(&data, dir, NULL, NULL, 0, 0, NULL); @@ -283,20 +252,11 @@ out_unlock: return page; fail: - kunmap(page); - page_cache_release(page); + ll_put_page(page); page = ERR_PTR(-EIO); goto out_unlock; } -/* - * p is at least 6 bytes before the end of page - */ -static inline struct ll_dir_entry *ll_dir_next_entry(struct ll_dir_entry *p) -{ - return ll_entry_at(p, le16_to_cpu(p->lde_rec_len)); -} - static inline unsigned ll_dir_validate_entry(char *base, unsigned offset, unsigned mask) { @@ -342,8 +302,8 @@ static unsigned char ll_dir_filetype_table[LL_DIR_FT_MAX] = { * 0: no live entries on this page. */ -int ll_readdir_page(char *addr, __u64 base, unsigned *offset, - filldir_t filldir, void *cookie) +static int ll_readdir_page(char *addr, __u64 base, unsigned *offset, + filldir_t filldir, void *cookie) { struct ll_dir_entry *de; char *end; @@ -365,7 +325,7 @@ int ll_readdir_page(char *addr, __u64 base, unsigned *offset, return nr; } -int ll_readdir(struct file *filp, void *dirent, filldir_t filldir) +static int ll_readdir_18(struct file *filp, void *dirent, filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; loff_t pos = filp->f_pos; @@ -429,8 +389,7 @@ int ll_readdir(struct file *filp, void *dirent, filldir_t filldir) } done = ll_readdir_page(kaddr, idx << CFS_PAGE_SHIFT, &offset, filldir, dirent); - kunmap(page); - page_cache_release(page); + ll_put_page(page); if (done > 0) /* * Some entries were sent to the user space, return @@ -451,6 +410,438 @@ int ll_readdir(struct file *filp, void *dirent, filldir_t filldir) RETURN(rc); } +/* + * Chain of hash overflow pages. + */ +struct ll_dir_chain { + /* XXX something. 
Later */ +}; + +static inline void ll_dir_chain_init(struct ll_dir_chain *chain) +{ +} + +static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) +{ +} + +static inline __u32 hash_x_index(__u32 value) +{ + return ((__u32)~0) - value; +} + +/* + * Layout of readdir pages, as transmitted on wire. + */ +struct lu_dirent { + struct lu_fid lde_fid; + __u64 lde_hash; + __u16 lde_reclen; + __u16 lde_namelen; + __u32 lde_padding; + char lde_name[0]; +}; + +struct lu_dirpage { + __u64 ldp_hash_start; + __u64 ldp_hash_end; + __u16 ldp_flags; + __u16 ldp_pad0; + __u32 ldp_pad1; + struct lu_dirent ldp_entries[0]; +}; + +enum lu_dirpage_flags { + LDF_EMPTY = 1 << 0 +}; + +static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) +{ + if (le16_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; +} + +static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) +{ + struct lu_dirent *next; + + if (le16_to_cpu(ent->lde_reclen) != 0) + next = ((void *)ent) + le16_to_cpu(ent->lde_reclen); + else + next = NULL; + + return next; +} + +static inline int lu_dirent_size(struct lu_dirent *ent) +{ + if (le16_to_cpu(ent->lde_reclen) == 0) { + return (sizeof(*ent) + + le16_to_cpu(ent->lde_namelen) + 3) & ~3; + } + return le16_to_cpu(ent->lde_reclen); +} + +#define DIR_END_OFF 0xfffffffffffffffeULL + +#ifdef HAVE_RW_TREE_LOCK +#define TREE_READ_LOCK_IRQ(mapping) read_lock_irq(&(mapping)->tree_lock) +#define TREE_READ_UNLOCK_IRQ(mapping) read_unlock_irq(&(mapping)->tree_lock) +#else +#define TREE_READ_LOCK_IRQ(mapping) spin_lock_irq(&(mapping)->tree_lock) +#define TREE_READ_UNLOCK_IRQ(mapping) spin_unlock_irq(&(mapping)->tree_lock) +#endif + +/* returns the page unlocked, but with a reference */ +static int ll_dir_readpage_20(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ptlrpc_request *request; + struct mdt_body *body; + struct ll_fid fid; + __u64 hash; + int rc; + ENTRY; + + hash = 
hash_x_index(page->index); + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) off %lu\n", + inode->i_ino, inode->i_generation, inode, (unsigned long)hash); + + ll_inode2fid(&fid, inode); + rc = mdc_readpage(ll_i2sbi(inode)->ll_mdc_exp, &fid, + hash, page, &request); + if (!rc) { + body = lustre_msg_buf(request->rq_repmsg, REPLY_REC_OFF, + sizeof(*body)); + /* Checked by mdc_readpage() */ + LASSERT(body != NULL); + + if (body->valid & OBD_MD_FLSIZE) { + ll_inode_size_lock(inode, 0); + i_size_write(inode, body->size); + ll_inode_size_unlock(inode, 0); + } + SetPageUptodate(page); + } + ptlrpc_req_finished(request); + + unlock_page(page); + EXIT; + return rc; +} + + +static void ll_check_page(struct inode *dir, struct page *page) +{ + /* XXX: check page format later */ + SetPageChecked(page); +} + + +/* + * Find, kmap and return page that contains given hash. + */ +static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash, + __u64 *start, __u64 *end) +{ + struct address_space *mapping = dir->i_mapping; + /* + * Complement of hash is used as an index so that + * radix_tree_gang_lookup() can be used to find a page with starting + * hash _smaller_ than one we are looking for. + */ + unsigned long offset = hash_x_index(hash); + struct page *page; + int found; + ENTRY; + + TREE_READ_LOCK_IRQ(mapping); + found = radix_tree_gang_lookup(&mapping->page_tree, + (void **)&page, offset, 1); + if (found > 0) { + struct lu_dirpage *dp; + + page_cache_get(page); + TREE_READ_UNLOCK_IRQ(mapping); + /* + * In contrast to find_lock_page() we are sure that directory + * page cannot be truncated (while DLM lock is held) and, + * hence, can avoid restart. + * + * In fact, page cannot be locked here at all, because + * ll_dir_readpage() does synchronous io. 
+ */ + wait_on_page(page); + if (PageUptodate(page)) { + dp = kmap(page); + *start = le64_to_cpu(dp->ldp_hash_start); + *end = le64_to_cpu(dp->ldp_hash_end); + LASSERT(*start <= hash); + if (hash > *end || (*end != *start && hash == *end)) { + kunmap(page); + lock_page(page); + ll_truncate_complete_page(page); + unlock_page(page); + page_cache_release(page); + page = NULL; + } + } else { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + + } else { + TREE_READ_UNLOCK_IRQ(mapping); + page = NULL; + } + RETURN(page); +} + +static struct page *ll_get_dir_page_20(struct inode *dir, __u64 hash, int exact, + struct ll_dir_chain *chain) +{ + struct ldlm_res_id res_id; + struct lustre_handle lockh; + struct obd_device *obddev = class_exp2obd(ll_i2sbi(dir)->ll_mdc_exp); + struct address_space *mapping = dir->i_mapping; + struct lu_dirpage *dp; + struct page *page; + ldlm_policy_data_t policy = {.l_inodebits = {MDS_INODELOCK_UPDATE} }; + ldlm_mode_t mode; + int rc; + __u64 start = 0; + __u64 end = 0; + ENTRY; + + fid_build_reg_res_name(ll_inode_lu_fid(dir), &res_id); + mode = LCK_PR; + rc = ldlm_lock_match(obddev->obd_namespace, LDLM_FL_BLOCK_GRANTED, + &res_id, LDLM_IBITS, &policy, mode, &lockh); + if (!rc) { + struct lookup_intent it = { .it_op = IT_READDIR }; + struct ldlm_enqueue_info einfo = { LDLM_IBITS, mode, + ll_mdc_blocking_ast, ldlm_completion_ast, NULL, dir }; + struct ptlrpc_request *request; + struct mdc_op_data op_data = { { 0 } }; + + ll_prepare_mdc_op_data(&op_data, dir, NULL, NULL, 0, 0, NULL); + + rc = mdc_enqueue(ll_i2sbi(dir)->ll_mdc_exp, &einfo, &it, + &op_data, &lockh, NULL, 0, 0); + + request = (struct ptlrpc_request *)it.d.lustre.it_data; + if (request) + ptlrpc_req_finished(request); + if (rc < 0) { + CERROR("lock enqueue: rc: %d\n", rc); + RETURN(ERR_PTR(rc)); + } + } + ldlm_lock_dump_handle(D_OTHER, &lockh); + + page = ll_dir_page_locate(dir, hash, &start, &end); + if (IS_ERR(page)) + GOTO(out_unlock, page); + + if (page != NULL) { + /* + * 
XXX nikita: not entirely correct handling of a corner case: + * suppose hash chain of entries with hash value HASH crosses + * border between pages P0 and P1. First both P0 and P1 are + * cached, seekdir() is called for some entry from the P0 part + * of the chain. Later P0 goes out of cache. telldir(HASH) + * happens and finds P1, as it starts with matching hash + * value. Remaining entries from P0 part of the chain are + * skipped. (Is that really a bug?) + * + * Possible solutions: 0. don't cache P1 in such case, handle + * it as an "overflow" page. 1. invalidate all pages at + * once. 2. use HASH|1 as an index for P1. + */ + if (exact && hash != start) { + /* + * readdir asked for a page starting _exactly_ from given + * hash, but the cache holds a stale page with smaller hash + * values: invalidate it and fetch a new one. The cached page + * came back kmap()ed, so drop it with ll_put_page(). + */ + CDEBUG(D_INFO, "Stale readpage page %p: %#lx != %#lx\n", + page, (unsigned long)hash, (unsigned long)start); + lock_page(page); + ll_truncate_complete_page(page); + unlock_page(page); + ll_put_page(page); + } else { + GOTO(hash_collision, page); + } + } + + page = read_cache_page(mapping, hash_x_index(hash), + (filler_t*)ll_dir_readpage_20, NULL); + if (IS_ERR(page)) + GOTO(out_unlock, page); + + wait_on_page(page); + (void)kmap(page); + if (!PageUptodate(page)) + goto fail; + if (!PageChecked(page)) + ll_check_page(dir, page); + if (PageError(page)) + goto fail; +hash_collision: + dp = page_address(page); + + start = le64_to_cpu(dp->ldp_hash_start); + end = le64_to_cpu(dp->ldp_hash_end); + if (end == start) { + LASSERT(start == hash); + CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end); + /* + * Fetch whole overflow chain... + * + * XXX not yet.
+ */ + goto fail; + } +out_unlock: + ldlm_lock_decref(&lockh, mode); + RETURN(page); + +fail: + ll_put_page(page); + page = ERR_PTR(-EIO); + goto out_unlock; +} + +static int ll_readdir_20(struct file *filp, void *cookie, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + __u64 pos = filp->f_pos; + struct page *page; + struct ll_dir_chain chain; + int rc; + int done; + int shift; + ENTRY; + + CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) pos %lu/%llu\n", + inode->i_ino, inode->i_generation, inode, + (unsigned long)pos, i_size_read(inode)); + + if (pos == DIR_END_OFF) + /* + * end-of-file. + */ + RETURN(0); + + rc = 0; + done = 0; + shift = 0; + ll_dir_chain_init(&chain); + + page = ll_get_dir_page_20(inode, pos, 0, &chain); + + while (rc == 0 && !done) { + struct lu_dirpage *dp; + struct lu_dirent *ent; + + if (!IS_ERR(page)) { + /* + * If page is empty (end of directory is reached), + * use this value. + */ + __u64 hash = DIR_END_OFF; + __u64 next; + + dp = page_address(page); + for (ent = lu_dirent_start(dp); ent != NULL && !done; + ent = lu_dirent_next(ent)) { + char *name; + int namelen; + struct lu_fid fid; + ino_t ino; + + hash = le64_to_cpu(ent->lde_hash); + namelen = le16_to_cpu(ent->lde_namelen); + + if (hash < pos) + /* + * Skip until we find target hash + * value. + */ + continue; + + if (namelen == 0) + /* + * Skip dummy record. + */ + continue; + + fid = ent->lde_fid; + name = ent->lde_name; + fid_le_to_cpu(&fid, &fid); + ino = ll_fid_build_ino(sbi, (struct ll_fid*)&fid); + + done = filldir(cookie, name, namelen, + (loff_t)hash, ino, DT_UNKNOWN); + } + next = le64_to_cpu(dp->ldp_hash_end); + ll_put_page(page); + if (!done) { + pos = next; + if (pos == DIR_END_OFF) + /* + * End of directory reached. + */ + done = 1; + else if (1 /* chain is exhausted */) + /* + * Normal case: continue to the next + * page.
+ */ + page = ll_get_dir_page_20(inode, pos, 1, + &chain); + else { + /* + * go into overflow page. + */ + } + } else { + pos = hash; + } + } else { + rc = PTR_ERR(page); + CERROR("error reading dir "DFID" at %lu: rc %d\n", + PFID(ll_inode_lu_fid(inode)), + (unsigned long)pos, rc); + } + } + + filp->f_pos = (loff_t)(__s32)pos; + filp->f_version = inode->i_version; + touch_atime(filp->f_vfsmnt, filp->f_dentry); + + ll_dir_chain_fini(&chain); + + RETURN(rc); +} + +static int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + + if (sbi->ll_mdc_exp->exp_connect_flags & OBD_CONNECT_FID) { + return ll_readdir_20(filp, cookie, filldir); + } else { + return ll_readdir_18(filp, cookie, filldir); + } +} + #define QCTL_COPY(out, in) \ do { \ Q_COPY(out, in, qc_cmd); \ @@ -461,7 +852,7 @@ do { \ Q_COPY(out, in, qc_dqblk); \ } while (0) -int ll_send_mgc_param(struct obd_export *mgc, char *string) +static int ll_send_mgc_param(struct obd_export *mgc, char *string) { struct mgs_send_param *msp; int rc = 0; @@ -471,7 +862,7 @@ int ll_send_mgc_param(struct obd_export *mgc, char *string) return -ENOMEM; strncpy(msp->mgs_param, string, MGS_PARAM_MAXLEN); - rc = obd_set_info_async(mgc, strlen(KEY_SET_INFO), KEY_SET_INFO, + rc = obd_set_info_async(mgc, sizeof(KEY_SET_INFO), KEY_SET_INFO, sizeof(struct mgs_send_param), msp, NULL); if (rc) CERROR("Failed to set parameter: %d\n", rc); @@ -480,7 +871,7 @@ int ll_send_mgc_param(struct obd_export *mgc, char *string) return rc; } -char *ll_get_fsname(struct inode *inode) +static char *ll_get_fsname(struct inode *inode) { struct lustre_sb_info *lsi = s2lsi(inode->i_sb); char *ptr, *fsname; @@ -501,7 +892,7 @@ int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default) { struct ll_sb_info *sbi = ll_i2sbi(inode); - struct mdc_op_data data; + struct mdc_op_data data = { { 0 } }; struct ptlrpc_request *req = NULL; 
struct lustre_sb_info *lsi = s2lsi(inode->i_sb); struct obd_device *mgc = lsi->lsi_mgc; @@ -615,9 +1006,9 @@ int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmmp, * little endian. We convert it to host endian before * passing it to userspace. */ - if (lmm->lmm_magic == __swab32(LOV_MAGIC)) { + if ((LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) && + (cpu_to_le32(LOV_MAGIC) == lmm->lmm_magic)) lustre_swab_lov_user_md((struct lov_user_md *)lmm); - } out: *lmmp = lmm; *lmm_size = lmmsize; @@ -817,6 +1208,15 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, if (rc) GOTO(free_lmm, rc = -EFAULT); + if (lmm->lmm_magic != LOV_USER_MAGIC) + GOTO(free_lmm, rc = -EINVAL); + + if (LOV_USER_MAGIC != cpu_to_le32(LOV_USER_MAGIC) && + cpu_to_le32(LOV_USER_MAGIC) == cpu_to_le32(lmm->lmm_magic)) { + lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm); + lustre_swab_lov_user_md((struct lov_user_md *)lmm); + } + rc = obd_unpackmd(sbi->ll_osc_exp, &lsm, lmm, lmmsize); if (rc < 0) GOTO(free_lmm, rc = -ENOMEM); @@ -970,6 +1370,7 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, id = qctl->qc_id; switch (cmd) { case LUSTRE_Q_INVALIDATE: + case LUSTRE_Q_FINVALIDATE: case Q_QUOTAON: case Q_QUOTAOFF: case Q_SETQUOTA: diff --git a/lustre/llite/file.c b/lustre/llite/file.c index ddaa5aba2e3d57c5b4299ddbc04e2e5b48074f24..989aaba6a474495153df44b08ea1a6f00b68475c 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -53,6 +53,7 @@ static int ll_close_inode_openhandle(struct inode *inode, struct ptlrpc_request *req = NULL; struct obd_device *obd; struct obdo *oa; + struct mdc_op_data data = { { 0 } }; int rc; ENTRY; @@ -85,8 +86,8 @@ static int ll_close_inode_openhandle(struct inode *inode, oa->o_flags = MDS_BFLAG_UNCOMMITTED_WRITES; oa->o_valid |= OBD_MD_FLFLAGS; } - - rc = mdc_close(ll_i2mdcexp(inode), oa, och, &req); + ll_inode2fid(&data.fid1, inode); + rc = mdc_close(ll_i2mdcexp(inode), &data, oa, och, &req); if (rc == EAGAIN) { /* We are 
the last writer, so the MDS has instructed us to get * the file size and any write cookies, then close again. */ @@ -179,9 +180,10 @@ int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode, int flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK; struct lustre_handle lockh; struct inode *inode = file->f_dentry->d_inode; - struct ldlm_res_id file_res_id = {.name={inode->i_ino, - inode->i_generation}}; + struct ldlm_res_id file_res_id; + ldlm_policy_data_t policy = {.l_inodebits={MDS_INODELOCK_OPEN}}; + fid_build_reg_res_name(ll_inode_lu_fid(inode), &file_res_id); down(&lli->lli_och_sem); if (fd->fd_omode & FMODE_WRITE) { @@ -267,7 +269,7 @@ static int ll_intent_file_open(struct file *file, void *lmm, int lmmsize, struct lookup_intent *itp) { struct ll_sb_info *sbi = ll_i2sbi(file->f_dentry->d_inode); - struct mdc_op_data data; + struct mdc_op_data data = { { 0 } }; struct dentry *parent = file->f_dentry->d_parent; const char *name = file->f_dentry->d_name.name; const int len = file->f_dentry->d_name.len; @@ -508,7 +510,9 @@ restart: would attempt to grab och_sem as well, that would result in a deadlock */ up(&lli->lli_och_sem); + it->it_flags |= O_CHECK_STALE; rc = ll_intent_file_open(file, NULL, 0, it); + it->it_flags &= ~O_CHECK_STALE; if (rc) { ll_file_data_put(fd); GOTO(out_openerr, rc); @@ -600,10 +604,11 @@ int ll_lsm_getattr(struct obd_export *exp, struct lov_stripe_md *lsm, oinfo.oi_md = lsm; oinfo.oi_oa = oa; oa->o_id = lsm->lsm_object_id; + oa->o_gr = lsm->lsm_object_gr; oa->o_mode = S_IFREG; oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ | OBD_MD_FLMTIME | - OBD_MD_FLCTIME; + OBD_MD_FLCTIME | OBD_MD_FLGROUP; set = ptlrpc_prep_set(); if (set == NULL) { @@ -646,8 +651,9 @@ static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock) char name[16]; struct ldlm_lock *lock; struct lov_stripe_md *lsm; - } key = { .name = "lock_to_stripe", .lock = lock, .lsm = lsm }; + } key = { .name = 
KEY_LOCK_TO_STRIPE, .lock = lock, .lsm = lsm }; __u32 stripe, vallen = sizeof(stripe); + struct lov_oinfo *loinfo; int rc; ENTRY; @@ -663,11 +669,11 @@ static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock) LASSERT(stripe < lsm->lsm_stripe_count); check: - if (lsm->lsm_oinfo[stripe]->loi_id != lock->l_resource->lr_name.name[0]|| - lsm->lsm_oinfo[stripe]->loi_gr != lock->l_resource->lr_name.name[1]){ + loinfo = lsm->lsm_oinfo[stripe]; + if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr, + &lock->l_resource->lr_name)) { LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64, - lsm->lsm_oinfo[stripe]->loi_id, - lsm->lsm_oinfo[stripe]->loi_gr); + loinfo->loi_id, loinfo->loi_gr); RETURN(-ELDLM_NO_LOCK_DATA); } @@ -713,17 +719,18 @@ int ll_page_removal_cb(void *data, int discard) ll_teardown_mmaps(mapping, (__u64)page->index << PAGE_CACHE_SHIFT, ((__u64)page->index<<PAGE_CACHE_SHIFT)| - ~PAGE_CACHE_MASK); + ~PAGE_CACHE_MASK); LL_CDEBUG_PAGE(D_PAGE, page, "removing page\n"); + if (!discard && PageWriteback(page)) + wait_on_page_writeback(page); if (!discard && clear_page_dirty_for_io(page)) { - LASSERT(page->mapping); rc = ll_call_writepage(page->mapping->host, page); /* either waiting for io to complete or reacquiring * the lock that the failed writepage released */ lock_page(page); wait_on_page_writeback(page); - if (rc != 0) { + if (rc < 0) { CERROR("writepage inode %lu(%p) of page %p " "failed: %d\n", mapping->host->i_ino, mapping->host, page, rc); @@ -1161,14 +1168,15 @@ static int ll_is_file_contended(struct file *file) static int ll_file_get_tree_lock_iov(struct ll_lock_tree *tree, struct file *file, const struct iovec *iov, unsigned long nr_segs, - loff_t start, loff_t end, int rw) + obd_off start, obd_off end, int rw) { int append; int tree_locked = 0; int rc; struct inode * inode = file->f_dentry->d_inode; + ENTRY; - append = (rw == WRITE) && (file->f_flags & O_APPEND); + append = (rw == OBD_BRW_WRITE) && (file->f_flags & 
O_APPEND); if (append || !ll_is_file_contended(file)) { struct ll_lock_tree_node *node; @@ -1178,7 +1186,7 @@ static int ll_file_get_tree_lock_iov(struct ll_lock_tree *tree, if (file->f_flags & O_NONBLOCK) ast_flags |= LDLM_FL_BLOCK_NOWAIT; node = ll_node_from_inode(inode, start, end, - (rw == WRITE) ? LCK_PW : LCK_PR); + (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR); if (IS_ERR(node)) { rc = PTR_ERR(node); GOTO(out, rc); @@ -1259,6 +1267,133 @@ static int iov_copy_update(unsigned long *nr_segs, const struct iovec **iov_out, return 0; } +static int ll_reget_short_lock(struct page *page, int rw, + obd_off start, obd_off end, + void **cookie) +{ + struct ll_async_page *llap; + struct obd_export *exp; + struct inode *inode = page->mapping->host; + + ENTRY; + + exp = ll_i2obdexp(inode); + if (exp == NULL) + RETURN(0); + + llap = llap_cast_private(page); + if (llap == NULL) + RETURN(0); + + RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd, + &llap->llap_cookie, rw, start, end, + cookie)); +} + +static void ll_release_short_lock(struct inode *inode, obd_off end, + void *cookie, int rw) +{ + struct obd_export *exp; + int rc; + + exp = ll_i2obdexp(inode); + if (exp == NULL) + return; + + rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end, + cookie, rw); + if (rc < 0) + CERROR("unlock failed (%d)\n", rc); +} + +static inline int ll_file_get_fast_lock(struct file *file, + obd_off ppos, obd_off end, + const struct iovec *iov, + unsigned long nr_segs, + void **cookie, int rw) +{ + int rc = 0, seg; + struct page *page; + + ENTRY; + + /* we would like this read request to be lockfree */ + for (seg = 0; seg < nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; + if (ll_region_mapped((unsigned long)iv->iov_base, iv->iov_len)) + GOTO(out, rc); + } + + page = find_lock_page(file->f_dentry->d_inode->i_mapping, + ppos >> CFS_PAGE_SHIFT); + if (page) { + if (ll_reget_short_lock(page, rw, ppos, end, cookie)) + rc = 1; + + unlock_page(page); + 
page_cache_release(page); + } + +out: + RETURN(rc); +} + +static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end, + void *cookie, int rw) +{ + ll_release_short_lock(inode, end, cookie, rw); +} + +enum ll_lock_style { + LL_LOCK_STYLE_NOLOCK = 0, + LL_LOCK_STYLE_FASTLOCK = 1, + LL_LOCK_STYLE_TREELOCK = 2 +}; + +static inline int ll_file_get_lock(struct file *file, obd_off ppos, + obd_off end, const struct iovec *iov, + unsigned long nr_segs, void **cookie, + struct ll_lock_tree *tree, int rw) +{ + int rc; + + ENTRY; + + if (ll_file_get_fast_lock(file, ppos, end, iov, nr_segs, cookie, rw)) + RETURN(LL_LOCK_STYLE_FASTLOCK); + + rc = ll_file_get_tree_lock_iov(tree, file, iov, nr_segs, + ppos, end, rw); + /* rc: 1 for tree lock, 0 for no lock, <0 for error */ + switch (rc) { + case 1: + RETURN(LL_LOCK_STYLE_TREELOCK); + case 0: + RETURN(LL_LOCK_STYLE_NOLOCK); + } + + /* an error happened if we reached this point, rc = -errno here */ + RETURN(rc); +} + +static inline void ll_file_put_lock(struct inode *inode, obd_off end, + enum ll_lock_style lock_style, + void *cookie, struct ll_lock_tree *tree, + int rw) + +{ + switch (lock_style) { + case LL_LOCK_STYLE_TREELOCK: + ll_tree_unlock(tree); + break; + case LL_LOCK_STYLE_FASTLOCK: + ll_file_put_fast_lock(inode, end, cookie, rw); + break; + default: + CERROR("invalid locking style (%d)\n", lock_style); + } +} + #ifdef HAVE_FILE_READV static ssize_t ll_file_readv(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) @@ -1278,13 +1413,14 @@ static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, struct ost_lvb lvb; struct ll_ra_read bead; int ra = 0; - loff_t end; + obd_off end; ssize_t retval, chunk, sum = 0; - int tree_locked; + int lock_style; struct iovec *iov_copy = NULL; unsigned long nrsegs_copy, nrsegs_orig = 0; size_t count, iov_offset = 0; __u64 kms; + void *cookie; ENTRY; count = ll_file_get_iov_count(iov, &nr_segs); @@ -1333,12 +1469,12 @@ static 
ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, RETURN(-EFAULT); RETURN(sum); } + repeat: if (sbi->ll_max_rw_chunk != 0) { /* first, let's know the end of the current stripe */ end = *ppos; - obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END, - (obd_off *)&end); + obd_extent_calc(sbi->ll_osc_exp, lsm, OBD_CALC_STRIPE_END,&end); /* correct, the end is beyond the request */ if (end > *ppos + count - 1) @@ -1372,10 +1508,11 @@ repeat: nrsegs_copy = nr_segs; } - tree_locked = ll_file_get_tree_lock_iov(&tree, file, iov_copy, - nrsegs_copy, *ppos, end, READ); - if (tree_locked < 0) - GOTO(out, retval = tree_locked); + lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end, + iov_copy, nrsegs_copy, &cookie, &tree, + OBD_BRW_READ); + if (lock_style < 0) + GOTO(out, retval = lock_style); ll_inode_size_lock(inode, 1); /* @@ -1406,7 +1543,9 @@ repeat: ll_inode_size_unlock(inode, 1); retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED); if (retval) { - ll_tree_unlock(&tree); + if (lock_style != LL_LOCK_STYLE_NOLOCK) + ll_file_put_lock(inode, end, lock_style, + cookie, &tree, OBD_BRW_READ); goto out; } } else { @@ -1426,7 +1565,7 @@ repeat: inode->i_ino, chunk, *ppos, i_size_read(inode)); /* turn off the kernel's read-ahead */ - if (tree_locked) { + if (lock_style != LL_LOCK_STYLE_NOLOCK) { #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) file->f_ramax = 0; #else @@ -1448,7 +1587,8 @@ repeat: retval = generic_file_aio_read(iocb, iov_copy, nrsegs_copy, *ppos); #endif - ll_tree_unlock(&tree); + ll_file_put_lock(inode, end, lock_style, cookie, + &tree, OBD_BRW_READ); } else { retval = ll_file_lockless_io(file, iov_copy, nrsegs_copy, ppos, READ, chunk); @@ -1524,7 +1664,7 @@ static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n", inode->i_ino, inode->i_generation, inode, count, *ppos); - + SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */ /* POSIX, but surprised 
the VFS doesn't check this already */ @@ -1590,8 +1730,10 @@ repeat: } tree_locked = ll_file_get_tree_lock_iov(&tree, file, iov_copy, - nrsegs_copy, lock_start, - lock_end, WRITE); + nrsegs_copy, + (obd_off)lock_start, + (obd_off)lock_end, + OBD_BRW_WRITE); if (tree_locked < 0) GOTO(out, retval = tree_locked); @@ -1909,16 +2051,25 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, LASSERT(lmm != NULL); LASSERT(lustre_rep_swabbed(req, REPLY_REC_OFF + 1)); + if ((lmm->lmm_magic != cpu_to_le32(LOV_MAGIC)) && + (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_JOIN))) { + GOTO(out, rc = -EPROTO); + } /* * This is coming from the MDS, so is probably in * little endian. We convert it to host endian before * passing it to userspace. */ - if (lmm->lmm_magic == __swab32(LOV_MAGIC)) { - lustre_swab_lov_user_md((struct lov_user_md *)lmm); - lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm); - } else if (lmm->lmm_magic == __swab32(LOV_MAGIC_JOIN)) { - lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm); + if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) { + if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC)) { + lustre_swab_lov_user_md((struct lov_user_md *)lmm); + /* if function called for directory - we should be + * avoid swab not existent lsm objects */ + if (S_ISREG(body->mode)) + lustre_swab_lov_user_md_objects((struct lov_user_md *)lmm); + } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_JOIN)) { + lustre_swab_lov_user_md_join((struct lov_user_md_join *)lmm); + } } if (lmm->lmm_magic == LOV_MAGIC_JOIN) { @@ -2505,7 +2656,8 @@ int ll_fsync(struct file *file, struct dentry *dentry, int data) RETURN(rc ? 
rc : -ENOMEM); oa->o_id = lsm->lsm_object_id; - oa->o_valid = OBD_MD_FLID; + oa->o_gr = lsm->lsm_object_gr; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; obdo_from_inode(oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME); @@ -2523,8 +2675,12 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) { struct inode *inode = file->f_dentry->d_inode; struct ll_sb_info *sbi = ll_i2sbi(inode); + struct lu_fid *fid = ll_inode_lu_fid(inode); struct ldlm_res_id res_id = - { .name = {inode->i_ino, inode->i_generation, LDLM_FLOCK} }; + { .name = { fid_seq(fid), + fid_oid(fid), + fid_ver(fid), + LDLM_FLOCK} }; struct ldlm_enqueue_info einfo = { LDLM_FLOCK, 0, NULL, ldlm_flock_completion_ast, NULL, file_lock }; struct lustre_handle lockh = {0}; @@ -2537,6 +2693,15 @@ int ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock) inode->i_ino, file_lock); ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1); + if (fid_is_igif(fid)) { + /* If this is an IGIF inode, we need to keep the 1.6-style + * flock mapping for compatibility. If it is a proper FID + * then we know any other client accessing it must also be + * accessing it as a FID and can use the CMD-style flock. 
*/ + res_id.name[2] = LDLM_FLOCK; + res_id.name[3] = 0; + } + if (file_lock->fl_flags & FL_FLOCK) { LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK)); /* set missing params for flock() calls */ @@ -2624,7 +2789,7 @@ int ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock) int ll_have_md_lock(struct inode *inode, __u64 bits) { struct lustre_handle lockh; - struct ldlm_res_id res_id = { .name = {0} }; + struct ldlm_res_id res_id; struct obd_device *obddev; ldlm_policy_data_t policy = { .l_inodebits = {bits}}; int flags; @@ -2634,10 +2799,12 @@ int ll_have_md_lock(struct inode *inode, __u64 bits) RETURN(0); obddev = ll_i2mdcexp(inode)->exp_obd; - res_id.name[0] = inode->i_ino; - res_id.name[1] = inode->i_generation; + fid_build_reg_res_name(ll_inode_lu_fid(inode), &res_id); - CDEBUG(D_INFO, "trying to match res "LPU64"\n", res_id.name[0]); + CDEBUG(D_INFO, "trying to match res "LPU64":"LPU64":"LPU64"\n", + res_id.name[0], + res_id.name[1], + res_id.name[2]); flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK; if (ldlm_lock_match(obddev->obd_namespace, flags, &res_id, LDLM_IBITS, @@ -2691,16 +2858,18 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) if (exp->exp_connect_flags & OBD_CONNECT_ATTRFID) { struct lookup_intent oit = { .it_op = IT_GETATTR }; - struct mdc_op_data op_data; + struct mdc_op_data op_data = { { 0 } }; /* Call getattr by fid, so do not provide name at all. 
*/ ll_prepare_mdc_op_data(&op_data, dentry->d_parent->d_inode, dentry->d_inode, NULL, 0, 0, NULL); + oit.it_flags |= O_CHECK_STALE; rc = mdc_intent_lock(exp, &op_data, NULL, 0, /* we are not interested in name based lookup */ &oit, 0, &req, ll_mdc_blocking_ast, 0); + oit.it_flags &= ~O_CHECK_STALE; if (rc < 0) { rc = ll_inode_revalidate_fini(inode, rc); GOTO (out, rc); diff --git a/lustre/llite/llite_close.c b/lustre/llite/llite_close.c index d36f2d6f8e2377f7c42d88cf50a57fe39f3c2f3d..60ae02055cb1c049671090daed5a2308048d621f 100644 --- a/lustre/llite/llite_close.c +++ b/lustre/llite/llite_close.c @@ -122,6 +122,7 @@ static void ll_close_done_writing(struct inode *inode) ldlm_policy_data_t policy = { .l_extent = {0, OBD_OBJECT_EOF } }; struct lustre_handle lockh = { 0 }; struct obdo obdo; + struct mdc_op_data data = { { 0 } }; obd_flag valid; int rc, ast_flags = 0; ENTRY; @@ -166,7 +167,8 @@ static void ll_close_done_writing(struct inode *inode) obdo.o_blocks = inode->i_blocks; obdo.o_valid = OBD_MD_FLID | OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; - rc = mdc_done_writing(ll_i2sbi(inode)->ll_mdc_exp, &obdo); + ll_inode2fid(&data.fid1, inode); + rc = mdc_done_writing(ll_i2sbi(inode)->ll_mdc_exp, &data, &obdo); out: } diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 10fed7c7e0038814059fed1771528942e40e269c..9e6a61c97d725594b9d5a273259e997ac6c8d74b 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -5,7 +5,6 @@ #ifndef LLITE_INTERNAL_H #define LLITE_INTERNAL_H -#include <linux/ext2_fs.h> #ifdef CONFIG_FS_POSIX_ACL # include <linux/fs.h> #ifdef HAVE_XATTR_ACL @@ -48,6 +47,26 @@ static inline struct lookup_intent *ll_nd2it(struct nameidata *nd) } #endif +/* + * Directory entries are currently in the same format as ext2/ext3, but will + * be changed in the future to accomodate FIDs + */ +#define LL_DIR_NAME_LEN (255) +#define LL_DIR_PAD (4) + +struct ll_dir_entry { + /* number of inode, referenced by this entry */ + 
__le32 lde_inode; + /* total record length, multiple of LL_DIR_PAD */ + __le16 lde_rec_len; + /* length of name */ + __u8 lde_name_len; + /* file type: regular, directory, device, etc. */ + __u8 lde_file_type; + /* name. NOT NUL-terminated */ + char lde_name[LL_DIR_NAME_LEN]; +}; + struct ll_dentry_data { int lld_cwd_count; int lld_mnt_count; @@ -110,9 +129,11 @@ struct ll_inode_info { struct obd_client_handle *lli_mds_exec_och; __u64 lli_open_fd_exec_count; -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) - struct inode lli_vfs_inode; -#endif + /** fid of this object. */ + union { + struct lu_fid f20; + struct ll_fid f16; + } lli_fid; /* metadata stat-ahead */ /* @@ -126,6 +147,10 @@ struct ll_inode_info { * before child -- it is me should cleanup the dir readahead. */ void *lli_opendir_key; struct ll_statahead_info *lli_sai; + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) + struct inode lli_vfs_inode; +#endif }; /* @@ -231,6 +256,8 @@ enum stats_track_type { /* default value for ll_sb_info->contention_time */ #define SBI_DEFAULT_CONTENTION_SECONDS 60 +/* default value for lockless_truncate_enable */ +#define SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE 1 struct ll_sb_info { struct list_head ll_list; @@ -260,6 +287,7 @@ struct ll_sb_info { struct list_head ll_pglist; /* all pages (llap_pglist_item) */ unsigned ll_contention_time; /* seconds */ + unsigned ll_lockless_truncate_enable; /* true/false */ struct ll_ra_info ll_ra_info; unsigned int ll_namelen; @@ -501,27 +529,26 @@ extern struct file_operations ll_dir_operations; extern struct inode_operations ll_dir_inode_operations; struct page *ll_get_dir_page(struct inode *dir, unsigned long n); -/* - * p is at least 6 bytes before the end of page - */ -typedef struct ext2_dir_entry_2 ext2_dirent; -static inline ext2_dirent *ext2_next_entry(ext2_dirent *p) +static inline unsigned ll_dir_rec_len(unsigned name_len) { - return (ext2_dirent *)((char*)p + le16_to_cpu(p->rec_len)); + return (name_len + 8 + LL_DIR_PAD - 1) & 
~(LL_DIR_PAD - 1); } -static inline unsigned -ext2_validate_entry(char *base, unsigned offset, unsigned mask) +static inline struct ll_dir_entry *ll_entry_at(void *base, unsigned offset) { - ext2_dirent *de = (ext2_dirent*)(base + offset); - ext2_dirent *p = (ext2_dirent*)(base + (offset&mask)); - while ((char*)p < (char*)de) - p = ext2_next_entry(p); - return (char *)p - base; + return (struct ll_dir_entry *)((char *)base + offset); } -static inline void ext2_put_page(struct page *page) +/* + * p is at least 6 bytes before the end of page + */ +static inline struct ll_dir_entry *ll_dir_next_entry(struct ll_dir_entry *p) +{ + return ll_entry_at(p, le16_to_cpu(p->lde_rec_len)); +} + +static inline void ll_put_page(struct page *page) { kunmap(page); page_cache_release(page); @@ -582,6 +609,7 @@ extern struct file_operations ll_file_operations_noflock; extern struct inode_operations ll_file_inode_operations; extern int ll_inode_revalidate_it(struct dentry *, struct lookup_intent *); extern int ll_have_md_lock(struct inode *inode, __u64 bits); +int ll_region_mapped(unsigned long addr, size_t count); int ll_extent_lock(struct ll_file_data *, struct inode *, struct lov_stripe_md *, int mode, ldlm_policy_data_t *, struct lustre_handle *, int ast_flags); @@ -804,10 +832,21 @@ static inline struct obd_export *ll_i2mdcexp(struct inode *inode) return ll_s2mdcexp(inode->i_sb); } +/** get lu_fid from inode. */ +static inline struct lu_fid *ll_inode_lu_fid(struct inode *inode) +{ + return &ll_i2info(inode)->lli_fid.f20; +} + +/** get ll_fid from inode. 
*/ +static inline struct ll_fid *ll_inode_ll_fid(struct inode *inode) +{ + return &ll_i2info(inode)->lli_fid.f16; +} + static inline void ll_inode2fid(struct ll_fid *fid, struct inode *inode) { - mdc_pack_fid(fid, inode->i_ino, inode->i_generation, - inode->i_mode & S_IFMT); + *fid = *ll_inode_ll_fid(inode); } static inline int ll_mds_max_easize(struct super_block *sb) @@ -889,6 +928,10 @@ int ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup) if (sbi->ll_sa_max == 0) return -ENOTSUPP; + /* temporarily disable dir stat ahead in interoperability mode */ + if (sbi->ll_mdc_exp->exp_connect_flags & OBD_CONNECT_FID) + return -ENOTSUPP; + /* not the same process, don't statahead */ if (lli->lli_opendir_pid != cfs_curproc_pid()) return -EBADF; @@ -960,6 +1003,9 @@ enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd); void ll_iocontrol_unregister(void *magic); +ino_t ll_fid_build_ino(struct ll_sb_info *sbi, + struct ll_fid *fid); + #endif #endif /* LLITE_INTERNAL_H */ diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 54fcd637409ad9de7b7ccb95486fecaa0b91d069..2691a500550642982967a869ca514f4a344f5326 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -76,6 +76,7 @@ static struct ll_sb_info *ll_init_sbi(void) sbi->ll_ra_info.ra_max_read_ahead_whole_pages = SBI_DEFAULT_READAHEAD_WHOLE_MAX; sbi->ll_contention_time = SBI_DEFAULT_CONTENTION_SECONDS; + sbi->ll_lockless_truncate_enable = SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE; INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list); @@ -163,13 +164,14 @@ static int client_common_fill_super(struct super_block *sb, err = lprocfs_register_mountpoint(proc_lustre_fs_root, sb, osc, mdc); if (err < 0) - CERROR("could not register mount in /proc/lustre"); + CERROR("could not register mount in /proc/fs/lustre\n"); } /* indicate the features supported by 
this client */ - data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_IBITS | - OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID | OBD_CONNECT_NODEVOH | - OBD_CONNECT_CANCELSET | OBD_CONNECT_AT; + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_IBITS | + OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID | + OBD_CONNECT_NODEVOH | OBD_CONNECT_CANCELSET | + OBD_CONNECT_AT | OBD_CONNECT_FID; #ifdef HAVE_LRU_RESIZE_SUPPORT if (sbi->ll_flags & LL_SBI_LRU_RESIZE) data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; @@ -209,6 +211,9 @@ static int client_common_fill_super(struct super_block *sb, GOTO(out, err); } sbi->ll_mdc_exp = class_conn2export(&mdc_conn); + err = obd_fid_init(sbi->ll_mdc_exp); + if (err) + GOTO(out_mdc, err); err = obd_statfs(obd, &osfs, cfs_time_current_64() - HZ, 0); if (err) @@ -271,9 +276,11 @@ static int client_common_fill_super(struct super_block *sb, GOTO(out_mdc, err = -ENODEV); } - data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_GRANT | - OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET | OBD_CONNECT_AT; + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_GRANT | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET| + OBD_CONNECT_AT | OBD_CONNECT_FID | + OBD_CONNECT_TRUNCLOCK; if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { /* OBD_CONNECT_CKSUM should always be set, even if checksums are @@ -362,7 +369,8 @@ static int client_common_fill_super(struct super_block *sb, CERROR("cannot mds_connect: rc = %d\n", err); GOTO(out_lock_cn_cb, err); } - CDEBUG(D_SUPER, "rootfid "LPU64"\n", rootfid.id); + CDEBUG(D_SUPER, "rootfid "LPU64":"DFID"\n", rootfid.id, + PFID((struct lu_fid*)&rootfid)); sbi->ll_rootino = rootfid.id; sb->s_op = &lustre_super_operations; @@ -389,7 +397,7 @@ static int client_common_fill_super(struct super_block *sb, } LASSERT(sbi->ll_rootino != 0); - root = ll_iget(sb, sbi->ll_rootino, &md); + root = ll_iget(sb, 
ll_fid_build_ino(sbi, &rootfid), &md); ptlrpc_req_finished(request); @@ -406,8 +414,8 @@ static int client_common_fill_super(struct super_block *sb, } checksum = sbi->ll_flags & LL_SBI_DATA_CHECKSUM; - err = obd_set_info_async(sbi->ll_osc_exp, strlen("checksum"), - "checksum", sizeof(checksum), + err = obd_set_info_async(sbi->ll_osc_exp, sizeof(KEY_CHECKSUM), + KEY_CHECKSUM, sizeof(checksum), &checksum, NULL); /* making vm readahead 0 for 2.4.x. In the case of 2.6.x, @@ -438,6 +446,7 @@ out_osc: obd_disconnect(sbi->ll_osc_exp); sbi->ll_osc_exp = NULL; out_mdc: + obd_fid_fini(sbi->ll_mdc_exp); obd_disconnect(sbi->ll_mdc_exp); sbi->ll_mdc_exp = NULL; out: @@ -453,8 +462,8 @@ int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) *lmmsize = obd_size_diskmd(sbi->ll_osc_exp, NULL); size = sizeof(int); - rc = obd_get_info(sbi->ll_mdc_exp, strlen("max_easize"), "max_easize", - &size, lmmsize); + rc = obd_get_info(sbi->ll_mdc_exp, sizeof(KEY_MAX_EASIZE), + KEY_MAX_EASIZE, &size, lmmsize); if (rc) CERROR("Get max mdsize error rc %d \n", rc); @@ -638,6 +647,7 @@ void client_common_put_super(struct super_block *sb) obd_disconnect(sbi->ll_osc_exp); sbi->ll_osc_exp = NULL; + obd_fid_fini(sbi->ll_mdc_exp); obd_disconnect(sbi->ll_mdc_exp); sbi->ll_mdc_exp = NULL; @@ -910,7 +920,7 @@ static int old_lustre_process_log(struct super_block *sb, char *newprofile, /* Try all connections, but only once. 
*/ rc = obd_set_info_async(obd->obd_self_export, - strlen("init_recov_bk"), "init_recov_bk", + sizeof(KEY_INIT_RECOV_BACKUP), KEY_INIT_RECOV_BACKUP, sizeof(recov_bk), &recov_bk, NULL); if (rc) GOTO(out_cleanup, rc); @@ -987,7 +997,7 @@ out: int ll_fill_super(struct super_block *sb) { - struct lustre_profile *lprof; + struct lustre_profile *lprof = NULL; struct lustre_sb_info *lsi = s2lsi(sb); struct ll_sb_info *sbi; char *osc = NULL, *mdc = NULL; @@ -995,6 +1005,8 @@ int ll_fill_super(struct super_block *sb) struct config_llog_instance cfg = {0, }; char ll_instance[sizeof(sb) * 2 + 1]; int err; + char *save = NULL; + char pseudo[32] = { 0 }; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op: sb %p\n", sb); @@ -1069,6 +1081,30 @@ int ll_fill_super(struct super_block *sb) "exist?\n", profilenm); GOTO(out_free, err = -EINVAL); } + + /* + * The configuration for 1.8 client and 2.0 client are different. + * 2.0 introduces lmv, but 1.8 directly uses mdc. + * Here, we will hack to use proper name for mdc if needed. 
+ */ + { + char *fsname_end; + int namelen; + + save = lprof->lp_mdc; + fsname_end = strrchr(save, '-'); + if (fsname_end) { + namelen = fsname_end - save; + if (strcmp(fsname_end, "-clilmv") == 0) { + strncpy(pseudo, save, namelen); + strcat(pseudo, "-MDT0000-mdc"); + lprof->lp_mdc = pseudo; + CDEBUG(D_INFO, "1.8.x connecting to 2.0: lmv=%s" + " new mdc=%s\n", save, pseudo); + } + } + } + CDEBUG(D_CONFIG, "Found profile %s: mdc=%s osc=%s\n", profilenm, lprof->lp_mdc, lprof->lp_osc); @@ -1088,6 +1124,8 @@ int ll_fill_super(struct super_block *sb) err = client_common_fill_super(sb, mdc, osc); out_free: + if (save && lprof) + lprof->lp_mdc = save; if (mdc) OBD_FREE(mdc, strlen(mdc) + 1); if (osc) @@ -1308,7 +1346,8 @@ static int ll_setattr_do_truncate(struct inode *inode, loff_t new_size) UNLOCK_INODE_MUTEX(inode); UP_WRITE_I_ALLOC_SEM(inode); - if (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK) { + if (sbi->ll_lockless_truncate_enable && + (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK)) { ast_flags = LDLM_FL_BLOCK_GRANTED; rc = obd_match(sbi->ll_osc_exp, lsm, LDLM_EXTENT, &policy, LCK_PW, &ast_flags, inode, &lockh); @@ -1385,7 +1424,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) struct lov_stripe_md *lsm = lli->lli_smd; struct ll_sb_info *sbi = ll_i2sbi(inode); struct ptlrpc_request *request = NULL; - struct mdc_op_data op_data; + struct mdc_op_data op_data = { { 0 } }; struct lustre_md md; int ia_valid = attr->ia_valid; int rc = 0; @@ -1505,7 +1544,8 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) if (oa) { oa->o_id = lsm->lsm_object_id; - oa->o_valid = OBD_MD_FLID; + oa->o_gr = lsm->lsm_object_gr; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; flags = OBD_MD_FLTYPE | OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME | @@ -1529,9 +1569,19 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) int ll_setattr(struct dentry *de, struct iattr *attr) { + int mode; + if ((attr->ia_valid & (ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) == 
(ATTR_CTIME|ATTR_SIZE|ATTR_MODE)) attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + if ((attr->ia_valid & (ATTR_MODE|ATTR_FORCE|ATTR_SIZE)) == + (ATTR_SIZE|ATTR_MODE)) { + mode = de->d_inode->i_mode; + if (((mode & S_ISUID) && (!(attr->ia_mode & S_ISUID))) || + ((mode & S_ISGID) && (mode & S_IXGRP) && + (!(attr->ia_mode & S_ISGID)))) + attr->ia_valid |= ATTR_FORCE; + } return ll_setattr_raw(de->d_inode, attr); } @@ -1682,6 +1732,10 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) struct ll_inode_info *lli = ll_i2info(inode); struct mds_body *body = md->body; struct lov_stripe_md *lsm = md->lsm; + struct ll_sb_info *sbi = ll_i2sbi(inode); + ENTRY; + + CDEBUG(D_INODE, "body->valid = "LPX64"\n", body->valid); LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); if (lsm != NULL) { @@ -1731,8 +1785,10 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) } #endif - if (body->valid & OBD_MD_FLID) - inode->i_ino = body->ino; + inode->i_ino = ll_fid_build_ino(sbi, &body->fid1); + if (body->valid & OBD_MD_FLGENER) + inode->i_generation = body->generation; + if (body->valid & OBD_MD_FLATIME && body->atime > LTIME_S(inode->i_atime)) LTIME_S(inode->i_atime) = body->atime; @@ -1771,8 +1827,7 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) inode->i_flags = ll_ext_to_inode_flags(body->flags); if (body->valid & OBD_MD_FLNLINK) inode->i_nlink = body->nlink; - if (body->valid & OBD_MD_FLGENER) - inode->i_generation = body->generation; + if (body->valid & OBD_MD_FLRDEV) #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) inode->i_rdev = body->rdev; @@ -1793,6 +1848,7 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) if (body->valid & OBD_MD_FLSIZE) set_bit(LLI_F_HAVE_MDS_SIZE_LOCK, &lli->lli_flags); + EXIT; } #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) @@ -1887,13 +1943,13 @@ int ll_iocontrol(struct inode *inode, struct file *file, /* We want to return EXT3_*_FL flags to the caller via this * ioctl. 
An older MDS may be sending S_* flags, fix it up. */ flags = ll_inode_to_ext_flags(body->flags, - body->flags &MDS_BFLAG_EXT_FLAGS); + MDS_BFLAG_EXT_FLAGS); ptlrpc_req_finished (req); RETURN(put_user(flags, (int *)arg)); } case EXT3_IOC_SETFLAGS: { - struct mdc_op_data op_data; + struct mdc_op_data op_data = { { 0 } }; struct ll_iattr_struct attr; struct obd_info oinfo = { { { 0 } } }; struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; @@ -1921,8 +1977,10 @@ int ll_iocontrol(struct inode *inode, struct file *file, } oinfo.oi_oa->o_id = lsm->lsm_object_id; + oinfo.oi_oa->o_gr = lsm->lsm_object_gr; oinfo.oi_oa->o_flags = flags; - oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS; + oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | + OBD_MD_FLFLAGS; obdo_from_inode(oinfo.oi_oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER); @@ -2013,8 +2071,8 @@ int ll_remount_fs(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { read_only = *flags & MS_RDONLY; - err = obd_set_info_async(sbi->ll_mdc_exp, strlen("read-only"), - "read-only", sizeof(read_only), + err = obd_set_info_async(sbi->ll_mdc_exp, sizeof(KEY_READONLY), + KEY_READONLY, sizeof(read_only), &read_only, NULL); if (err) { CERROR("Failed to change the read-only flag during " @@ -2050,7 +2108,10 @@ int ll_prep_inode(struct obd_export *exp, struct inode **inode, ll_update_inode(*inode, &md); } else { LASSERT(sb); - *inode = ll_iget(sb, md.body->ino, &md); + /** hashing VFS inode by FIDs. + * IGIF will be used for for compatibility if needed. 
+ */ + *inode =ll_iget(sb, ll_fid_build_ino(sbi, &md.body->fid1), &md); if (*inode == NULL || is_bad_inode(*inode)) { mdc_free_lustre_md(exp, &md); rc = -ENOMEM; diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c index 275ea3d915dfeaa1b0c37653d2481da6c5839914..09614790305c44a6b04c080c8bfa6ded66c1c1dc 100644 --- a/lustre/llite/llite_mmap.c +++ b/lustre/llite/llite_mmap.c @@ -69,14 +69,8 @@ struct ll_lock_tree_node { int lt_get_mmap_locks(struct ll_lock_tree *tree, unsigned long addr, size_t count); -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, int *type); -#else - -struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, - int unused); -#endif struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start, __u64 end, ldlm_mode_t mode) @@ -312,6 +306,11 @@ static struct vm_area_struct * our_vma(unsigned long addr, size_t count) RETURN(ret); } +int ll_region_mapped(unsigned long addr, size_t count) +{ + return !!our_vma(addr, count); +} + int lt_get_mmap_locks(struct ll_lock_tree *tree, unsigned long addr, size_t count) { @@ -349,29 +348,19 @@ int lt_get_mmap_locks(struct ll_lock_tree *tree, } RETURN(0); } - -/* FIXME: there is a pagefault race goes as follow (only 2.4): - * 1. A user process on node A accesses a portion of a mapped file, - * resulting in a page fault. The pagefault handler invokes the - * ll_nopage function, which reads the page into memory. - * 2. A user process on node B writes to the same portion of the file - * (either via mmap or write()), that cause node A to cancel the - * lock and truncate the page. - * 3. Node A then executes the rest of do_no_page(), entering the - * now-invalid page into the PTEs. +/** + * Page fault handler. * - * Make the whole do_no_page as a hook to cover both the page cache - * and page mapping installing with dlm lock would eliminate this race. 
+ * \param vma - is virtiual area struct related to page fault + * \param address - address when hit fault + * \param type - of fault * - * In 2.6, the truncate_count of address_space can cover this race. + * \return allocated and filled page for address + * \retval NOPAGE_SIGBUS if page not exist on this address + * \retval NOPAGE_OOM not have memory for allocate new page */ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, int *type) -#else -struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, - int type /* unused */) -#endif { struct file *filp = vma->vm_file; struct ll_file_data *fd = LUSTRE_FPRIVATE(filp); @@ -390,7 +379,7 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, if (lli->lli_smd == NULL) { CERROR("No lsm on fault?\n"); - RETURN(NULL); + RETURN(NOPAGE_SIGBUS); } ll_clear_file_contended(inode); @@ -408,7 +397,7 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, rc = ll_extent_lock(fd, inode, lsm, mode, &policy, &lockh, LDLM_FL_CBPENDING | LDLM_FL_NO_LRU); if (rc != 0) - RETURN(NULL); + RETURN(NOPAGE_SIGBUS); if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime) CWARN("binary changed. 
inode %lu\n", inode->i_ino); @@ -469,8 +458,13 @@ struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, vma->vm_flags |= VM_RAND_READ; page = filemap_nopage(vma, address, type); - LL_CDEBUG_PAGE(D_PAGE, page, "got addr %lu type %lx\n", address, - (long)type); + if (page != NOPAGE_SIGBUS && page != NOPAGE_OOM) + LL_CDEBUG_PAGE(D_PAGE, page, "got addr %lu type %lx\n", address, + (long)type); + else + CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n", address, + (long)type); + vma->vm_flags &= ~VM_RAND_READ; vma->vm_flags |= (rand_read | seq_read); @@ -542,7 +536,6 @@ static void ll_vm_close(struct vm_area_struct *vma) } } -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) #ifndef HAVE_FILEMAP_POPULATE static int (*filemap_populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); #endif @@ -557,7 +550,6 @@ static int ll_populate(struct vm_area_struct *area, unsigned long address, rc = filemap_populate(area, address, len, prot, pgoff, 1); RETURN(rc); } -#endif /* return the user space pointer that maps to a file offset via a vma */ static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte) @@ -566,47 +558,6 @@ static inline unsigned long file_to_user(struct vm_area_struct *vma, __u64 byte) } -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -/* [first, last] are the byte offsets affected. 
- * vm_{start, end} are user addresses of the first byte of the mapping and - * the next byte beyond it - * vm_pgoff is the page index of the first byte in the mapping */ -static void teardown_vmas(struct vm_area_struct *vma, __u64 first, - __u64 last) -{ - unsigned long address, len; - for (; vma ; vma = vma->vm_next_share) { - if (last >> CFS_PAGE_SHIFT < vma->vm_pgoff) - continue; - if (first >> CFS_PAGE_SHIFT >= (vma->vm_pgoff + - ((vma->vm_end - vma->vm_start) >> CFS_PAGE_SHIFT))) - continue; - - /* XXX in case of unmap the cow pages of a running file, - * don't unmap these private writeable mapping here! - * though that will break private mappping a little. - * - * the clean way is to check the mapping of every page - * and just unmap the non-cow pages, just like - * unmap_mapping_range() with even_cow=0 in kernel 2.6. - */ - if (!(vma->vm_flags & VM_SHARED) && - (vma->vm_flags & VM_WRITE)) - continue; - - address = max((unsigned long)vma->vm_start, - file_to_user(vma, first)); - len = min((unsigned long)vma->vm_end, - file_to_user(vma, last) + 1) - address; - - VMA_DEBUG(vma, "zapping vma [first="LPU64" last="LPU64" " - "address=%ld len=%ld]\n", first, last, address, len); - LASSERT(len > 0); - ll_zap_page_range(vma, address, len); - } -} -#endif - /* XXX put nice comment here. 
talk about __free_pte -> dirty pages and * nopage's reference passing to the pte */ int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) @@ -615,24 +566,12 @@ int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last) ENTRY; LASSERTF(last > first, "last "LPU64" first "LPU64"\n", last, first); -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) if (mapping_mapped(mapping)) { rc = 0; unmap_mapping_range(mapping, first + CFS_PAGE_SIZE - 1, last - first + 1, 0); } -#else - spin_lock(&mapping->i_shared_lock); - if (mapping->i_mmap != NULL) { - rc = 0; - teardown_vmas(mapping->i_mmap, first, last); - } - if (mapping->i_mmap_shared != NULL) { - rc = 0; - teardown_vmas(mapping->i_mmap_shared, first, last); - } - spin_unlock(&mapping->i_shared_lock); -#endif + RETURN(rc); } @@ -640,9 +579,7 @@ static struct vm_operations_struct ll_file_vm_ops = { .nopage = ll_nopage, .open = ll_vm_open, .close = ll_vm_close, -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) .populate = ll_populate, -#endif }; int ll_file_mmap(struct file * file, struct vm_area_struct * vma) diff --git a/lustre/llite/lloop.c b/lustre/llite/lloop.c index f42fd4a7c0346d6b6e668d0166bfca17d1265063..529965fc031b18f7e113c59d252b8d02b3d925ed 100644 --- a/lustre/llite/lloop.c +++ b/lustre/llite/lloop.c @@ -211,7 +211,9 @@ static int do_bio_filebacked(struct lloop_device *lo, struct bio *bio) oa->o_mode = inode->i_mode; oa->o_id = lsm->lsm_object_id; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE; + oa->o_gr = lsm->lsm_object_gr; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | + OBD_MD_FLMODE | OBD_MD_FLTYPE; obdo_from_inode(oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER); cmd = OBD_BRW_READ; diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 04a7f9061092b45225db4ebdde513be29747fa17..9a735c8e913472e645ccdae2d6648e5e2e53de31 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -344,7 +344,7 @@ static int 
ll_wr_checksum(struct file *file, const char *buffer, else sbi->ll_flags &= ~(LL_SBI_LLITE_CHECKSUM|LL_SBI_DATA_CHECKSUM); - rc = obd_set_info_async(sbi->ll_osc_exp, strlen("checksum"), "checksum", + rc = obd_set_info_async(sbi->ll_osc_exp, sizeof(KEY_CHECKSUM), KEY_CHECKSUM, sizeof(val), &val, NULL); if (rc) CWARN("Failed to set OSC checksum flags: %d\n", rc); @@ -463,6 +463,27 @@ static int ll_wr_contention_time(struct file *file, const char *buffer, count; } +static int ll_rd_lockless_truncate(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + + *eof = 1; + return snprintf(page, count, "%u\n", + ll_s2sbi(sb)->ll_lockless_truncate_enable); +} + +static int ll_wr_lockless_truncate(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct super_block *sb = data; + struct ll_sb_info *sbi = ll_s2sbi(sb); + + return lprocfs_write_helper(buffer, count, + &sbi->ll_lockless_truncate_enable) + ?: count; +} + static int ll_rd_statahead_max(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -534,7 +555,10 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = { { "stats_track_pid", ll_rd_track_pid, ll_wr_track_pid, 0 }, { "stats_track_ppid", ll_rd_track_ppid, ll_wr_track_ppid, 0 }, { "stats_track_gid", ll_rd_track_gid, ll_wr_track_gid, 0 }, - { "contention_seconds", ll_rd_contention_time, ll_wr_contention_time, 0}, + { "contention_seconds", ll_rd_contention_time, + ll_wr_contention_time, 0}, + { "lockless_truncate", ll_rd_lockless_truncate, + ll_wr_lockless_truncate, 0}, { "statahead_max", ll_rd_statahead_max, ll_wr_statahead_max, 0 }, { "statahead_stats", ll_rd_statahead_stats, 0, 0 }, { 0 } @@ -576,6 +600,7 @@ struct llite_file_opcode { /* inode operation */ { LPROC_LL_SETATTR, LPROCFS_TYPE_REGS, "setattr" }, { LPROC_LL_TRUNC, LPROCFS_TYPE_REGS, "truncate" }, + { LPROC_LL_LOCKLESS_TRUNC, LPROCFS_TYPE_REGS, "lockless_truncate" }, { LPROC_LL_FLOCK, 
LPROCFS_TYPE_REGS, "flock" }, #if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) { LPROC_LL_GETATTR, LPROCFS_TYPE_REGS, "getattr" }, diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 2a0733fdd82fa6488d74de26785066356812ad04..0d2e262bd4a1a2ee1a12d05ac0f82c5fe7879a70 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -37,116 +37,97 @@ /* methods */ -/* called from iget{4,5_locked}->find_inode() under inode_lock spinlock */ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -static int ll_test_inode(struct inode *inode, unsigned long ino, void *opaque) -#else -static int ll_test_inode(struct inode *inode, void *opaque) -#endif +int ll_unlock(__u32 mode, struct lustre_handle *lockh) { - static int last_ino, last_gen, last_count; - struct lustre_md *md = opaque; + ENTRY; - if (!(md->body->valid & (OBD_MD_FLGENER | OBD_MD_FLID))) { - CERROR("MDS body missing inum or generation\n"); - return 0; - } + ldlm_lock_decref(lockh, mode); - if (last_ino == md->body->ino && last_gen == md->body->generation && - last_count < 500) { - last_count++; - } else { - if (last_count > 1) - CDEBUG(D_VFSTRACE, "compared %u/%u %u times\n", - last_ino, last_gen, last_count); - last_count = 0; - last_ino = md->body->ino; - last_gen = md->body->generation; - CDEBUG(D_VFSTRACE, - "comparing inode %p ino %lu/%u to body "LPU64"/%u\n", - inode, inode->i_ino, inode->i_generation, - md->body->ino, md->body->generation); - } + RETURN(0); +} -#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) - if (inode->i_ino != md->body->ino) - return 0; -#endif - if (inode->i_generation != md->body->generation) { -#ifdef HAVE_EXPORT___IGET - if (inode->i_state & (I_FREEING | I_CLEAR)) - return 0; - if (inode->i_nlink == 0) - return 0; - - /* add "duplicate" inode into deathrow for destroy */ - spin_lock(&ll_i2sbi(inode)->ll_deathrow_lock); - if (list_empty(&ll_i2info(inode)->lli_dead_list)) { - __iget(inode); - list_add(&ll_i2info(inode)->lli_dead_list, - &ll_i2sbi(inode)->ll_deathrow); - } - 
spin_unlock(&ll_i2sbi(inode)->ll_deathrow_lock); -#endif +/* Get an inode by inode number (already instantiated by the intent lookup). + * Returns inode or NULL + */ - return 0; +static inline __u64 fid_flatten(const struct lu_fid *fid) +{ + return (fid_seq(fid) - 1) * LUSTRE_SEQ_MAX_WIDTH + fid_oid(fid); +} +/* Build inode number on passed @fid */ +ino_t ll_fid_build_ino(struct ll_sb_info *sbi, + struct ll_fid *fid) +{ + ino_t ino; + ENTRY; + + if (fid_is_igif((struct lu_fid*)fid)) { + ino = lu_igif_ino((struct lu_fid*)fid); + RETURN(ino); } - /* Apply the attributes in 'opaque' to this inode */ - if (!(inode->i_state & (I_FREEING | I_CLEAR))) - ll_update_inode(inode, md); - return 1; + /* + * Very stupid and having many downsides inode allocation algorithm + * based on fid. + */ + ino = fid_flatten((struct lu_fid*)fid); + + if (unlikely(ino == 0)) + /* the first result ino is 0xFFC001, so this is rarely used */ + ino = 0xffbcde; + ino = ino | 0x80000000; + RETURN(ino); + } -int ll_unlock(__u32 mode, struct lustre_handle *lockh) +/* called from iget5_locked->find_inode() under inode_lock spinlock */ +static int fid_test_inode(struct inode *inode, void *opaque) { - ENTRY; + struct lustre_md *md = opaque; - ldlm_lock_decref(lockh, mode); + if (unlikely(!(md->body->valid & OBD_MD_FLID))) { + CERROR("MDS body missing FID\n"); + return 0; + } - RETURN(0); + return lu_fid_eq(ll_inode_lu_fid(inode), + (struct lu_fid*)&md->body->fid1); } -/* Get an inode by inode number (already instantiated by the intent lookup). 
- * Returns inode or NULL - */ -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) -int ll_set_inode(struct inode *inode, void *opaque) +static int fid_set_inode(struct inode *inode, void *opaque) { - ll_read_inode2(inode, opaque); + struct lustre_md *md = opaque; + + *ll_inode_lu_fid(inode) = *((struct lu_fid*)&md->body->fid1); return 0; } struct inode *ll_iget(struct super_block *sb, ino_t hash, - struct lustre_md *md) + struct lustre_md *md) { - struct inode *inode; + struct ll_inode_info *lli; + struct inode *inode; + ENTRY; LASSERT(hash != 0); - inode = iget5_locked(sb, hash, ll_test_inode, ll_set_inode, md); + inode = iget5_locked(sb, hash, fid_test_inode, fid_set_inode, md); if (inode) { - if (inode->i_state & I_NEW) + lli = ll_i2info(inode); + if (inode->i_state & I_NEW) { + ll_read_inode2(inode, md); unlock_new_inode(inode); - CDEBUG(D_VFSTRACE, "inode: %lu/%u(%p)\n", inode->i_ino, - inode->i_generation, inode); + } else { + if (!(inode->i_state & (I_FREEING | I_CLEAR))) + ll_update_inode(inode, md); + } + CDEBUG(D_VFSTRACE, "got inode: %lu/%u(%p) for "DFID"\n", + inode->i_ino, inode->i_generation, inode, + PFID(ll_inode_lu_fid(inode))); } - return inode; -} -#else -struct inode *ll_iget(struct super_block *sb, ino_t hash, - struct lustre_md *md) -{ - struct inode *inode; - LASSERT(hash != 0); - inode = iget4(sb, hash, ll_test_inode, md); - if (inode) - CDEBUG(D_VFSTRACE, "inode: %lu/%u(%p)\n", inode->i_ino, - inode->i_generation, inode); - return inode; + RETURN(inode); } -#endif static void ll_drop_negative_dentry(struct inode *dir) { @@ -193,11 +174,14 @@ int ll_mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, case LDLM_CB_CANCELING: { struct inode *inode = ll_inode_from_lock(lock); __u64 bits = lock->l_policy_data.l_inodebits.bits; + struct lu_fid *fid; /* Invalidate all dentries associated with this inode */ if (inode == NULL) break; + fid = ll_inode_lu_fid(inode);; + LASSERT(lock->l_flags & LDLM_FL_CANCELING); if ((bits & 
MDS_INODELOCK_LOOKUP) && ll_have_md_lock(inode, MDS_INODELOCK_LOOKUP)) @@ -208,9 +192,8 @@ int ll_mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, if ((bits & MDS_INODELOCK_OPEN) && ll_have_md_lock(inode, MDS_INODELOCK_OPEN)) bits &= ~MDS_INODELOCK_OPEN; - - if (lock->l_resource->lr_name.name[0] != inode->i_ino || - lock->l_resource->lr_name.name[1] != inode->i_generation) { + + if (!fid_res_name_eq(fid, &lock->l_resource->lr_name)) { LDLM_ERROR(lock, "data mismatch with ino %lu/%u (%p)", inode->i_ino, inode->i_generation, inode); } @@ -267,11 +250,11 @@ int ll_mdc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, int ll_mdc_cancel_unused(struct lustre_handle *conn, struct inode *inode, int flags, void *opaque) { - struct ldlm_res_id res_id = - { .name = {inode->i_ino, inode->i_generation} }; + struct ldlm_res_id res_id; struct obd_device *obddev = class_conn2obd(conn); ENTRY; + fid_build_reg_res_name(ll_inode_lu_fid(inode), &res_id); RETURN(ldlm_cli_cancel_unused(obddev->obd_namespace, &res_id, flags, opaque)); } @@ -544,7 +527,7 @@ static struct dentry *ll_lookup_it(struct inode *parent, struct dentry *dentry, struct lookup_intent *it, int lookup_flags) { struct dentry *save = dentry, *retval; - struct mdc_op_data op_data; + struct mdc_op_data op_data = { { 0 } }; struct it_cb_data icbd; struct ptlrpc_request *req = NULL; struct lookup_intent lookup_it = { .it_op = IT_LOOKUP }; @@ -864,7 +847,7 @@ static int ll_new_node(struct inode *dir, struct qstr *name, struct ptlrpc_request *request = NULL; struct inode *inode = NULL; struct ll_sb_info *sbi = ll_i2sbi(dir); - struct mdc_op_data op_data; + struct mdc_op_data op_data = { { 0 } }; int tgt_len = 0; int err; @@ -999,7 +982,7 @@ static int ll_link_generic(struct inode *src, struct inode *dir, struct qstr *name, struct dentry *dchild) { struct ptlrpc_request *request = NULL; - struct mdc_op_data op_data; + struct mdc_op_data op_data = { { 0 } }; int err; struct ll_sb_info *sbi = 
ll_i2sbi(dir); @@ -1064,7 +1047,7 @@ static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent, struct qstr *name) { struct ptlrpc_request *request = NULL; - struct mdc_op_data op_data = {{0}}; + struct mdc_op_data op_data = { { 0 } }; struct dentry *dentry; int rc; ENTRY; @@ -1149,8 +1132,9 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) GOTO(out_free_memmd, rc = -ENOMEM); oa->o_id = lsm->lsm_object_id; + oa->o_gr = lsm->lsm_object_gr; oa->o_mode = body->mode & S_IFMT; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLTYPE; if (body->valid & OBD_MD_FLCOOKIE) { oa->o_valid |= OBD_MD_FLCOOKIE; @@ -1167,8 +1151,8 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) rc = obd_destroy(ll_i2obdexp(dir), oa, lsm, &oti, ll_i2mdcexp(dir)); OBDO_FREE(oa); if (rc) - CERROR("obd destroy objid "LPX64" error %d\n", - lsm->lsm_object_id, rc); + CERROR("obd destroy objid "LPX64"@"LPX64" error %d\n", + lsm->lsm_object_id, lsm->lsm_object_gr, rc); out_free_memmd: obd_free_memmd(ll_i2obdexp(dir), &lsm); out: @@ -1178,7 +1162,7 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) static int ll_unlink_generic(struct inode * dir, struct qstr *name) { struct ptlrpc_request *request = NULL; - struct mdc_op_data op_data = {{0}}; + struct mdc_op_data op_data = { { 0 } }; int rc; ENTRY; @@ -1211,7 +1195,7 @@ static int ll_rename_generic(struct inode *src, struct qstr *src_name, { struct ptlrpc_request *request = NULL; struct ll_sb_info *sbi = ll_i2sbi(src); - struct mdc_op_data op_data = {{0}}; + struct mdc_op_data op_data = { { 0 } }; int err; ENTRY; @@ -1311,8 +1295,16 @@ static int ll_link(struct dentry *old_dentry, struct inode *dir, static int ll_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - return ll_rename_generic(old_dir, &old_dentry->d_name, new_dir, - &new_dentry->d_name); + int 
err; + err = ll_rename_generic(old_dir, &old_dentry->d_name, new_dir, + &new_dentry->d_name); + if (!err) { +#ifndef HAVE_FS_RENAME_DOES_D_MOVE + if (!S_ISDIR(old_dentry->d_inode->i_mode)) +#endif + d_move(old_dentry, new_dentry); + } + return err; } #endif diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index c48c39e23ecf0d28f5441b49ef1b3ad1680888d0..c3e22e4d3b61b5eab18d984b027b34699bef4566 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -119,8 +119,16 @@ int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock) oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; oinfo.oi_oa = &oa; oa.o_id = lli->lli_smd->lsm_object_id; - oa.o_valid = OBD_MD_FLID; - oa.o_flags = srvlock ? OBD_FL_TRUNCLOCK : 0; + oa.o_gr = lli->lli_smd->lsm_object_gr; + oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + if (srvlock) { + /* set OBD_MD_FLFLAGS in o_valid, only if we + * set OBD_FL_TRUNCLOCK, otherwise ost_punch + * and filter_setattr get confused, see the comment + * in ost_punch */ + oa.o_flags = OBD_FL_TRUNCLOCK; + oa.o_valid |= OBD_MD_FLFLAGS; + } obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE |OBD_MD_FLFID| OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLGENER | @@ -212,6 +220,8 @@ void ll_truncate(struct inode *inode) ll_inode_size_unlock(inode, 0); if (!srvlock) ll_file_punch(inode, new_size, 0); + else + ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LOCKLESS_TRUNC, 1); EXIT; return; @@ -245,7 +255,9 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from, oa.o_mode = inode->i_mode; oa.o_id = lsm->lsm_object_id; - oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE; + oa.o_gr = lsm->lsm_object_gr; + oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | + OBD_MD_FLTYPE | OBD_MD_FLGROUP; obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER); oinfo.oi_oa = &oa; @@ -299,6 +311,14 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from, return rc; } +/** + * make page ready for 
ASYNC write + * \param data - pointer to llap cookie + * \param cmd - is OBD_BRW_* macroses + * + * \retval 0 is page successfully prepared to send + * \retval -EAGAIN is page not need to send + */ static int ll_ap_make_ready(void *data, int cmd) { struct ll_async_page *llap; @@ -308,14 +328,13 @@ static int ll_ap_make_ready(void *data, int cmd) llap = LLAP_FROM_COOKIE(data); page = llap->llap_page; - LASSERTF(!(cmd & OBD_BRW_READ), "cmd %x page %p ino %lu index %lu\n", cmd, page, - page->mapping->host->i_ino, page->index); - /* we're trying to write, but the page is locked.. come back later */ if (TryLockPage(page)) RETURN(-EAGAIN); - LASSERT(!PageWriteback(page)); + LASSERTF(!(cmd & OBD_BRW_READ) || !PageWriteback(page), + "cmd %x page %p ino %lu index %lu fl %lx\n", cmd, page, + page->mapping->host->i_ino, page->index, page->flags); /* if we left PageDirty we might get another writepage call * in the future. list walkers are bright enough @@ -324,16 +343,13 @@ static int ll_ap_make_ready(void *data, int cmd) * we got the page cache list we'd create a lock inversion * with the removepage path which gets the page lock then the * cli lock */ -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) - clear_page_dirty(page); -#else - LASSERTF(!PageWriteback(page),"cmd %x page %p ino %lu index %lu\n", cmd, page, - page->mapping->host->i_ino, page->index); - clear_page_dirty_for_io(page); + if(!clear_page_dirty_for_io(page)) { + unlock_page(page); + RETURN(-EAGAIN); + } /* This actually clears the dirty bit in the radix tree.*/ set_page_writeback(page); -#endif LL_CDEBUG_PAGE(D_PAGE, page, "made ready\n"); page_cache_get(page); @@ -401,7 +417,8 @@ void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa) lsm = ll_i2info(inode)->lli_smd; oa->o_id = lsm->lsm_object_id; - oa->o_valid = OBD_MD_FLID; + oa->o_gr = lsm->lsm_object_gr; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME; if (cmd & OBD_BRW_WRITE) { oa->o_valid |= 
OBD_MD_FLEPOCH; @@ -763,9 +780,8 @@ static int queue_or_sync_write(struct obd_export *exp, struct inode *inode, if (!rc && async_flags & ASYNC_READY) { unlock_page(llap->llap_page); - if (PageWriteback(llap->llap_page)) { + if (PageWriteback(llap->llap_page)) end_page_writeback(llap->llap_page); - } } LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "sync write returned %d\n", rc); @@ -918,6 +934,14 @@ int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) #endif } + /* be carefull about clear WB. + * if WB will cleared after page lock is released - paralel IO can be + * started before ap_make_ready is finished - so we will be have page + * with PG_Writeback set from ->writepage() and completed READ which + * clear this flag */ + if ((cmd & OBD_BRW_WRITE) && PageWriteback(page)) + end_page_writeback(page); + unlock_page(page); if (cmd & OBD_BRW_WRITE) { @@ -925,9 +949,6 @@ int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) ll_try_done_writing(page->mapping->host); } - if (PageWriteback(page)) { - end_page_writeback(page); - } page_cache_release(page); RETURN(ret); @@ -1234,7 +1255,7 @@ stride_pg_count(pgoff_t st_off, unsigned long st_len, unsigned long st_pgs, unsigned long off, unsigned length) { unsigned long cont_len = st_off > off ? st_off - off : 0; - unsigned long stride_len = length + off > st_off ? + __u64 stride_len = length + off > st_off ? 
length + off + 1 - st_off : 0; unsigned long left, pg_count; @@ -1372,13 +1393,14 @@ static int ll_readahead(struct ll_readahead_state *ras, ras->ras_next_readahead = max(end, end + 1); RAS_CDEBUG(ras); } - ria.ria_start = start; - ria.ria_end = end; - /* If stride I/O mode is detected, get stride window*/ - if (stride_io_mode(ras)) { - ria.ria_length = ras->ras_stride_length; - ria.ria_pages = ras->ras_stride_pages; - } + ria.ria_start = start; + ria.ria_end = end; + /* If stride I/O mode is detected, get stride window*/ + if (stride_io_mode(ras)) { + ria.ria_stoff = ras->ras_stride_offset; + ria.ria_length = ras->ras_stride_length; + ria.ria_pages = ras->ras_stride_pages; + } spin_unlock(&ras->ras_lock); if (end == 0) { @@ -1391,7 +1413,7 @@ static int ll_readahead(struct ll_readahead_state *ras, RETURN(0); reserved = ll_ra_count_get(ll_i2sbi(inode), len); - if (reserved < end - start + 1) + if (reserved < len) ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT); CDEBUG(D_READA, "reserved page %lu \n", reserved); @@ -1735,19 +1757,22 @@ int ll_writepage(struct page *page) rc = queue_or_sync_write(exp, inode, llap, CFS_PAGE_SIZE, ASYNC_READY | ASYNC_URGENT); } - if (rc) - page_cache_release(page); -out: if (rc) { - if (!lli->lli_async_rc) - lli->lli_async_rc = rc; /* re-dirty page on error so it retries write */ - if (PageWriteback(page)) { + if (PageWriteback(page)) end_page_writeback(page); - } + /* resend page only for not started IO*/ if (!PageError(page)) ll_redirty_page(page); + + page_cache_release(page); + } +out: + if (rc) { + if (!lli->lli_async_rc) + lli->lli_async_rc = rc; + /* resend page only for not started IO*/ unlock_page(page); } RETURN(rc); diff --git a/lustre/llite/statahead.c b/lustre/llite/statahead.c index 2692b96dd7fb0dde7635db5117ef8982d4b3c3f9..6262dd08a5b4d9ce046a86a191163215bc74f38f 100644 --- a/lustre/llite/statahead.c +++ b/lustre/llite/statahead.c @@ -457,7 +457,7 @@ static inline void ll_name2qstr(struct qstr *this, const char *name, 
int namelen this->hash = end_name_hash(hash); } -static int ll_statahead_one(struct dentry *parent, ext2_dirent *de) +static int ll_statahead_one(struct dentry *parent, struct ll_dir_entry *de) { struct inode *dir = parent->d_inode; struct ll_inode_info *lli = ll_i2info(dir); @@ -483,7 +483,7 @@ static int ll_statahead_one(struct dentry *parent, ext2_dirent *de) if (IS_ERR(se)) RETURN(PTR_ERR(se)); - ll_name2qstr(&name, de->name, de->name_len); + ll_name2qstr(&name, de->lde_name, de->lde_name_len); dentry = d_lookup(parent, &name); if (!dentry) { dentry = d_alloc(parent, &name); @@ -569,7 +569,7 @@ static int ll_statahead_thread(void *arg) struct l_wait_info lwi = { 0 }; unsigned long npages; char *kaddr, *limit; - ext2_dirent *de; + struct ll_dir_entry *de; struct page *page; npages = dir_pages(dir); @@ -590,18 +590,18 @@ static int ll_statahead_thread(void *arg) } kaddr = page_address(page); - limit = kaddr + CFS_PAGE_SIZE - EXT2_DIR_REC_LEN(1); - de = (ext2_dirent *)kaddr; + limit = kaddr + CFS_PAGE_SIZE - ll_dir_rec_len(1); + de = (struct ll_dir_entry *)kaddr; if (!index) { - de = ext2_next_entry(de); /* skip "." */ - de = ext2_next_entry(de); /* skip ".." */ + de = ll_dir_next_entry(de); /* skip "." */ + de = ll_dir_next_entry(de); /* skip ".." */ } - for (; (char*)de <= limit; de = ext2_next_entry(de)) { - if (!de->inode) + for (; (char*)de <= limit; de = ll_dir_next_entry(de)) { + if (!de->lde_inode) continue; - if (de->name[0] == '.' && !sai->sai_ls_all) { + if (de->lde_name[0] == '.' 
&& !sai->sai_ls_all) { /* skip hidden files */ sai->sai_skip_hidden++; continue; @@ -618,17 +618,17 @@ static int ll_statahead_thread(void *arg) &lwi); if (unlikely(sa_check_stop(sai))) { - ext2_put_page(page); + ll_put_page(page); GOTO(out, rc); } rc = ll_statahead_one(parent, de); if (rc < 0) { - ext2_put_page(page); + ll_put_page(page); GOTO(out, rc); } } - ext2_put_page(page); + ll_put_page(page); index++; } EXIT; @@ -698,12 +698,12 @@ enum { static int is_first_dirent(struct inode *dir, struct dentry *dentry) { - struct qstr *d_name = &dentry->d_name; - unsigned long npages, index = 0; - struct page *page; - ext2_dirent *de; - char *kaddr, *limit; - int rc = LS_NONE_FIRST_DE, dot_de; + struct qstr *d_name = &dentry->d_name; + unsigned long npages, index = 0; + struct page *page; + struct ll_dir_entry *de; + char *kaddr, *limit; + int rc = LS_NONE_FIRST_DE, dot_de; ENTRY; while (1) { @@ -724,18 +724,29 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry) } kaddr = page_address(page); - limit = kaddr + CFS_PAGE_SIZE - EXT2_DIR_REC_LEN(1); - de = (ext2_dirent *)kaddr; + limit = kaddr + CFS_PAGE_SIZE - ll_dir_rec_len(1); + de = (struct ll_dir_entry *)kaddr; if (!index) { - de = ext2_next_entry(de); /* skip "." */ - de = ext2_next_entry(de); /* skip ".." */ + if (unlikely(!(de->lde_name_len == 1 && + strncmp(de->lde_name, ".", 1) == 0))) + CWARN("Maybe got bad on-disk dir: %lu\n", + dir->i_ino); + /* skip "." or ingore bad entry */ + de = ll_dir_next_entry(de); + + if (unlikely(!(de->lde_name_len == 2 && + strncmp(de->lde_name, "..", 2) == 0))) + CWARN("Maybe got bad on-disk dir: %lu\n", + dir->i_ino); + /* skip ".." 
or ingore bad entry */ + de = ll_dir_next_entry(de); } - for (; (char*)de <= limit; de = ext2_next_entry(de)) { - if (!de->inode) + for (; (char*)de <= limit; de = ll_dir_next_entry(de)) { + if (!de->lde_inode) continue; - if (de->name[0] == '.') + if (de->lde_name[0] == '.') dot_de = 1; else dot_de = 0; @@ -743,19 +754,19 @@ static int is_first_dirent(struct inode *dir, struct dentry *dentry) if (dot_de && d_name->name[0] != '.') { CDEBUG(D_READA, "%.*s skip hidden file %.*s\n", d_name->len, d_name->name, - de->name_len, de->name); + de->lde_name_len, de->lde_name); continue; } - if (d_name->len == de->name_len && - !strncmp(d_name->name, de->name, d_name->len)) + if (d_name->len == de->lde_name_len && + !strncmp(d_name->name, de->lde_name, d_name->len)) rc = LS_FIRST_DE + dot_de; else rc = LS_NONE_FIRST_DE; - ext2_put_page(page); + ll_put_page(page); RETURN(rc); } - ext2_put_page(page); + ll_put_page(page); index++; } RETURN(rc); @@ -908,7 +919,8 @@ void ll_statahead_exit(struct dentry *dentry, int result) sbi->ll_sa_miss++; sai->sai_miss++; sai->sai_consecutive_miss++; - if (sa_low_hit(sai)) { + if (sa_low_hit(sai) && + sai->sai_thread.t_flags & SVC_RUNNING) { sbi->ll_sa_wrong++; CDEBUG(D_READA, "statahead for dir %.*s hit " "ratio too low: hit/miss %u/%u, " diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index 164ff8d24ffa2aee9e5ee2109ffee16989881392..946576881fb96c6c699a24ead0014111a58ed369 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -224,6 +224,10 @@ int lov_fini_statfs_set(struct lov_request_set *set); /* lov_obd.c */ void lov_fix_desc(struct lov_desc *desc); +void lov_fix_desc_stripe_size(__u64 *val); +void lov_fix_desc_stripe_count(__u32 *val); +void lov_fix_desc_pattern(__u32 *val); +void lov_fix_desc_qos_maxage(__u32 *val); int lov_get_stripecnt(struct lov_obd *lov, __u32 stripe_count); void lov_getref(struct obd_device *obd); void lov_putref(struct obd_device *obd); diff --git a/lustre/lov/lov_obd.c 
b/lustre/lov/lov_obd.c index 0cb06acac08516401c6d55f7b853ab29dc94e5f5..4171d1eb5a126cd661e36887164c08771454b282 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -768,31 +768,48 @@ static void __lov_del_obd(struct obd_device *obd, __u32 index) } } -void lov_fix_desc(struct lov_desc *desc) +void lov_fix_desc_stripe_size(__u64 *val) { - if (desc->ld_default_stripe_size < PTLRPC_MAX_BRW_SIZE) { + if (*val < PTLRPC_MAX_BRW_SIZE) { LCONSOLE_WARN("Increasing default stripe size to min %u\n", PTLRPC_MAX_BRW_SIZE); - desc->ld_default_stripe_size = PTLRPC_MAX_BRW_SIZE; - } else if (desc->ld_default_stripe_size & (LOV_MIN_STRIPE_SIZE - 1)) { - desc->ld_default_stripe_size &= ~(LOV_MIN_STRIPE_SIZE - 1); + *val = PTLRPC_MAX_BRW_SIZE; + } else if (*val & (LOV_MIN_STRIPE_SIZE - 1)) { + *val &= ~(LOV_MIN_STRIPE_SIZE - 1); LCONSOLE_WARN("Changing default stripe size to "LPU64" (a " "multiple of %u)\n", - desc->ld_default_stripe_size,LOV_MIN_STRIPE_SIZE); + *val, LOV_MIN_STRIPE_SIZE); } +} - if (desc->ld_default_stripe_count == 0) - desc->ld_default_stripe_count = 1; +void lov_fix_desc_stripe_count(__u32 *val) +{ + if (*val == 0) + *val = 1; +} +void lov_fix_desc_pattern(__u32 *val) +{ /* from lov_setstripe */ - if ((desc->ld_pattern != 0) && - (desc->ld_pattern != LOV_PATTERN_RAID0)) { - LCONSOLE_WARN("Unknown stripe pattern: %#x\n",desc->ld_pattern); - desc->ld_pattern = 0; + if ((*val != 0) && (*val != LOV_PATTERN_RAID0)) { + LCONSOLE_WARN("Unknown stripe pattern: %#x\n", *val); + *val = 0; } +} + +void lov_fix_desc_qos_maxage(__u32 *val) +{ /* fix qos_maxage */ - if (desc->ld_qos_maxage == 0) - desc->ld_qos_maxage = QOS_DEFAULT_MAXAGE; + if (*val == 0) + *val = QOS_DEFAULT_MAXAGE; +} + +void lov_fix_desc(struct lov_desc *desc) +{ + lov_fix_desc_stripe_size(&desc->ld_default_stripe_size); + lov_fix_desc_stripe_count(&desc->ld_default_stripe_count); + lov_fix_desc_pattern(&desc->ld_pattern); + lov_fix_desc_qos_maxage(&desc->ld_qos_maxage); } static int 
lov_setup(struct obd_device *obd, obd_count len, void *buf) @@ -1427,8 +1444,10 @@ static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo, if (rc) RETURN(rc); - CDEBUG(D_INFO, "objid "LPX64": %ux%u byte stripes\n", - oinfo->oi_md->lsm_object_id, oinfo->oi_md->lsm_stripe_count, + CDEBUG(D_INFO, "objid "LPX64"@"LPX64": %ux%u byte stripes\n", + oinfo->oi_md->lsm_object_id, + oinfo->oi_md->lsm_object_gr, + oinfo->oi_md->lsm_stripe_count, oinfo->oi_md->lsm_stripe_size); list_for_each (pos, &set->set_list) { @@ -1437,9 +1456,9 @@ static int lov_setattr_async(struct obd_export *exp, struct obd_info *oinfo, if (oinfo->oi_oa->o_valid & OBD_MD_FLCOOKIE) oti->oti_logcookies = set->set_cookies + req->rq_stripe; - CDEBUG(D_INFO, "objid "LPX64"[%d] has subobj "LPX64" at idx " - "%u\n", oinfo->oi_oa->o_id, req->rq_stripe, - req->rq_oi.oi_oa->o_id, req->rq_idx); + CDEBUG(D_INFO, "objid "LPX64"@"LPX64"[%d] has subobj "LPX64 + " at idx %u\n", oinfo->oi_oa->o_id, oinfo->oi_oa->o_gr, + req->rq_stripe, req->rq_oi.oi_oa->o_id, req->rq_idx); rc = obd_setattr_async(lov->lov_tgts[req->rq_idx]->ltd_exp, &req->rq_oi, oti, rqset); @@ -2091,6 +2110,7 @@ static int lov_change_cbdata(struct obd_export *exp, continue; } submd.lsm_object_id = loi->loi_id; + submd.lsm_object_gr = loi->loi_gr; submd.lsm_stripe_count = 0; rc = obd_change_cbdata(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, &submd, it, data); @@ -2450,7 +2470,7 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen, lov_getref(obddev); - if (KEY_IS("lock_to_stripe")) { + if (KEY_IS(KEY_LOCK_TO_STRIPE)) { struct { char name[16]; struct ldlm_lock *lock; @@ -2476,8 +2496,7 @@ static int lov_get_info(struct obd_export *exp, __u32 keylen, continue; if (lov->lov_tgts[loi->loi_ost_idx]->ltd_exp == data->lock->l_conn_export && - loi->loi_id == res_id->name[0] && - loi->loi_gr == res_id->name[1]) { + osc_res_name_eq(loi->loi_id, loi->loi_gr, res_id)) { *stripe = i; GOTO(out, rc = 0); } @@ -2508,7 +2527,7 @@ static int 
lov_get_info(struct obd_export *exp, __u32 keylen, for(i = 0; i < lov->desc.ld_tgt_count; i++) { tgt = lov->lov_tgts[i]; - if (obd_uuid_equals(val, &tgt->ltd_uuid)) + if (tgt && obd_uuid_equals(val, &tgt->ltd_uuid)) GOTO(out, rc = i); } } @@ -2548,11 +2567,11 @@ static int lov_set_info_async(struct obd_export *exp, obd_count keylen, incr = sizeof(struct obd_id_info); do_inactive = 1; next_id = 1; - } else if (KEY_IS("checksum")) { + } else if (KEY_IS(KEY_CHECKSUM)) { do_inactive = 1; - } else if (KEY_IS(KEY_MDS_CONN) || KEY_IS("unlinked")) { + } else if (KEY_IS(KEY_MDS_CONN) || KEY_IS(KEY_UNLINKED)) { check_uuid = val ? 1 : 0; - } else if (KEY_IS("evict_by_nid")) { + } else if (KEY_IS(KEY_EVICT_BY_NID)) { /* use defaults: do_inactive = incr = 0; */ @@ -2754,6 +2773,47 @@ void lov_stripe_unlock(struct lov_stripe_md *md) } EXPORT_SYMBOL(lov_stripe_unlock); +static int lov_reget_short_lock(struct obd_export *exp, + struct lov_stripe_md *lsm, + void **res, int rw, + obd_off start, obd_off end, + void **cookie) +{ + struct lov_async_page *l = *res; + obd_off stripe_start, stripe_end = start; + + ENTRY; + + /* ensure we don't cross stripe boundaries */ + lov_extent_calc(exp, lsm, OBD_CALC_STRIPE_END, &stripe_end); + if (stripe_end <= end) + RETURN(0); + + /* map the region limits to the object limits */ + lov_stripe_offset(lsm, start, l->lap_stripe, &stripe_start); + lov_stripe_offset(lsm, end, l->lap_stripe, &stripe_end); + + RETURN(obd_reget_short_lock(exp->exp_obd->u.lov.lov_tgts[lsm-> + lsm_oinfo[l->lap_stripe]->loi_ost_idx]-> + ltd_exp, NULL, &l->lap_sub_cookie, + rw, stripe_start, stripe_end, cookie)); +} + +static int lov_release_short_lock(struct obd_export *exp, + struct lov_stripe_md *lsm, obd_off end, + void *cookie, int rw) +{ + int stripe; + + ENTRY; + + stripe = lov_stripe_number(lsm, end); + + RETURN(obd_release_short_lock(exp->exp_obd->u.lov.lov_tgts[lsm-> + lsm_oinfo[stripe]->loi_ost_idx]-> + ltd_exp, NULL, end, cookie, rw)); +} + struct obd_ops 
lov_obd_ops = { .o_owner = THIS_MODULE, .o_setup = lov_setup, @@ -2776,6 +2836,8 @@ struct obd_ops lov_obd_ops = { .o_brw = lov_brw, .o_brw_async = lov_brw_async, .o_prep_async_page = lov_prep_async_page, + .o_reget_short_lock = lov_reget_short_lock, + .o_release_short_lock = lov_release_short_lock, .o_queue_async_io = lov_queue_async_io, .o_set_async_flags = lov_set_async_flags, .o_queue_group_io = lov_queue_group_io, diff --git a/lustre/lov/lov_pack.c b/lustre/lov/lov_pack.c index ba8663b72f70046a2773b052644a0102250c837d..aea018f268a1e875f8ee3fd0f964494a72d76f1b 100644 --- a/lustre/lov/lov_pack.c +++ b/lustre/lov/lov_pack.c @@ -378,7 +378,7 @@ int lov_setea(struct obd_export *exp, struct lov_stripe_md **lsmp, for (i = 0; i < lump->lmm_stripe_count; i++) { __u32 len = sizeof(last_id); oexp = lov->lov_tgts[lump->lmm_objects[i].l_ost_idx]->ltd_exp; - rc = obd_get_info(oexp, strlen("last_id"), "last_id", + rc = obd_get_info(oexp, sizeof(KEY_LAST_ID), KEY_LAST_ID, &len, &last_id); if (rc) RETURN(rc); diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index 19d06888ec170a6414d0baeb3a13d82ed011cec0..695704da895ecde45fc669a0fe15638075df04e4 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -311,8 +311,8 @@ static int qos_used(struct lov_obd *lov, __u32 index, __u64 *total_wt) lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj >> 10, lov->lov_tgts[i]->ltd_qos.ltq_penalty >> 10, lov->lov_tgts[i]->ltd_qos.ltq_oss->lqo_penalty_per_obj>>10, - lov->lov_tgts[i]->ltd_qos.ltq_oss->lqo_penalty>>10, - lov->lov_tgts[i]->ltd_qos.ltq_weight>>10); + lov->lov_tgts[i]->ltd_qos.ltq_oss->lqo_penalty >> 10, + lov->lov_tgts[i]->ltd_qos.ltq_weight >> 10); #endif } @@ -333,6 +333,7 @@ static int qos_calc_rr(struct lov_obd *lov) RETURN(0); } + /* Do actual allocation. 
*/ down_write(&lov->lov_qos.lq_rw_sem); ost_count = lov->desc.ld_tgt_count; @@ -360,7 +361,7 @@ static int qos_calc_rr(struct lov_obd *lov) int j = 0; for (i = 0; i < ost_count; i++) { if (lov->lov_tgts[i] && - (lov->lov_tgts[i]->ltd_qos.ltq_oss == oss)) { + lov->lov_tgts[i]->ltd_qos.ltq_oss == oss) { /* Evenly space these OSTs across arrayspace */ int next = j * ost_count / oss->lqo_ost_count; while (lov->lov_qos.lq_rr_array[next] != @@ -380,7 +381,7 @@ static int qos_calc_rr(struct lov_obd *lov) if (placed != real_count) { /* This should never happen */ LCONSOLE_ERROR_MSG(0x14e, "Failed to place all OSTs in the " - "round-robin list (%d of %d).\n", + "round-robin list (%d of %d).\n", placed, real_count); for (i = 0; i < ost_count; i++) { LCONSOLE(D_WARNING, "rr #%d ost idx=%d\n", i, @@ -524,7 +525,7 @@ static int alloc_rr(struct lov_obd *lov, int *idx_arr, int *stripe_cnt, down_read(&lov->lov_qos.lq_rw_sem); ost_start_idx_temp = lov->lov_start_idx; -repeat_find : +repeat_find: array_idx = (lov->lov_start_idx + lov->lov_offset_idx) % ost_count; idx_pos = idx_arr; #ifdef QOS_DEBUG @@ -600,7 +601,7 @@ repeat_find: continue; /* Drop slow OSCs if we can, but not for requested start idx */ - if (obd_precreate(lov->lov_tgts[ost_idx]->ltd_exp) > speed && + if ((obd_precreate(lov->lov_tgts[ost_idx]->ltd_exp) > speed) && (i != 0 || speed < 2)) continue; @@ -622,8 +623,9 @@ repeat_find: * * We can only get here if lsm_stripe_count was originally > 1. 
*/ - CERROR("can't lstripe objid "LPX64": have "LPSZ" want %u\n", - lsm->lsm_object_id, idx_pos - idx_arr, lsm->lsm_stripe_count); + CERROR("can't lstripe objid "LPX64": have %d want %u\n", + lsm->lsm_object_id, (int)(idx_pos - idx_arr), + lsm->lsm_stripe_count); RETURN(-EFBIG); } @@ -927,7 +929,6 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) CDEBUG(D_INODE, "stripe %d has size "LPU64"/"LPU64"\n", i, req->rq_oi.oi_oa->o_size, src_oa->o_size); } - } LASSERT(set->set_count == stripes); @@ -943,7 +944,8 @@ int qos_prep_create(struct obd_export *exp, struct lov_request_set *set) out_err: if (newea && rc) obd_free_memmd(exp, &set->set_oi->oi_md); - free_idx_array(idx_arr, idx_cnt); + if (idx_arr) + free_idx_array(idx_arr, idx_cnt); EXIT; return rc; } diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c index 82153b7a46966b3005f44da9bb8fa2b487bfb03a..5ff766142cfb7d573e446c1806a1649ed8e84893 100644 --- a/lustre/lov/lov_request.c +++ b/lustre/lov/lov_request.c @@ -337,6 +337,7 @@ int lov_prep_enqueue_set(struct obd_export *exp, struct obd_info *oinfo, /* XXX LOV STACKING: submd should be from the subobj */ req->rq_oi.oi_md->lsm_object_id = loi->loi_id; + req->rq_oi.oi_md->lsm_object_gr = loi->loi_gr; req->rq_oi.oi_md->lsm_stripe_count = 0; req->rq_oi.oi_md->lsm_oinfo[0]->loi_kms_valid = loi->loi_kms_valid; @@ -447,6 +448,7 @@ int lov_prep_match_set(struct obd_export *exp, struct obd_info *oinfo, /* XXX LOV STACKING: submd should be from the subobj */ req->rq_oi.oi_md->lsm_object_id = loi->loi_id; + req->rq_oi.oi_md->lsm_object_gr = loi->loi_gr; req->rq_oi.oi_md->lsm_stripe_count = 0; lov_set_add_req(req, set); diff --git a/lustre/lov/lproc_lov.c b/lustre/lov/lproc_lov.c index c585feaf96076afc39c4248ab945dd9ea1cc7e76..b486995bd41a7fe408afae2ab999bb4dbebc5804 100644 --- a/lustre/lov/lproc_lov.c +++ b/lustre/lov/lproc_lov.c @@ -60,8 +60,8 @@ static int lov_wr_stripesize(struct file *file, const char *buffer, if (rc) return rc; + 
lov_fix_desc_stripe_size(&val); desc->ld_default_stripe_size = val; - lov_fix_desc(desc); return count; } @@ -92,7 +92,6 @@ static int lov_wr_stripeoffset(struct file *file, const char *buffer, return rc; desc->ld_default_stripe_offset = val; - lov_fix_desc(desc); return count; } @@ -121,8 +120,8 @@ static int lov_wr_stripetype(struct file *file, const char *buffer, if (rc) return rc; + lov_fix_desc_pattern(&val); desc->ld_pattern = val; - lov_fix_desc(desc); return count; } @@ -152,8 +151,8 @@ static int lov_wr_stripecount(struct file *file, const char *buffer, if (rc) return rc; + lov_fix_desc_stripe_count(&val); desc->ld_default_stripe_count = val; - lov_fix_desc(desc); return count; } diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 88f31b8059223a12a975d0f0f6c7dc6575708f59..e22fb972be545cf3348260ed7422c2f4ee26042c 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -60,6 +60,8 @@ #include <linux/ext3_extents.h> #endif +#include "lustre_quota_fmt.h" + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) #define FSFILT_DATA_TRANS_BLOCKS(sb) EXT3_DATA_TRANS_BLOCKS #define FSFILT_DELETE_TRANS_BLOCKS(sb) EXT3_DELETE_TRANS_BLOCKS @@ -607,7 +609,7 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle, lock_24kernel(); rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED, - name, lmm, lmm_size, 0); + name, lmm, lmm_size, XATTR_NO_CTIME); unlock_24kernel(); @@ -1059,9 +1061,13 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base, nex.ee_len = count; err = ext3_ext_insert_extent(handle, base, path, &nex); if (err) { - CERROR("can't insert extent: %d\n", err); - /* XXX: export ext3_free_blocks() */ - /*ext3_free_blocks(handle, inode, nex.ee_start, nex.ee_len, 0);*/ + /* free data blocks we just allocated */ + /* not a good idea to call discard here directly, + * but otherwise we'd need to call it every free() */ +#ifdef EXT3_MB_HINT_GROUP_ALLOC + ext3_mb_discard_inode_preallocations(inode); 
+#endif + ext3_free_blocks(handle, inode, nex.ee_start, nex.ee_len, 0); goto out; } @@ -1398,10 +1404,10 @@ static int fsfilt_ext3_setup(struct super_block *sb) #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,6)) && HAVE_QUOTA_SUPPORT /* enable journaled quota support */ /* kfreed in ext3_put_super() */ - sbi->s_qf_names[USRQUOTA] = kstrdup("lquota.user", GFP_KERNEL); + sbi->s_qf_names[USRQUOTA] = kstrdup("lquota.user.reserved", GFP_KERNEL); if (!sbi->s_qf_names[USRQUOTA]) return -ENOMEM; - sbi->s_qf_names[GRPQUOTA] = kstrdup("lquota.group", GFP_KERNEL); + sbi->s_qf_names[GRPQUOTA] = kstrdup("lquota.group.reserved", GFP_KERNEL); if (!sbi->s_qf_names[GRPQUOTA]) { kfree(sbi->s_qf_names[USRQUOTA]); sbi->s_qf_names[USRQUOTA] = NULL; @@ -1446,8 +1452,6 @@ static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs) return 0; } -static const char *op_quotafile[] = { "lquota.user", "lquota.group" }; - #define DQINFO_COPY(out, in) \ do { \ Q_COPY(out, in, dqi_bgrace); \ @@ -1502,10 +1506,26 @@ static int fsfilt_ext3_quotactl(struct super_block *sb, continue; if (oqc->qc_cmd == Q_QUOTAON) { + lustre_quota_version_t qfmt = oqc->qc_id; + char *name[][MAXQUOTAS] = LUSTRE_OPQFILES_NAMES; + if (!qcop->quota_on) GOTO(out, rc = -ENOSYS); - rc = qcop->quota_on(sb, i, oqc->qc_id, - (char *)op_quotafile[i]); + + rc = qcop->quota_on(sb, i, QFMT_VFS_V0, + name[qfmt][i]); +#ifdef HAVE_QUOTA64 + if (rc == -ENOENT || rc == -EINVAL) { + /* see bug 13904 */ + rc = lustre_slave_quota_convert(qfmt, i); + if (!rc) + rc = qcop->quota_on(sb, i, + QFMT_VFS_V0, + name[qfmt][i]); + else if (rc == -ESTALE) + rc = -ENOENT; + } +#endif } else if (oqc->qc_cmd == Q_QUOTAOFF) { if (!qcop->quota_off) GOTO(out, rc = -ENOSYS); @@ -1546,8 +1566,29 @@ static int fsfilt_ext3_quotactl(struct super_block *sb, GOTO(out, rc = -ENOSYS); qcop->quota_sync(sb, oqc->qc_type); break; + case Q_FINVALIDATE: + CDEBUG(D_WARNING, "invalidating operational quota files\n"); + for (i = 0; i < MAXQUOTAS; i++) { 
+ struct file *fp; + lustre_quota_version_t qfmt = oqc->qc_id; + char *name[][MAXQUOTAS] = LUSTRE_OPQFILES_NAMES; + + if (!Q_TYPESET(oqc, i)) + continue; + + fp = filp_open(name[qfmt][i], O_CREAT | O_TRUNC | O_RDWR, 0644); + if (IS_ERR(fp)) { + rc = PTR_ERR(fp); + CERROR("error invalidating operational quota file" + " %s (rc:%d)\n", name[qfmt][i], rc); + } else { + filp_close(fp, 0); + } + + } + break; default: - CERROR("unsupported quotactl command: %d", oqc->qc_cmd); + CERROR("unsupported quotactl command: %d\n", oqc->qc_cmd); LBUG(); } out: @@ -1653,7 +1694,7 @@ cqget(struct super_block *sb, struct hlist_head *hash, struct list_head *list, return cdqb; } -static inline int quota_onoff(struct super_block *sb, int cmd, int type) +static inline int quota_onoff(struct super_block *sb, int cmd, int type, int qfmt) { struct obd_quotactl *oqctl; int rc; @@ -1663,7 +1704,7 @@ static inline int quota_onoff(struct super_block *sb, int cmd, int type) RETURN(-ENOMEM); oqctl->qc_cmd = cmd; - oqctl->qc_id = QFMT_LDISKFS; + oqctl->qc_id = qfmt; oqctl->qc_type = type; rc = fsfilt_ext3_quotactl(sb, oqctl); @@ -1788,7 +1829,11 @@ static int add_inode_quota(struct inode *inode, struct qchk_ctxt *qctxt, static int v2_write_dqheader(struct file *f, int type) { static const __u32 quota_magics[] = V2_INITQMAGICS; +#ifdef HAVE_QUOTA64 + static const __u32 quota_versions[] = V2_INITQVERSIONS_R0; +#else static const __u32 quota_versions[] = V2_INITQVERSIONS; +#endif struct v2_disk_dqheader dqhead; loff_t offset = 0; @@ -1826,6 +1871,30 @@ static int v2_write_dqinfo(struct file *f, int type, struct if_dqinfo *info) return cfs_user_write(f, (char *)&dqinfo, sizeof(dqinfo), &offset); } +#ifdef HAVE_QUOTA64 +static int v3_write_dqheader(struct file *f, int type) +{ + static const __u32 quota_magics[] = V2_INITQMAGICS; + static const __u32 quota_versions[] = V2_INITQVERSIONS_R1; + struct v2_disk_dqheader dqhead; + loff_t offset = 0; + + CLASSERT(ARRAY_SIZE(quota_magics) == 
ARRAY_SIZE(quota_versions)); + LASSERT(0 <= type && type < ARRAY_SIZE(quota_magics)); + + dqhead.dqh_magic = cpu_to_le32(quota_magics[type]); + dqhead.dqh_version = cpu_to_le32(quota_versions[type]); + + return cfs_user_write(f, (char *)&dqhead, sizeof(dqhead), &offset); +} + +/* write dqinfo struct in a new quota file */ +static int v3_write_dqinfo(struct file *f, int type, struct if_dqinfo *info) +{ + return v2_write_dqinfo(f, type, info); +} +#endif + static int create_new_quota_files(struct qchk_ctxt *qctxt, struct obd_quotactl *oqc) { @@ -1836,32 +1905,50 @@ static int create_new_quota_files(struct qchk_ctxt *qctxt, struct if_dqinfo *info = qctxt->qckt_first_check[i]? NULL : &qctxt->qckt_dqinfo[i]; struct file *file; + const char *name[][MAXQUOTAS] = LUSTRE_OPQFILES_NAMES; + int (*write_dqheader)(struct file *, int); + int (*write_dqinfo)(struct file *, int, struct if_dqinfo *); if (!Q_TYPESET(oqc, i)) continue; - file = filp_open(op_quotafile[i], O_RDWR | O_CREAT | O_TRUNC, - 0644); + file = filp_open(name[oqc->qc_id][i], + O_RDWR | O_CREAT | O_TRUNC, 0644); if (IS_ERR(file)) { rc = PTR_ERR(file); CERROR("can't create %s file: rc = %d\n", - op_quotafile[i], rc); + name[oqc->qc_id][i], rc); GOTO(out, rc); } if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { - CERROR("file %s is not regular", op_quotafile[i]); + CERROR("file %s is not regular", name[oqc->qc_id][i]); filp_close(file, 0); GOTO(out, rc = -EINVAL); } - rc = v2_write_dqheader(file, i); + DQUOT_DROP(file->f_dentry->d_inode); + + switch (oqc->qc_id) { + case LUSTRE_QUOTA_V1 : write_dqheader = v2_write_dqheader; + write_dqinfo = v2_write_dqinfo; + break; +#ifdef HAVE_QUOTA64 + case LUSTRE_QUOTA_V2 : write_dqheader = v3_write_dqheader; + write_dqinfo = v3_write_dqinfo; + break; +#endif + default : CERROR("unknown quota format!\n"); + LBUG(); + } + + rc = (*write_dqheader)(file, i); if (rc) { filp_close(file, 0); GOTO(out, rc); } - rc = v2_write_dqinfo(file, i, info); + rc = (*write_dqinfo)(file, i, info); 
filp_close(file, 0); if (rc) GOTO(out, rc); @@ -1957,12 +2044,12 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb, if (!Q_TYPESET(oqc, i)) continue; - rc = quota_onoff(sb, Q_QUOTAON, i); + rc = quota_onoff(sb, Q_QUOTAON, i, oqc->qc_id); if (!rc || rc == -EBUSY) { rc = read_old_dqinfo(sb, i, qctxt->qckt_dqinfo); if (rc) GOTO(out, rc); - } else if (rc == -ENOENT) { + } else if (rc == -ENOENT || rc == -EINVAL || rc == -EEXIST) { qctxt->qckt_first_check[i] = 1; } else if (rc) { GOTO(out, rc); @@ -2030,14 +2117,14 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb, } #endif /* turn off quota cause we are to dump chk_dqblk to files */ - quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type); + quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type, oqc->qc_id); rc = create_new_quota_files(qctxt, oqc); if (rc) GOTO(out, rc); /* we use vfs functions to set dqblk, so turn quota on */ - rc = quota_onoff(sb, Q_QUOTAON, oqc->qc_type); + rc = quota_onoff(sb, Q_QUOTAON, oqc->qc_type, oqc->qc_id); out: /* dump and free chk_dqblk */ rc = prune_chkquots(sb, qctxt, rc); @@ -2045,7 +2132,7 @@ out: /* turn off quota, `lfs quotacheck` will turn on when all * nodes quotacheck finish. 
*/ - quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type); + quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type, oqc->qc_id); oqc->qc_stat = rc; if (rc) @@ -2083,8 +2170,10 @@ static int fsfilt_ext3_quotainfo(struct lustre_quota_info *lqi, int type, rc = lustre_quota_convert(lqi, type); break; default: - CERROR("Unsupported admin quota file cmd %d\n", cmd); - LBUG(); + rc = -ENOTSUPP; + CERROR("Unsupported admin quota file cmd %d\n" + "Are lquota.ko and fsfilt_ldiskfs.ko modules in sync?\n", + cmd); break; } RETURN(rc); diff --git a/lustre/lvfs/lustre_quota_fmt.h b/lustre/lvfs/lustre_quota_fmt.h index b0638f59a8f0cccee10d5222a32a51ee1a941fb4..2139ae0aa1da3c166304d6372b4483a089e01c12 100644 --- a/lustre/lvfs/lustre_quota_fmt.h +++ b/lustre/lvfs/lustre_quota_fmt.h @@ -183,7 +183,10 @@ int lustre_get_qids(struct file *fp, struct inode *inode, int type, struct list_head *list); -/* come from lustre_quota_fmt_conver.c */ +/* comes from lustre_quota_fmt_convert.c */ +int lustre_slave_quota_convert(lustre_quota_version_t qfmt, int type); int lustre_quota_convert(struct lustre_quota_info *lqi, int type); +#define LUSTRE_OPQFILES_NAMES { { "lquota.user", "lquota.group" }, \ + { "lquota_v2.user", "lquota_v2.group" } } #endif /* lustre_quota_fmt.h */ diff --git a/lustre/lvfs/lustre_quota_fmt_convert.c b/lustre/lvfs/lustre_quota_fmt_convert.c index 70350ed8f2f8f584bfbbe0f179d166a29a95727b..baa37fb7598920dc53c91442c606c9e853d41853 100644 --- a/lustre/lvfs/lustre_quota_fmt_convert.c +++ b/lustre/lvfs/lustre_quota_fmt_convert.c @@ -51,7 +51,7 @@ static int admin_convert_dqinfo(struct file *fp_v1, struct file *fp_v2, return rc; } -static int admin_convert_v1_to_v2(struct file *fp_v1, struct file *fp_v2, +static int quota_convert_v1_to_v2(struct file *fp_v1, struct file *fp_v2, struct lustre_quota_info *lqi, int type) { struct list_head blk_list; @@ -158,7 +158,7 @@ int lustre_quota_convert(struct lustre_quota_info *lqi, int type) f_v1 = filp_open(name, O_RDONLY, 0); if (!IS_ERR(f_v1)) { if 
(!check_quota_file(f_v1, NULL, type, LUSTRE_QUOTA_V1)) { - rc = admin_convert_v1_to_v2(f_v1, f_v2, lqi, type); + rc = quota_convert_v1_to_v2(f_v1, f_v2, lqi, type); if (rc) CERROR("failed to convert v1 quota file" " to v2 quota file.\n"); @@ -181,5 +181,91 @@ int lustre_quota_convert(struct lustre_quota_info *lqi, int type) RETURN(rc); } - EXPORT_SYMBOL(lustre_quota_convert); + +#ifdef HAVE_QUOTA64 +/* + * convert operational quota files to the requested version + * returns: -ESTALE if upgrading to qfmt version is not supported + * -ENOMEM if memory was not allocated for conv. structures + * + * other error codes can be returned by VFS and have the + * appropriate meaning + */ +int lustre_slave_quota_convert(lustre_quota_version_t qfmt, int type) +{ + struct lustre_quota_info *lqi; + struct file *f_v1, *f_v2; + const char *name[][MAXQUOTAS] = LUSTRE_OPQFILES_NAMES; + int rc; + + ENTRY; + + /* we convert only to v2 version */ + if (qfmt != LUSTRE_QUOTA_V2) + GOTO(out, rc = -ESTALE); + + OBD_ALLOC_PTR(lqi); + if (lqi == NULL) + GOTO(out, rc = -ENOMEM); + + /* now that we support only v1 and v2 formats, + * only upgrade from v1 is possible, + * let's check if v1 file exists so that we convert it to v2 */ + f_v1 = filp_open(name[LUSTRE_QUOTA_V1][type], O_RDONLY, 0); + if (IS_ERR(f_v1)) + GOTO(out_free, rc = PTR_ERR(f_v1)); + + /* make sure it is really a v1 file */ + if (check_quota_file(f_v1, NULL, type, LUSTRE_QUOTA_V1)) + GOTO(out_f_v1, rc = -EINVAL); + + /* create new quota file for v2 version, follow the same rationale as + * mds_admin_quota_on: if the file already exists, then do not try to + * overwrite it, user has to fix the quotaon issue manually, + * e.g. 
through running quotacheck */ + f_v2 = filp_open(name[LUSTRE_QUOTA_V2][type], + O_CREAT | O_EXCL | O_TRUNC | O_RDWR, 0644); + if (IS_ERR(f_v2)) + GOTO(out_f_v1, rc = PTR_ERR(f_v2)); + + lqi->qi_version = LUSTRE_QUOTA_V2; + lqi->qi_files[type] = f_v2; + + /* initialize quota file with defaults, marking it invalid, + * this will help us not to get confused with partially converted + * operational quota files if we crash during conversion */ + rc = lustre_init_quota_info_generic(lqi, type, 1); + if (rc) + GOTO(out_f_v2, rc); + + rc = quota_convert_v1_to_v2(f_v1, f_v2, lqi, type); + if (!rc) { + /* we dont want good magic to store before the quota data, + * just to be safe if ldiskfs is running in writeback mode */ + LOCK_INODE_MUTEX(f_v2->f_dentry->d_inode); + rc = lustre_fsync(f_v2); + if (rc) + CERROR("error from fsync, rc=%d\n", rc); + UNLOCK_INODE_MUTEX(f_v2->f_dentry->d_inode); + + /* now that conversion successfully finished we mark + * this operational quota file with the correct magic, + * since this moment quotaon will treat it as a correct + * quota file */ + rc = lustre_init_quota_header(lqi, type, 0); + } + + EXIT; + +out_f_v2: + filp_close(f_v2, 0); +out_f_v1: + filp_close(f_v1, 0); +out_free: + OBD_FREE_PTR(lqi); +out: + return rc; +} +EXPORT_SYMBOL(lustre_slave_quota_convert); +#endif diff --git a/lustre/lvfs/lvfs_lib.c b/lustre/lvfs/lvfs_lib.c index 1a7bd013ffee4e7c796091513ceda2c08c42de2c..978000c90586d75607d2055cf028b5425b12af18 100644 --- a/lustre/lvfs/lvfs_lib.c +++ b/lustre/lvfs/lvfs_lib.c @@ -55,13 +55,13 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n", ptr ? 
"force " :"", type, name, (__u64)size, file, line); - CERROR(LPU64" total bytes and "LPU64" total pages " - "("LPU64" bytes) allocated by Lustre, " - "%d total bytes by LNET\n", - obd_memory_sum(), - obd_pages_sum() << CFS_PAGE_SHIFT, - obd_pages_sum(), - atomic_read(&libcfs_kmemory)); + CERROR(LPU64" total bytes and "LPU64" total pages " + "("LPU64" bytes) allocated by Lustre, " + "%d total bytes by LNET\n", + obd_memory_sum(), + obd_pages_sum() << CFS_PAGE_SHIFT, + obd_pages_sum(), + atomic_read(&libcfs_kmemory)); return 1; } return 0; @@ -82,7 +82,6 @@ void obd_update_maxusage() if (max2 > obd_max_alloc) obd_max_alloc = max2; spin_unlock(&obd_updatemax_lock); - } __u64 obd_memory_max(void) diff --git a/lustre/mdc/Makefile.in b/lustre/mdc/Makefile.in index b9b97935faa4dec8095d5f478478c354d10de453..95dfffb29fb976613bdfbdaa2604adebda530d9a 100644 --- a/lustre/mdc/Makefile.in +++ b/lustre/mdc/Makefile.in @@ -1,4 +1,4 @@ MODULES := mdc -mdc-objs := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o +mdc-objs := mdc_request.o mdc_reint.o lproc_mdc.o mdc_lib.o mdc_locks.o mdc_fid.o @INCLUDE_RULES@ diff --git a/lustre/mdc/autoMakefile.am b/lustre/mdc/autoMakefile.am index e39cc9f4634eeaf7d6d03a4a8c93e6ccd3d50792..193828b352cb0912e895e462cabd252b2aa2954a 100644 --- a/lustre/mdc/autoMakefile.am +++ b/lustre/mdc/autoMakefile.am @@ -5,7 +5,7 @@ if LIBLUSTRE noinst_LIBRARIES = libmdc.a -libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c +libmdc_a_SOURCES = mdc_request.c mdc_reint.c mdc_lib.c mdc_internal.h mdc_locks.c mdc_fid.c libmdc_a_CPPFLAGS = $(LLCPPFLAGS) libmdc_a_CFLAGS = $(LLCFLAGS) endif diff --git a/lustre/mdc/mdc_fid.c b/lustre/mdc/mdc_fid.c new file mode 100644 index 0000000000000000000000000000000000000000..ea003ef4d1adaaec9fef5221c25ebe870d103d44 --- /dev/null +++ b/lustre/mdc/mdc_fid.c @@ -0,0 +1,472 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * 
lustre/mdc/mdc_fid.c + * MDC fid management + * + * Copyright (c) 2006 Cluster File Systems, Inc. + * Author: Yury Umanets <umka@clusterfs.com> + * + * This file is part of the Lustre file system, http://www.lustre.org + * Lustre is a trademark of Cluster File Systems, Inc. + * + * You may have signed or agreed to another license before downloading + * this software. If so, you are bound by the terms and conditions + * of that agreement, and the following does not apply to you. See the + * LICENSE file included with this distribution for more information. + * + * If you did not agree to a different license, then this copy of Lustre + * is open source software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * In either case, Lustre is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty + * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * license text for more details. 
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_FID
+
+#ifdef __KERNEL__
+# include <libcfs/libcfs.h>
+# include <linux/module.h>
+#else /* __KERNEL__ */
+# include <liblustre.h>
+#endif
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include "mdc_internal.h"
+
+typedef __u64 mdsno_t;
+
+/* Record copied into the request and read back from the reply by
+ * fld_client_rpc(). */
+struct md_fld {
+        seqno_t mf_seq;
+        mdsno_t mf_mds;
+};
+
+/* Operation selector stored at REQ_REC_OFF of an FLD_QUERY request. */
+enum fld_op {
+        FLD_CREATE = 0,
+        FLD_DELETE = 1,
+        FLD_LOOKUP = 2
+};
+
+
+/*
+ * Send a SEQ_QUERY request with operation @opc through seq->lcs_exp and
+ * store the range returned by the server in @output.
+ *
+ * @input is copied into the request when it is non-NULL, otherwise a zeroed
+ * range is sent.  @opcname is only used in the debug message.
+ *
+ * @output is written only after the reply buffer has been found and its
+ * range has passed range_is_sane() and !range_is_exhausted(), so a bad reply
+ * never overwrites the range the caller currently holds.
+ *
+ * Returns 0 on success, -ENOMEM if the request cannot be allocated, the
+ * error of ptlrpc_queue_wait(), -EFAULT if the reply has no range buffer,
+ * or -EINVAL if the returned range is insane or exhausted.
+ */
+static int seq_client_rpc(struct lu_client_seq *seq, struct lu_range *input,
+                          struct lu_range *output, __u32 opc,
+                          const char *opcname)
+{
+        int rc, size[3] = { sizeof(struct ptlrpc_body),
+                            sizeof(__u32),
+                            sizeof(struct lu_range) };
+        struct obd_export *exp = seq->lcs_exp;
+        struct ptlrpc_request *req;
+        struct lu_range *out, *in;
+        __u32 *op;
+        ENTRY;
+
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              SEQ_QUERY, 3, size, NULL);
+        if (req == NULL)
+                RETURN(-ENOMEM);
+
+        req->rq_export = class_export_get(exp);
+        op = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(__u32));
+        *op = opc;
+
+        /* Zero out input range, this is not recovery yet. */
+        in = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1,
+                            sizeof(struct lu_range));
+        if (input != NULL)
+                *in = *input;
+        else
+                range_zero(in);
+
+        /* The reply carries the ptlrpc body plus a single range. */
+        size[1] = sizeof(struct lu_range);
+        ptlrpc_req_set_repsize(req, 2, size);
+
+        LASSERT(seq->lcs_type == LUSTRE_SEQ_METADATA);
+        req->rq_request_portal = SEQ_METADATA_PORTAL;
+
+        mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+        rc = ptlrpc_queue_wait(req);
+        mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+
+        if (rc)
+                GOTO(out_req, rc);
+
+        out = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
+                             sizeof(struct lu_range));
+        /* A reply without a range buffer is rejected the same way
+         * fld_client_rpc() rejects a missing reply record. */
+        if (out == NULL)
+                GOTO(out_req, rc = -EFAULT);
+
+        /* Validate the reply in place; @output must stay untouched on
+         * failure because the caller passes its live range here. */
+        /* NOTE(review): two messages below append "]" after DRANGE while
+         * the first one does not -- confirm against the DRANGE definition. */
+        if (!range_is_sane(out)) {
+                CERROR("%s: Invalid range received from server: "
+                       DRANGE"\n", seq->lcs_name, PRANGE(out));
+                GOTO(out_req, rc = -EINVAL);
+        }
+
+        if (range_is_exhausted(out)) {
+                CERROR("%s: Range received from server is exhausted: "
+                       DRANGE"]\n", seq->lcs_name, PRANGE(out));
+                GOTO(out_req, rc = -EINVAL);
+        }
+
+        *output = *out;
+        /* Mirror the granted range into the request buffer; presumably so
+         * that a resent request carries it -- TODO confirm. */
+        *in = *out;
+
+        CDEBUG(D_INFO, "%s: Allocated %s-sequence "DRANGE"]\n",
+               seq->lcs_name, opcname, PRANGE(output));
+
+        EXIT;
+out_req:
+        ptlrpc_req_finished(req);
+        return rc;
+}
+
+
+/*
+ * Send an FLD_QUERY request carrying @fld_op and a copy of *@mf through
+ * seq->lcs_exp; on success *@mf is overwritten with the record found in
+ * the reply.
+ */
+static int fld_client_rpc(struct lu_client_seq *seq,
+                          struct md_fld *mf, __u32 fld_op)
+{
+        int size[3] = { sizeof(struct ptlrpc_body),
+                        sizeof(__u32),
+                        sizeof(struct md_fld) };
+        struct obd_export *exp = seq->lcs_exp;
+        struct ptlrpc_request *req;
+        struct md_fld *pmf;
+        __u32 *op;
+        int rc;
+        ENTRY;
+
+        LASSERT(exp != NULL);
+
+        req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION,
+                              FLD_QUERY, 3, size, NULL);
+        if (req == NULL)
+                RETURN(-ENOMEM);
+
+        req->rq_export = class_export_get(exp);
+        op = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(__u32));
+        *op = fld_op;
+
+        pmf = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1,
+                             sizeof(struct md_fld));
+        *pmf = *mf;
+
+        size[1] = sizeof(struct md_fld);
+        ptlrpc_req_set_repsize(req, 2, size);
+        req->rq_request_portal = FLD_REQUEST_PORTAL;
+
+        
mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+        rc = ptlrpc_queue_wait(req);
+        mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL);
+        if (rc)
+                GOTO(out_req, rc);
+
+        pmf = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF,
+                             sizeof(struct md_fld));
+        if (pmf == NULL)
+                GOTO(out_req, rc = -EFAULT);
+        *mf = *pmf;
+        EXIT;
+out_req:
+        ptlrpc_req_finished(req);
+        return rc;
+}
+
+
+/*
+ * Ask the server for a new meta-sequence: issues seq_client_rpc() with
+ * SEQ_ALLOC_META and no input range, storing the answer in seq->lcs_space.
+ * Returns whatever seq_client_rpc() returns.
+ */
+static int seq_client_alloc_meta(struct lu_client_seq *seq)
+{
+        int result;
+        ENTRY;
+
+        result = seq_client_rpc(seq, NULL, &seq->lcs_space,
+                                SEQ_ALLOC_META, "meta");
+        RETURN(result);
+}
+
+/*
+ * Hand out the next sequence number of seq->lcs_space through @seqnr.
+ *
+ * When the local range is exhausted a new one is fetched first with
+ * seq_client_alloc_meta(); its error, if any, is returned and @seqnr is
+ * left untouched.  Returns 0 on success.
+ */
+static int seq_client_alloc_seq(struct lu_client_seq *seq, seqno_t *seqnr)
+{
+        int rc = 0;
+        ENTRY;
+
+        LASSERT(range_is_sane(&seq->lcs_space));
+
+        if (range_is_exhausted(&seq->lcs_space)) {
+                /* Nothing left locally: refill from the server. */
+                rc = seq_client_alloc_meta(seq);
+                if (rc) {
+                        CERROR("%s: Can't allocate new meta-sequence, "
+                               "rc %d\n", seq->lcs_name, rc);
+                        RETURN(rc);
+                }
+
+                CDEBUG(D_INFO, "%s: New range - "DRANGE"\n",
+                       seq->lcs_name, PRANGE(&seq->lcs_space));
+        }
+
+        LASSERT(!range_is_exhausted(&seq->lcs_space));
+
+        /* Take the first free sequence and shrink the range by one. */
+        *seqnr = seq->lcs_space.lr_start++;
+
+        CDEBUG(D_INFO, "%s: Allocated sequence ["LPX64"]\n", seq->lcs_name,
+               *seqnr);
+
+        RETURN(rc);
+}
+
+/* Allocate new fid on passed client @seq and save it to @fid. 
*/
+/*
+ * Runs under seq->lcs_sem.  Returns 0 when the object id of the current
+ * sequence was simply advanced, 1 when a new sequence had to be started
+ * (the caller then sets up FLD for it), or the error returned by
+ * seq_client_alloc_seq(), in which case @fid is not written.
+ */
+static int seq_client_alloc_fid(struct lu_client_seq *seq, struct lu_fid *fid)
+{
+        int rc;
+        ENTRY;
+
+        LASSERT(seq != NULL);
+        LASSERT(fid != NULL);
+
+        down(&seq->lcs_sem);
+
+        if (!fid_is_zero(&seq->lcs_fid) &&
+            fid_oid(&seq->lcs_fid) < seq->lcs_width) {
+                /* Current sequence still has room: just bump the last
+                 * allocated fid and return it to the caller. */
+                seq->lcs_fid.f_oid += 1;
+                rc = 0;
+        } else {
+                seqno_t next_seq;
+
+                /* No fid handed out yet, or the sequence is used up. */
+                rc = seq_client_alloc_seq(seq, &next_seq);
+                if (rc) {
+                        CERROR("%s: Can't allocate new sequence, "
+                               "rc %d\n", seq->lcs_name, rc);
+                        up(&seq->lcs_sem);
+                        RETURN(rc);
+                }
+
+                CDEBUG(D_INFO, "%s: Switch to sequence "
+                       "[0x%16.16"LPF64"x]\n", seq->lcs_name, next_seq);
+
+                seq->lcs_fid.f_seq = next_seq;
+                seq->lcs_fid.f_oid = LUSTRE_FID_INIT_OID;
+                seq->lcs_fid.f_ver = 0;
+
+                /*
+                 * Tell the caller that a sequence switch happened so that
+                 * it can set up FLD for the new sequence.
+                 */
+                rc = 1;
+        }
+
+        *fid = seq->lcs_fid;
+        up(&seq->lcs_sem);
+
+        CDEBUG(D_INFO, "%s: Allocated FID "DFID"\n", seq->lcs_name, PFID(fid));
+        RETURN(rc);
+}
+
+/*
+ * Finish the current sequence due to disconnect. 
+ * See mdc_import_event()
+ */
+static void seq_client_flush(struct lu_client_seq *seq)
+{
+        LASSERT(seq != NULL);
+
+        /* Reset both the last fid and the sequence range under the lock. */
+        down(&seq->lcs_sem);
+        fid_init(&seq->lcs_fid);
+        range_zero(&seq->lcs_space);
+        up(&seq->lcs_sem);
+}
+
+/*
+ * Issue an FLD_CREATE request for the pair (@seq, @mds) through
+ * fld_client_rpc() and return its result.
+ */
+static int fld_client_create(struct lu_client_seq *lcs,
+                             seqno_t seq, mdsno_t mds)
+{
+        struct md_fld entry;
+        int rc;
+        ENTRY;
+
+        entry.mf_seq = seq;
+        entry.mf_mds = mds;
+
+        CDEBUG(D_INFO, "%s: Create fld entry (seq: "LPX64"; mds: "
+               LPU64") on target 0\n", lcs->lcs_name, seq, mds);
+
+        rc = fld_client_rpc(lcs, &entry, FLD_CREATE);
+        RETURN(rc);
+}
+
+/* Placeholder: nothing is registered, always reports success. */
+static int seq_client_proc_init(struct lu_client_seq *seq)
+{
+        return 0;
+}
+
+/* Placeholder counterpart of seq_client_proc_init(); does nothing. */
+static void seq_client_proc_fini(struct lu_client_seq *seq)
+{
+}
+
+/*
+ * Set up @seq for use over @exp: records @type and @width, initialises the
+ * semaphore, clears the fid and range state, takes a reference on @exp and
+ * names the client "cli-<prefix>".
+ *
+ * Returns the result of seq_client_proc_init(); on failure the client is
+ * torn down again with seq_client_fini().
+ */
+int seq_client_init(struct lu_client_seq *seq,
+                    struct obd_export *exp,
+                    enum lu_cli_type type,
+                    __u64 width,
+                    const char *prefix)
+{
+        int rc;
+        ENTRY;
+
+        LASSERT(seq != NULL);
+        LASSERT(prefix != NULL);
+
+        seq->lcs_exp = exp;
+        seq->lcs_type = type;
+        sema_init(&seq->lcs_sem, 1);
+        seq->lcs_width = width;
+
+        /* Start from a clean fid/range state. */
+        seq_client_flush(seq);
+
+        LASSERT(seq->lcs_exp != NULL);
+        seq->lcs_exp = class_export_get(seq->lcs_exp);
+
+        snprintf(seq->lcs_name, sizeof(seq->lcs_name),
+                 "cli-%s", prefix);
+
+        rc = seq_client_proc_init(seq);
+        if (rc != 0)
+                seq_client_fini(seq);
+        RETURN(rc);
+}
+
+/* Undo seq_client_init(): drop the export reference held by @seq. */
+void seq_client_fini(struct lu_client_seq *seq)
+{
+        ENTRY;
+
+        seq_client_proc_fini(seq);
+        LASSERT(seq->lcs_exp != NULL);
+
+        if (seq->lcs_exp != NULL) {
+                class_export_put(seq->lcs_exp);
+                seq->lcs_exp = NULL;
+        }
+
+        EXIT;
+}
+
+/* Allocate new fid on passed client @seq and save it to @fid. */
+int mdc_fid_alloc(struct lu_client_seq *seq, struct lu_fid *fid)
+{
+        int rc;
+        ENTRY;
+
+        rc = seq_client_alloc_fid(seq, fid);
+        if (rc > 0) {
+                /* Client switches to new sequence, setup FLD. 
*/
+                rc = fld_client_create(seq, fid_seq(fid), 0);
+                if (rc) {
+                        CERROR("Can't create fld entry, rc %d\n", rc);
+                        /* Delete just allocated fid sequence */
+                        seq_client_flush(seq);
+                }
+        }
+        RETURN(rc);
+}
+
+/*
+ * Store @src in @dst with every field converted from CPU to little-endian
+ * byte order.  Asserts that @src is an IGIF fid or has a zero version.
+ */
+void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src)
+{
+        /* Compile-time check that the three fields cover the whole fid. */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(fid_seq(src)) +
+                 sizeof(fid_oid(src)) + sizeof(fid_ver(src)));
+        LASSERTF(fid_is_igif(src) || fid_ver(src) == 0, DFID"\n", PFID(src));
+
+        dst->f_seq = cpu_to_le64(fid_seq(src));
+        dst->f_oid = cpu_to_le32(fid_oid(src));
+        dst->f_ver = cpu_to_le32(fid_ver(src));
+}
+EXPORT_SYMBOL(fid_cpu_to_le);
+
+/*
+ * Store @src in @dst with every field converted from little-endian to CPU
+ * byte order.  Asserts that the converted fid is an IGIF fid or has a zero
+ * version.
+ */
+void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src)
+{
+        /* Compile-time check that the three fields cover the whole fid. */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(fid_seq(src)) +
+                 sizeof(fid_oid(src)) + sizeof(fid_ver(src)));
+
+        dst->f_seq = le64_to_cpu(fid_seq(src));
+        dst->f_oid = le32_to_cpu(fid_oid(src));
+        dst->f_ver = le32_to_cpu(fid_ver(src));
+
+        LASSERTF(fid_is_igif(dst) || fid_ver(dst) == 0, DFID"\n", PFID(dst));
+}
+EXPORT_SYMBOL(fid_le_to_cpu);
+
+/* Copy @src to @dst, converting both bounds from CPU to little-endian. */
+void range_cpu_to_le(struct lu_range *dst, const struct lu_range *src)
+{
+        /* Compile-time check that both bounds cover the whole range. */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(src->lr_start) +
+                 sizeof(src->lr_end));
+        dst->lr_start = cpu_to_le64(src->lr_start);
+        dst->lr_end = cpu_to_le64(src->lr_end);
+}
+EXPORT_SYMBOL(range_cpu_to_le);
+
+/* Copy @src to @dst, converting both bounds from little-endian to CPU. */
+void range_le_to_cpu(struct lu_range *dst, const struct lu_range *src)
+{
+        /* Compile-time check that both bounds cover the whole range. */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(src->lr_start) +
+                 sizeof(src->lr_end));
+        dst->lr_start = le64_to_cpu(src->lr_start);
+        dst->lr_end = le64_to_cpu(src->lr_end);
+}
+EXPORT_SYMBOL(range_le_to_cpu);
+
+/* Copy @src to @dst, converting both bounds from CPU to big-endian. */
+void range_cpu_to_be(struct lu_range *dst, const struct lu_range *src)
+{
+        /* Compile-time check that both bounds cover the whole range. */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(src->lr_start) +
+                 sizeof(src->lr_end));
+        dst->lr_start = cpu_to_be64(src->lr_start);
+        dst->lr_end = 
cpu_to_be64(src->lr_end);
+}
+EXPORT_SYMBOL(range_cpu_to_be);
+
+/* Copy @src to @dst, converting both bounds from big-endian to CPU. */
+void range_be_to_cpu(struct lu_range *dst, const struct lu_range *src)
+{
+        /* Compile-time check that both bounds cover the whole range. */
+        CLASSERT(sizeof(*src) ==
+                 sizeof(src->lr_start) +
+                 sizeof(src->lr_end));
+        dst->lr_start = be64_to_cpu(src->lr_start);
+        dst->lr_end = be64_to_cpu(src->lr_end);
+}
+EXPORT_SYMBOL(range_be_to_cpu);
+
+/**
+ * Fill the (DLM) resource name @name from fid @f and return @name.
+ *
+ * The name is zeroed first; the sequence and object id are always stored,
+ * the version only when @f is not an IGIF fid.
+ */
+struct ldlm_res_id *
+fid_build_reg_res_name(const struct lu_fid *f, struct ldlm_res_id *name)
+{
+        memset(name, 0, sizeof(*name));
+
+        name->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(f);
+        name->name[LUSTRE_RES_ID_OID_OFF] = fid_oid(f);
+        if (!fid_is_igif(f))
+                name->name[LUSTRE_RES_ID_VER_OFF] = fid_ver(f);
+
+        return name;
+}
+EXPORT_SYMBOL(fid_build_reg_res_name);
+
+/**
+ * Return non-zero when resource name @name matches fid @f.
+ *
+ * Sequence and object id must be equal; the version slot is compared as
+ * well unless @f is an IGIF fid.
+ */
+int fid_res_name_eq(const struct lu_fid *f, const struct ldlm_res_id *name)
+{
+        int same;
+
+        same = name->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(f) &&
+               name->name[LUSTRE_RES_ID_OID_OFF] == fid_oid(f);
+        if (fid_is_igif(f))
+                return same;
+
+        return same && name->name[LUSTRE_RES_ID_VER_OFF] == fid_ver(f);
+}
+EXPORT_SYMBOL(fid_res_name_eq);
diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h index 843bcbe6a1bf1c7e204c74b523765f7a58c3a575..2da2bf751c120375eb25d874ade0652bec6c78ed 100644 --- a/lustre/mdc/mdc_internal.h +++ b/lustre/mdc/mdc_internal.h @@ -42,11 +42,23 @@ void mdc_link_pack(struct ptlrpc_request *req, int offset, void mdc_rename_pack(struct ptlrpc_request *req, int offset, struct mdc_op_data *data, const char *old, int oldlen, const char *new, int newlen); -void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa, +void mdc_close_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, + struct obdo *oa, __u64 valid, struct obd_client_handle *och); void mdc_exit_request(struct client_obd *cli); void mdc_enter_request(struct client_obd *cli); +int 
seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + __u64 width, + const char *prefix); + +void seq_client_fini(struct lu_client_seq *seq); + +int mdc_fid_alloc(struct lu_client_seq *seq, struct lu_fid *fid); + struct mdc_open_data { struct obd_client_handle *mod_och; struct ptlrpc_request *mod_open_req; @@ -85,3 +97,13 @@ static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, } EXIT; } + +static inline int mdc_exp_is_2_0_server(struct obd_export *exp) { + LASSERT(exp); + return !!(exp->exp_connect_flags & OBD_CONNECT_FID); +} + +static inline int mdc_req_is_2_0_server(struct ptlrpc_request *req) { + LASSERT(req); + return mdc_exp_is_2_0_server(req->rq_export); +} diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c index bb02005ca60648dd324b43c57deb7572e10aeadc..db2b8b69e4b4de678f5f4878b15f22937869337c 100644 --- a/lustre/mdc/mdc_lib.c +++ b/lustre/mdc/mdc_lib.c @@ -38,11 +38,18 @@ #endif #endif -void mdc_readdir_pack(struct ptlrpc_request *req, int offset, __u64 pg_off, - __u32 size, struct ll_fid *fid) +static void mdc_readdir_pack_18(struct ptlrpc_request *req, int offset, + __u64 pg_off, __u32 size, struct ll_fid *fid) { struct mds_body *b; + ENTRY; + + CLASSERT(sizeof(struct ll_fid) == sizeof(struct lu_fid)); + CLASSERT(sizeof(struct mds_body) == sizeof(struct mdt_body)); + CLASSERT((int)offsetof(struct mds_body, max_cookiesize) == + (int)offsetof(struct mdt_body, max_cookiesize)); + b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b)); b->fsuid = current->fsuid; b->fsgid = current->fsgid; @@ -51,49 +58,111 @@ void mdc_readdir_pack(struct ptlrpc_request *req, int offset, __u64 pg_off, b->size = pg_off; /* !! */ b->suppgid = -1; b->nlink = size; /* !! 
*/ + EXIT; } -static void mdc_pack_body(struct mds_body *b) +static void mdc_readdir_pack_20(struct ptlrpc_request *req, int offset, + __u64 pg_off, __u32 size, struct ll_fid *fid) { - LASSERT (b != NULL); + struct mdt_body *b; + ENTRY; + b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b)); b->fsuid = current->fsuid; b->fsgid = current->fsgid; b->capability = current->cap_effective; + + if (fid) { + b->fid1 = *((struct lu_fid*)fid); + b->valid |= OBD_MD_FLID; + } + b->size = pg_off; /* !! */ + b->suppgid = -1; + b->nlink = size; /* !! */ + EXIT; } -void mdc_pack_req_body(struct ptlrpc_request *req, int offset, - __u64 valid, struct ll_fid *fid, int ea_size, int flags) +void mdc_readdir_pack(struct ptlrpc_request *req, int offset, + __u64 pg_off, __u32 size, struct ll_fid *fid) +{ + if (mdc_req_is_2_0_server(req)) + mdc_readdir_pack_20(req, offset, pg_off, size, fid); + else + mdc_readdir_pack_18(req, offset, pg_off, size, fid); +} + +static void mdc_pack_req_body_18(struct ptlrpc_request *req, int offset, + __u64 valid, struct ll_fid *fid, int ea_size, + int flags) { struct mds_body *b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b)); + ENTRY; + LASSERT (b != NULL); if (fid) b->fid1 = *fid; b->valid = valid; b->eadatasize = ea_size; b->flags = flags; - mdc_pack_body(b); + b->fsuid = current->fsuid; + b->fsgid = current->fsgid; + b->capability = current->cap_effective; + EXIT; +} + +static void mdc_pack_req_body_20(struct ptlrpc_request *req, int offset, + __u64 valid, struct ll_fid *fid, int ea_size, + int flags) +{ + struct mdt_body *b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*b)); + ENTRY; + LASSERT (b != NULL); + + b->valid = valid; + b->eadatasize = ea_size; + b->flags = flags; + if (fid) { + b->fid1 = *((struct lu_fid*)fid); + b->valid |= OBD_MD_FLID; + } + + b->fsuid = current->fsuid; + b->fsgid = current->fsgid; + b->capability = current->cap_effective; + EXIT; +} + +void mdc_pack_req_body(struct ptlrpc_request *req, int offset, + __u64 valid, 
struct ll_fid *fid, int ea_size, + int flags) +{ + if (mdc_req_is_2_0_server(req)) + mdc_pack_req_body_20(req, offset, valid, fid, ea_size, flags); + else + mdc_pack_req_body_18(req, offset, valid, fid, ea_size, flags); } /* packing of MDS records */ -void mdc_create_pack(struct ptlrpc_request *req, int offset, - struct mdc_op_data *op_data, const void *data, int datalen, - __u32 mode, __u32 uid, __u32 gid, __u32 cap_effective, - __u64 rdev) +static void mdc_create_pack_18(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, const void *data, + int datalen, __u32 mode, __u32 uid, __u32 gid, + __u32 cap_effective, __u64 rdev) { struct mds_rec_create *rec; char *tmp; + ENTRY; + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); - rec->cr_opcode = REINT_CREATE; - rec->cr_fsuid = uid; - rec->cr_fsgid = gid; - rec->cr_cap = cap_effective; - rec->cr_fid = op_data->fid1; + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = cap_effective; + rec->cr_fid = op_data->fid1; memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid)); - rec->cr_mode = mode; - rec->cr_rdev = rdev; - rec->cr_time = op_data->mod_time; + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_time = op_data->mod_time; rec->cr_suppgid = op_data->suppgids[0]; tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, op_data->namelen + 1); @@ -103,6 +172,53 @@ void mdc_create_pack(struct ptlrpc_request *req, int offset, tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, datalen); memcpy (tmp, data, datalen); } + EXIT; +} + +static void mdc_create_pack_20(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, const void *data, + int datalen, __u32 mode, __u32 uid, __u32 gid, + __u32 cap_effective, __u64 rdev) +{ + struct mdt_rec_create *rec; + char *tmp; + ENTRY; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + + rec->cr_opcode = REINT_CREATE; + rec->cr_fsuid = uid; + rec->cr_fsgid = gid; + rec->cr_cap = 
cap_effective; + rec->cr_fid1 = *((struct lu_fid*)&op_data->fid1); + rec->cr_fid2 = *((struct lu_fid*)&op_data->fid2); + rec->cr_mode = mode; + rec->cr_rdev = rdev; + rec->cr_time = op_data->mod_time; + rec->cr_suppgid1 = op_data->suppgids[0]; + + /* offset + 1 == capa */ + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, op_data->namelen + 1); + LOGL0(op_data->name, op_data->namelen, tmp); + + if (data) { + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 3, datalen); + memcpy(tmp, data, datalen); + } + EXIT; +} + +void mdc_create_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, const void *data, + int datalen, __u32 mode, __u32 uid, __u32 gid, + __u32 cap_effective, __u64 rdev) +{ + if (mdc_req_is_2_0_server(req)) + mdc_create_pack_20(req, offset, op_data, data, datalen, + mode, uid, gid, cap_effective, rdev); + else + mdc_create_pack_18(req, offset, op_data, data, datalen, + mode, uid, gid, cap_effective, rdev); } static __u32 mds_pack_open_flags(__u32 flags) @@ -133,36 +249,63 @@ static __u32 mds_pack_open_flags(__u32 flags) } /* packing of MDS records */ -void mdc_join_pack(struct ptlrpc_request *req, int offset, - struct mdc_op_data *op_data, __u64 head_size) +static void mdc_join_pack_18(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, __u64 head_size) { struct mds_rec_join *rec; + ENTRY; rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*rec)); LASSERT(rec != NULL); rec->jr_fid = op_data->fid2; rec->jr_headsize = head_size; + EXIT; } -void mdc_open_pack(struct ptlrpc_request *req, int offset, - struct mdc_op_data *op_data, __u32 mode, __u64 rdev, - __u32 flags, const void *lmm, int lmmlen) +static void mdc_join_pack_20(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, __u64 head_size) +{ + struct mdt_rec_join *rec; + ENTRY; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*rec)); + LASSERT(rec != NULL); + rec->jr_fid = *((struct lu_fid*)&op_data->fid2); + rec->jr_headsize = 
head_size; + EXIT; +} + +void mdc_join_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, __u64 head_size) +{ + if (mdc_req_is_2_0_server(req)) + mdc_join_pack_20(req, offset, op_data, head_size); + else + mdc_join_pack_18(req, offset, op_data, head_size); + +} + +static void mdc_open_pack_18(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, __u32 mode, __u64 rdev, + __u32 flags, const void *lmm, int lmmlen) { struct mds_rec_create *rec; char *tmp; + ENTRY; + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); /* XXX do something about time, uid, gid */ - rec->cr_opcode = REINT_OPEN; - rec->cr_fsuid = current->fsuid; - rec->cr_fsgid = current->fsgid; - rec->cr_cap = current->cap_effective; - rec->cr_fid = op_data->fid1; + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = current->fsuid; + rec->cr_fsgid = current->fsgid; + rec->cr_cap = current->cap_effective; + rec->cr_fid = op_data->fid1; memset(&rec->cr_replayfid, 0, sizeof(rec->cr_replayfid)); - rec->cr_mode = mode; - rec->cr_flags = mds_pack_open_flags(flags); - rec->cr_rdev = rdev; - rec->cr_time = op_data->mod_time; + rec->cr_mode = mode; + rec->cr_flags = mds_pack_open_flags(flags); + rec->cr_rdev = rdev; + rec->cr_time = op_data->mod_time; rec->cr_suppgid = op_data->suppgids[0]; if (op_data->name) { @@ -180,6 +323,66 @@ void mdc_open_pack(struct ptlrpc_request *req, int offset, tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, lmmlen); memcpy (tmp, lmm, lmmlen); } + EXIT; +} + +static void mdc_open_pack_20(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, __u32 mode, __u64 rdev, + __u32 flags, const void *lmm, int lmmlen) +{ + struct mdt_rec_create *rec; + char *tmp; + ENTRY; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + + /* XXX do something about time, uid, gid */ + rec->cr_opcode = REINT_OPEN; + rec->cr_fsuid = current->fsuid; + rec->cr_fsgid = current->fsgid; + rec->cr_cap = current->cap_effective; + 
rec->cr_fid1 = *((struct lu_fid*)&op_data->fid1); + rec->cr_fid2 = *((struct lu_fid*)&op_data->fid2); + rec->cr_mode = mode; + rec->cr_flags = mds_pack_open_flags(flags); + rec->cr_rdev = rdev; + rec->cr_time = op_data->mod_time; + rec->cr_suppgid1 = op_data->suppgids[0]; + rec->cr_suppgid2 = op_data->suppgids[1]; + + if (op_data->name) { + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 3, + op_data->namelen + 1); + CDEBUG(D_INFO, "offset=%d, src=%p(%d):%s, dst=%p\n", + offset, op_data->name, op_data->namelen, + op_data->name, tmp); + LASSERT(tmp); + LOGL0(op_data->name, op_data->namelen, tmp); + } + + if (lmm) { + rec->cr_flags |= MDS_OPEN_HAS_EA; +#ifndef __KERNEL__ + /*XXX a hack for liblustre to set EA (LL_IOC_LOV_SETSTRIPE) */ + rec->cr_fid2 = *((struct lu_fid*)&op_data->fid2); +#endif + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 4, lmmlen); + memcpy(tmp, lmm, lmmlen); + } + EXIT; +} + +void mdc_open_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *op_data, __u32 mode, __u64 rdev, + __u32 flags, const void *lmm, int lmmlen) +{ + if (mdc_req_is_2_0_server(req)) + mdc_open_pack_20(req, offset, op_data, mode, rdev, + flags, lmm, lmmlen); + else + mdc_open_pack_18(req, offset, op_data, mode, rdev, + flags, lmm, lmmlen); + } static inline __u64 attr_pack(unsigned int ia_valid) { @@ -221,12 +424,14 @@ static inline __u64 attr_pack(unsigned int ia_valid) { return sa_valid; } -void mdc_setattr_pack(struct ptlrpc_request *req, int offset, - struct mdc_op_data *data, struct iattr *iattr, void *ea, - int ealen, void *ea2, int ea2len) +void mdc_setattr_pack_18(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, struct iattr *iattr, void *ea, + int ealen, void *ea2, int ea2len) { struct mds_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*rec)); + ENTRY; + rec->sa_opcode = REINT_SETATTR; rec->sa_fsuid = current->fsuid; rec->sa_fsgid = current->fsgid; @@ -251,22 +456,86 @@ void mdc_setattr_pack(struct ptlrpc_request *req, 
int offset, rec->sa_suppgid = data->suppgids[0]; } - if (ealen == 0) + if (ealen == 0) { + EXIT; return; + } memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 1, ealen), ea, ealen); - if (ea2len == 0) + if (ea2len == 0) { + EXIT; return; - + } memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 2, ea2len), ea2, ea2len); + + EXIT; } -void mdc_unlink_pack(struct ptlrpc_request *req, int offset, - struct mdc_op_data *data) +static void mdc_setattr_pack_20(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, struct iattr *iattr, + void *ea, int ealen, void *ea2, int ea2len) +{ + struct mdt_rec_setattr *rec = lustre_msg_buf(req->rq_reqmsg, offset, + sizeof(*rec)); + ENTRY; + + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = current->fsuid; + rec->sa_fsgid = current->fsgid; + rec->sa_cap = current->cap_effective; + rec->sa_fid = *((struct lu_fid*)&data->fid1); + rec->sa_suppgid = -1; + + if (iattr) { + rec->sa_valid = attr_pack(iattr->ia_valid); + rec->sa_mode = iattr->ia_mode; + rec->sa_uid = iattr->ia_uid; + rec->sa_gid = iattr->ia_gid; + rec->sa_size = iattr->ia_size; +// rec->sa_blocks = iattr->ia_blocks; + rec->sa_atime = LTIME_S(iattr->ia_atime); + rec->sa_mtime = LTIME_S(iattr->ia_mtime); + rec->sa_ctime = LTIME_S(iattr->ia_ctime); + rec->sa_attr_flags = + ((struct ll_iattr_struct *)iattr)->ia_attr_flags; + if ((iattr->ia_valid & ATTR_GID) && in_group_p(iattr->ia_gid)) + rec->sa_suppgid = iattr->ia_gid; + else + rec->sa_suppgid = data->suppgids[0]; + } + if (ealen == 0) { + EXIT; + return; + } + memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 3, ealen), ea, ealen); + + if (ea2len == 0) { + EXIT; + return; + } + memcpy(lustre_msg_buf(req->rq_reqmsg, offset + 4, ea2len), ea2, ea2len); + EXIT; +} + +void mdc_setattr_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, struct iattr *iattr, + void *ea, int ealen, void *ea2, int ea2len) +{ + if (mdc_req_is_2_0_server(req)) + mdc_setattr_pack_20(req, offset, data, iattr, + ea, ealen, ea2, 
ea2len); + else + mdc_setattr_pack_18(req, offset, data, iattr, + ea, ealen, ea2, ea2len); +} + +static void mdc_unlink_pack_18(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data) { struct mds_rec_unlink *rec; char *tmp; + ENTRY; rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); LASSERT (rec != NULL); @@ -284,13 +553,51 @@ void mdc_unlink_pack(struct ptlrpc_request *req, int offset, tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, data->namelen + 1); LASSERT (tmp != NULL); LOGL0(data->name, data->namelen, tmp); + EXIT; } -void mdc_link_pack(struct ptlrpc_request *req, int offset, - struct mdc_op_data *data) +static void mdc_unlink_pack_20(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data) +{ + struct mdt_rec_unlink *rec; + char *tmp; + ENTRY; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + LASSERT (rec != NULL); + + rec->ul_opcode = REINT_UNLINK; + rec->ul_fsuid = current->fsuid; + rec->ul_fsgid = current->fsgid; + rec->ul_cap = current->cap_effective; + rec->ul_mode = data->create_mode; + rec->ul_suppgid1= data->suppgids[0]; + rec->ul_fid1 = *((struct lu_fid*)&data->fid1); + rec->ul_fid2 = *((struct lu_fid*)&data->fid2); + rec->ul_time = data->mod_time; + + /* NULL capa is skipped. 
*/ + + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, data->namelen + 1); + LASSERT (tmp != NULL); + LOGL0(data->name, data->namelen, tmp); + EXIT; +} + +void mdc_unlink_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data) +{ + if (mdc_req_is_2_0_server(req)) + mdc_unlink_pack_20(req, offset, data); + else + mdc_unlink_pack_18(req, offset, data); +} +static void mdc_link_pack_18(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data) { struct mds_rec_link *rec; char *tmp; + ENTRY; rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); @@ -306,14 +613,53 @@ void mdc_link_pack(struct ptlrpc_request *req, int offset, tmp = lustre_msg_buf(req->rq_reqmsg, offset + 1, data->namelen + 1); LOGL0(data->name, data->namelen, tmp); + EXIT; } -void mdc_rename_pack(struct ptlrpc_request *req, int offset, - struct mdc_op_data *data, - const char *old, int oldlen, const char *new, int newlen) +static void mdc_link_pack_20(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data) +{ + struct mdt_rec_link *rec; + char *tmp; + ENTRY; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + + rec->lk_opcode = REINT_LINK; + rec->lk_fsuid = current->fsuid; + rec->lk_fsgid = current->fsgid; + rec->lk_cap = current->cap_effective; + rec->lk_suppgid1 = data->suppgids[0]; + rec->lk_suppgid2 = data->suppgids[1]; + rec->lk_fid1 = *((struct lu_fid*)&data->fid1); + rec->lk_fid2 = *((struct lu_fid*)&data->fid2); + rec->lk_time = data->mod_time; + + + /* capa @ offset + 1; */ + /* capa @ offset + 2; */ + + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 3, data->namelen + 1); + LOGL0(data->name, data->namelen, tmp); + EXIT; +} + +void mdc_link_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data) +{ + if (mdc_req_is_2_0_server(req)) + mdc_link_pack_20(req, offset, data); + else + mdc_link_pack_18(req, offset, data); +} + +static void mdc_rename_pack_18(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, 
const char *old, + int oldlen, const char *new, int newlen) { struct mds_rec_rename *rec; char *tmp; + ENTRY; rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); @@ -335,12 +681,61 @@ void mdc_rename_pack(struct ptlrpc_request *req, int offset, tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, newlen + 1); LOGL0(new, newlen, tmp); } + EXIT; +} + +static void mdc_rename_pack_20(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, const char *old, + int oldlen, const char *new, int newlen) +{ + struct mdt_rec_rename *rec; + char *tmp; + ENTRY; + + rec = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*rec)); + + /* XXX do something about time, uid, gid */ + rec->rn_opcode = REINT_RENAME; + rec->rn_fsuid = current->fsuid; + rec->rn_fsgid = current->fsgid; + rec->rn_cap = current->cap_effective; + rec->rn_suppgid1 = data->suppgids[0]; + rec->rn_suppgid2 = data->suppgids[1]; + rec->rn_fid1 = *((struct lu_fid*)&data->fid1); + rec->rn_fid2 = *((struct lu_fid*)&data->fid2); + rec->rn_time = data->mod_time; + rec->rn_mode = data->create_mode; + + + /* skip capa @ offset + 1 */ + /* skip capa @ offset + 2 */ + + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 3, oldlen + 1); + LOGL0(old, oldlen, tmp); + + if (new) { + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 4, newlen + 1); + LOGL0(new, newlen, tmp); + } + EXIT; +} + +void mdc_rename_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, const char *old, + int oldlen, const char *new, int newlen) +{ + if (mdc_req_is_2_0_server(req)) + mdc_rename_pack_20(req, offset, data, old, oldlen, new, newlen); + else + mdc_rename_pack_18(req, offset, data, old, oldlen, new, newlen); } -void mdc_getattr_pack(struct ptlrpc_request *req, int offset, __u64 valid, - int flags, struct mdc_op_data *data) +static void mdc_getattr_pack_18(struct ptlrpc_request *req, int offset, + __u64 valid, int flags, struct mdc_op_data *data) { struct mds_body *b; + ENTRY; + b = lustre_msg_buf(req->rq_reqmsg, offset, 
sizeof(*b)); b->fsuid = current->fsuid; @@ -364,16 +759,56 @@ void mdc_getattr_pack(struct ptlrpc_request *req, int offset, __u64 valid, memcpy(tmp, data->name, data->namelen); data->name = tmp; } + EXIT; +} + +static void mdc_getattr_pack_20(struct ptlrpc_request *req, int offset, + __u64 valid, int flags, struct mdc_op_data *data) +{ + struct mdt_body *b; + ENTRY; + + b = lustre_msg_buf(req->rq_reqmsg, offset, sizeof (*b)); + + b->fsuid = current->fsuid; + b->fsgid = current->fsgid; + b->capability = current->cap_effective; + b->valid = valid; + b->flags = flags | MDS_BFLAG_EXT_FLAGS; + b->suppgid = data->suppgids[0]; + + b->fid1 = *((struct lu_fid*)&data->fid1); + b->fid2 = *((struct lu_fid*)&data->fid2); + b->valid |= OBD_MD_FLID; + if (data->name) { + char *tmp; + tmp = lustre_msg_buf(req->rq_reqmsg, offset + 2, + data->namelen + 1); + LASSERT(tmp); + LOGL0(data->name, data->namelen, tmp); + } + EXIT; } -void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa, - __u64 valid, struct obd_client_handle *och) +void mdc_getattr_pack(struct ptlrpc_request *req, int offset, + __u64 valid, int flags, struct mdc_op_data *data) +{ + if (mdc_req_is_2_0_server(req)) + mdc_getattr_pack_20(req, offset, valid, flags, data); + else + mdc_getattr_pack_18(req, offset, valid, flags, data); +} +static void mdc_close_pack_18(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, + struct obdo *oa, __u64 valid, + struct obd_client_handle *och) { struct mds_body *body; + ENTRY; body = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*body)); - mdc_pack_fid(&body->fid1, oa->o_id, 0, oa->o_mode); + body->fid1 = data->fid1; memcpy(&body->handle, &och->och_fh, sizeof(body->handle)); if (oa->o_valid & OBD_MD_FLATIME) { body->atime = oa->o_atime; @@ -399,8 +834,72 @@ void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa, body->flags = oa->o_flags; body->valid |= OBD_MD_FLFLAGS; } + EXIT; } +static void mdc_close_pack_20(struct 
ptlrpc_request *req, int offset, + struct mdc_op_data *data, + struct obdo *oa, __u64 valid, + struct obd_client_handle *och) +{ + struct mdt_epoch *epoch; + struct mdt_rec_setattr *rec; + ENTRY; + + epoch = lustre_msg_buf(req->rq_reqmsg, offset, sizeof(*epoch)); + rec = lustre_msg_buf(req->rq_reqmsg, offset + 1, sizeof(*rec)); + + rec->sa_opcode = REINT_SETATTR; + rec->sa_fsuid = current->fsuid; + rec->sa_fsgid = current->fsgid; + rec->sa_cap = current->cap_effective; + rec->sa_suppgid = -1; + + rec->sa_fid = *((struct lu_fid*)&data->fid1); + + if (oa->o_valid & OBD_MD_FLATIME) { + rec->sa_atime = oa->o_atime; + rec->sa_valid |= MDS_ATTR_ATIME; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + rec->sa_mtime = oa->o_mtime; + rec->sa_valid |= MDS_ATTR_MTIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + rec->sa_ctime = oa->o_ctime; + rec->sa_valid |= MDS_ATTR_CTIME; + } + if (oa->o_valid & OBD_MD_FLSIZE) { + rec->sa_size = oa->o_size; + rec->sa_valid |= MDS_ATTR_SIZE; + } + if (oa->o_valid & OBD_MD_FLBLOCKS) { + rec->sa_blocks = oa->o_blocks; + rec->sa_valid |= MDS_ATTR_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLFLAGS) { + rec->sa_attr_flags = oa->o_flags; + rec->sa_valid |= MDS_ATTR_ATTR_FLAG; + } + + epoch->handle = och->och_fh; + epoch->ioepoch = 0; + epoch->flags = 0; + + EXIT; +} + + +void mdc_close_pack(struct ptlrpc_request *req, int offset, + struct mdc_op_data *data, + struct obdo *oa, __u64 valid, + struct obd_client_handle *och) +{ + if (mdc_req_is_2_0_server(req)) + mdc_close_pack_20(req, offset, data, oa, valid, och); + else + mdc_close_pack_18(req, offset, data, oa, valid, och); +} struct mdc_cache_waiter { struct list_head mcw_entry; wait_queue_head_t mcw_waitq; diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index 91d923d9e8acbcdbd88a23ceec9ddf5839a11011..3438deb5ef809e628ee926005acceddde7c68ef7 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -134,12 +134,10 @@ EXPORT_SYMBOL(mdc_set_lock_data); int mdc_change_cbdata(struct 
obd_export *exp, struct ll_fid *fid, ldlm_iterator_t it, void *data) { - struct ldlm_res_id res_id = { .name = {0} }; + struct ldlm_res_id res_id; ENTRY; - res_id.name[0] = fid->id; - res_id.name[1] = fid->generation; - + fid_build_reg_res_name((struct lu_fid*)fid, &res_id); ldlm_resource_iterate(class_exp2obd(exp)->obd_namespace, &res_id, it, data); @@ -197,7 +195,7 @@ static void mdc_realloc_openmsg(struct ptlrpc_request *req, OBD_ALLOC(new_msg, new_size); if (new_msg != NULL) { - DEBUG_REQ(D_INFO, req, "replace reqmsg for larger EA %u\n", + DEBUG_REQ(D_INFO, req, "replace reqmsg for larger EA %u", body->eadatasize); memcpy(new_msg, old_msg, old_size); @@ -222,7 +220,7 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, struct ptlrpc_request *req; struct ldlm_intent *lit; struct obd_device *obddev = class_exp2obd(exp); - int size[7] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + int size[9] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREQ_OFF] = sizeof(struct ldlm_request), [DLM_INTENT_IT_OFF] = sizeof(*lit), [DLM_INTENT_REC_OFF] = sizeof(struct mds_rec_create), @@ -236,7 +234,7 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, * default-sized LOV EA for open replay. */ [DLM_INTENT_REC_OFF+2]= max(lmmsize, obddev->u.cli.cl_default_mds_easize) }; - int repsize[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + int repsize[7] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREPLY_OFF] = sizeof(struct ldlm_reply), [DLM_REPLY_REC_OFF] = sizeof(struct mds_body), [DLM_REPLY_REC_OFF+1] = obddev->u.cli. 
@@ -245,18 +243,31 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, CFS_LIST_HEAD(cancels); int do_join = (it->it_flags & O_JOIN_FILE) && data->data; int count = 0; + int bufcount = 6; + int repbufcount = 5; int mode; int rc; + ENTRY; - it->it_create_mode |= S_IFREG; - - rc = lustre_msg_size(class_exp2cliimp(exp)->imp_msg_magic, 6, size); + it->it_create_mode = (it->it_create_mode & ~S_IFMT) | S_IFREG; + if (mdc_exp_is_2_0_server(exp)) { + size[DLM_INTENT_REC_OFF] = sizeof(struct mdt_rec_create); + size[DLM_INTENT_REC_OFF+4] = size[DLM_INTENT_REC_OFF+2]; + size[DLM_INTENT_REC_OFF+3] = size[DLM_INTENT_REC_OFF+1]; + size[DLM_INTENT_REC_OFF+2] = 0; /* capa */ + size[DLM_INTENT_REC_OFF+1] = 0; /* capa */ + bufcount = 8; + repsize[DLM_REPLY_REC_OFF+3]=sizeof(struct lustre_capa); + repsize[DLM_REPLY_REC_OFF+4]=sizeof(struct lustre_capa); + repbufcount = 7; + } + rc = lustre_msg_size(class_exp2cliimp(exp)->imp_msg_magic, + bufcount, size); if (rc & (rc - 1)) - size[DLM_INTENT_REC_OFF + 2] = - min(size[DLM_INTENT_REC_OFF + 2] + round_up(rc) - rc, - obddev->u.cli.cl_max_mds_easize); + size[bufcount - 1] = min(size[bufcount - 1] + round_up(rc) - rc, + obddev->u.cli.cl_max_mds_easize); - /* If inode is known, cancel conflicting OPEN locks. */ + /* If inode is known, cancel conflicting OPEN locks. 
*/ if (data->fid2.id) { if (it->it_flags & (FMODE_WRITE|MDS_OPEN_TRUNC)) mode = LCK_CW; @@ -279,14 +290,19 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, MDS_INODELOCK_UPDATE); if (do_join) { __u64 head_size = (*(__u64 *)data->data); - /* join is like an unlink of the tail */ - size[DLM_INTENT_REC_OFF + 3] = sizeof(struct mds_rec_join); - req = ldlm_prep_enqueue_req(exp, 7, size, &cancels, count); + /* join is like an unlink of the tail */ + if (mdc_exp_is_2_0_server(exp)) { + size[DLM_INTENT_REC_OFF+5]=sizeof(struct mdt_rec_join); + } else { + size[DLM_INTENT_REC_OFF+3]=sizeof(struct mds_rec_join); + } + bufcount++; + + req = ldlm_prep_enqueue_req(exp, bufcount, size,&cancels,count); if (req) - mdc_join_pack(req, DLM_INTENT_REC_OFF + 3, data, - head_size); + mdc_join_pack(req, bufcount - 1, data, head_size); } else { - req = ldlm_prep_enqueue_req(exp, 6, size, &cancels, count); + req = ldlm_prep_enqueue_req(exp, bufcount, size,&cancels,count); it->it_flags &= ~O_JOIN_FILE; } @@ -305,9 +321,9 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, it->it_create_mode, 0, it->it_flags, lmm, lmmsize); - ptlrpc_req_set_repsize(req, 5, repsize); + ptlrpc_req_set_repsize(req, repbufcount, repsize); } - return req; + RETURN(req); } static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, @@ -320,7 +336,9 @@ static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, int size[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREQ_OFF] = sizeof(struct ldlm_request), [DLM_INTENT_IT_OFF] = sizeof(*lit), - [DLM_INTENT_REC_OFF] = sizeof(struct mds_rec_unlink), + [DLM_INTENT_REC_OFF] = mdc_exp_is_2_0_server(exp) ? 
+ sizeof(struct mdt_rec_unlink) : + sizeof(struct mds_rec_unlink), [DLM_INTENT_REC_OFF+1]= data->namelen + 1 }; int repsize[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREPLY_OFF] = sizeof(struct ldlm_reply), @@ -329,6 +347,7 @@ static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, cl_max_mds_easize, [DLM_REPLY_REC_OFF+2] = obddev->u.cli. cl_max_mds_cookiesize }; + ENTRY; req = ldlm_prep_enqueue_req(exp, 5, size, NULL, 0); if (req) { @@ -342,7 +361,7 @@ static struct ptlrpc_request *mdc_intent_unlink_pack(struct obd_export *exp, ptlrpc_req_set_repsize(req, 5, repsize); } - return req; + RETURN(req); } static struct ptlrpc_request *mdc_intent_lookup_pack(struct obd_export *exp, @@ -352,21 +371,30 @@ static struct ptlrpc_request *mdc_intent_lookup_pack(struct obd_export *exp, struct ptlrpc_request *req; struct ldlm_intent *lit; struct obd_device *obddev = class_exp2obd(exp); - int size[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + int size[6] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREQ_OFF] = sizeof(struct ldlm_request), [DLM_INTENT_IT_OFF] = sizeof(*lit), [DLM_INTENT_REC_OFF] = sizeof(struct mds_body), - [DLM_INTENT_REC_OFF+1]= data->namelen + 1 }; - int repsize[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [DLM_INTENT_REC_OFF+1]= data->namelen + 1, + [DLM_INTENT_REC_OFF+2]= 0 }; + int repsize[6] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREPLY_OFF] = sizeof(struct ldlm_reply), [DLM_REPLY_REC_OFF] = sizeof(struct mds_body), [DLM_REPLY_REC_OFF+1] = obddev->u.cli. 
cl_max_mds_easize, - [DLM_REPLY_REC_OFF+2] = LUSTRE_POSIX_ACL_MAX_SIZE }; + [DLM_REPLY_REC_OFF+2] = LUSTRE_POSIX_ACL_MAX_SIZE, + [DLM_REPLY_REC_OFF+3] = 0 }; obd_valid valid = OBD_MD_FLGETATTR | OBD_MD_FLEASIZE | OBD_MD_FLACL | OBD_MD_FLMODEASIZE | OBD_MD_FLDIREA; + int bufcount = 5; + ENTRY; - req = ldlm_prep_enqueue_req(exp, 5, size, NULL, 0); + if (mdc_exp_is_2_0_server(exp)) { + size[DLM_INTENT_REC_OFF+1] = 0; /* capa */ + size[DLM_INTENT_REC_OFF+2] = data->namelen + 1; + bufcount = 6; + } + req = ldlm_prep_enqueue_req(exp, bufcount, size, NULL, 0); if (req) { /* pack the intent */ lit = lustre_msg_buf(req->rq_reqmsg, DLM_INTENT_IT_OFF, @@ -376,9 +404,9 @@ static struct ptlrpc_request *mdc_intent_lookup_pack(struct obd_export *exp, /* pack the intended request */ mdc_getattr_pack(req, DLM_INTENT_REC_OFF, valid, it->it_flags, data); - ptlrpc_req_set_repsize(req, 5, repsize); + ptlrpc_req_set_repsize(req, bufcount, repsize); } - return req; + RETURN(req); } static struct ptlrpc_request *mdc_intent_readdir_pack(struct obd_export *exp) @@ -386,13 +414,15 @@ static struct ptlrpc_request *mdc_intent_readdir_pack(struct obd_export *exp) struct ptlrpc_request *req; int size[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), [DLM_LOCKREQ_OFF] = sizeof(struct ldlm_request) }; - int repsize[2] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), - [DLM_LOCKREPLY_OFF] = sizeof(struct ldlm_reply) }; + int repsize[3] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [DLM_LOCKREPLY_OFF] = sizeof(struct ldlm_reply), + [DLM_REPLY_REC_OFF] = sizeof(struct ost_lvb) }; + ENTRY; req = ldlm_prep_enqueue_req(exp, 2, size, NULL, 0); if (req) - ptlrpc_req_set_repsize(req, 2, repsize); - return req; + ptlrpc_req_set_repsize(req, 3, repsize); + RETURN(req); } static int mdc_finish_enqueue(struct obd_export *exp, @@ -435,7 +465,7 @@ static int mdc_finish_enqueue(struct obd_export *exp, lockrep = lustre_msg_buf(req->rq_repmsg, DLM_LOCKREPLY_OFF, sizeof(*lockrep)); - 
LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */ + LASSERT(lockrep != NULL); /* checked by ldlm_cli_enqueue() */ /* swabbed by ldlm_cli_enqueue() */ LASSERT(lustre_rep_swabbed(req, DLM_LOCKREPLY_OFF)); @@ -543,18 +573,28 @@ int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, { struct ptlrpc_request *req; struct obd_device *obddev = class_exp2obd(exp); - struct ldlm_res_id res_id = - { .name = {data->fid1.id, data->fid1.generation} }; + struct ldlm_res_id res_id; ldlm_policy_data_t policy = { .l_inodebits = { MDS_INODELOCK_LOOKUP } }; int flags = extra_lock_flags | LDLM_FL_HAS_INTENT; int rc; ENTRY; + fid_build_reg_res_name((struct lu_fid*)&data->fid1, &res_id); LASSERTF(einfo->ei_type == LDLM_IBITS,"lock type %d\n", einfo->ei_type); if (it->it_op & (IT_UNLINK | IT_GETATTR | IT_READDIR)) policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; if (it->it_op & IT_OPEN) { + if ((it->it_op & IT_CREAT) && mdc_exp_is_2_0_server(exp)) { + struct client_obd *cli = &obddev->u.cli; + data->fid3 = data->fid2; + rc = mdc_fid_alloc(cli->cl_seq, + (struct lu_fid*)&data->fid2); + if (rc) { + CERROR("fid allocation result: %d\n", rc); + RETURN(rc); + } + } req = mdc_intent_open_pack(exp, it, data, lmm, lmmsize); if (it->it_flags & O_JOIN_FILE) { policy.l_inodebits.bits = MDS_INODELOCK_UPDATE; @@ -600,11 +640,13 @@ int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, /* We could just return 1 immediately, but since we should only * be called in revalidate_it if we already have a lock, let's * verify that. */ - struct ldlm_res_id res_id = {.name ={fid->id, fid->generation}}; + struct ldlm_res_id res_id; struct lustre_handle lockh; ldlm_policy_data_t policy; ldlm_mode_t mode; + ENTRY; + fid_build_reg_res_name((struct lu_fid*)fid, &res_id); /* As not all attributes are kept under update lock, e.g. owner/group/acls are under lookup lock, we need both ibits for GETATTR. 
*/ @@ -620,7 +662,7 @@ int mdc_revalidate_lock(struct obd_export *exp, struct lookup_intent *it, it->d.lustre.it_lock_mode = mode; } - return !!mode; + RETURN(!!mode); } EXPORT_SYMBOL(mdc_revalidate_lock); @@ -659,10 +701,15 @@ static int mdc_finish_intent_lock(struct obd_export *exp, /* If we were revalidating a fid/name pair, mark the intent in * case we fail and get called again from lookup */ - if (data->fid2.id && (it->it_op != IT_GETATTR)) { + + if (data->fid2.id && (it->it_op != IT_GETATTR) && + ( !mdc_exp_is_2_0_server(exp) || + (mdc_exp_is_2_0_server(exp) && (it->it_flags & O_CHECK_STALE)))) { it_set_disposition(it, DISP_ENQ_COMPLETE); + /* Also: did we find the same inode? */ - if (memcmp(&data->fid2, &mds_body->fid1, sizeof(data->fid2))) + if (memcmp(&data->fid2, &mds_body->fid1, sizeof(data->fid2)) && + memcmp(&data->fid3, &mds_body->fid1, sizeof(data->fid3))) RETURN(-ESTALE); } @@ -762,10 +809,14 @@ int mdc_intent_lock(struct obd_export *exp, struct mdc_op_data *op_data, LASSERT(it); - CDEBUG(D_DLMTRACE,"name: %.*s in inode "LPU64", intent: %s flags %#o\n", - op_data->namelen, op_data->name, op_data->fid1.id, + CDEBUG(D_DLMTRACE,"name: %.*s("DFID") in inode ("DFID"), " + "intent: %s flags %#o\n", + op_data->namelen, op_data->name, + PFID(((struct lu_fid*)&op_data->fid2)), + PFID(((struct lu_fid*)&op_data->fid1)), ldlm_it2str(it->it_op), it->it_flags); + lockh.cookie = 0; if (op_data->fid2.id && (it->it_op == IT_LOOKUP || it->it_op == IT_GETATTR)) { rc = mdc_revalidate_lock(exp, it, &op_data->fid2); @@ -866,10 +917,7 @@ int mdc_intent_getattr_async(struct obd_export *exp, struct lookup_intent *it = &minfo->mi_it; struct ptlrpc_request *req; struct obd_device *obddev = class_exp2obd(exp); - struct ldlm_res_id res_id = { - .name = {op_data->fid1.id, - op_data->fid1.generation} - }; + struct ldlm_res_id res_id; ldlm_policy_data_t policy = { .l_inodebits = { MDS_INODELOCK_LOOKUP } }; @@ -882,6 +930,7 @@ int mdc_intent_getattr_async(struct obd_export *exp, 
op_data->namelen, op_data->name, op_data->fid1.id, ldlm_it2str(it->it_op), it->it_flags); + fid_build_reg_res_name((struct lu_fid*)&op_data->fid1, &res_id); req = mdc_intent_lookup_pack(exp, it, op_data); if (!req) RETURN(-ENOMEM); diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c index 2e663f90de45cbe972f4aa3d80d0621211ef9568..0f32a5ce5bd7a4bacd785875280daccf7ba8fc26 100644 --- a/lustre/mdc/mdc_reint.c +++ b/lustre/mdc/mdc_reint.c @@ -70,12 +70,15 @@ int mdc_resource_get_unused(struct obd_export *exp, struct ll_fid *fid, __u64 bits) { struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - struct ldlm_res_id res_id = { .name = {fid->id, fid->generation} }; - struct ldlm_resource *res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + struct ldlm_res_id res_id; + struct ldlm_resource *res; ldlm_policy_data_t policy = {{0}}; int count; ENTRY; + fid_build_reg_res_name((struct lu_fid*)fid, &res_id); + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + if (res == NULL) RETURN(0); @@ -107,22 +110,40 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *op_data, { CFS_LIST_HEAD(cancels); struct ptlrpc_request *req; - struct mds_rec_setattr *rec; struct mdc_rpc_lock *rpc_lock; struct obd_device *obd = exp->exp_obd; - int size[5] = { sizeof(struct ptlrpc_body), - sizeof(*rec), ealen, ea2len, - sizeof(struct ldlm_request) }; - int count, bufcount = 2, rc; + int size[7] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [REQ_REC_OFF] = sizeof(struct mds_rec_setattr), + [REQ_REC_OFF + 1] = ealen, + [REQ_REC_OFF + 2] = ea2len, + [REQ_REC_OFF + 3] = sizeof(struct ldlm_request) }; + int replysize[6] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [REPLY_REC_OFF] = sizeof(struct mds_body), + [REPLY_REC_OFF+1] = obd->u.cli.cl_max_mds_easize, + [REPLY_REC_OFF+2] = LUSTRE_POSIX_ACL_MAX_SIZE, + [REPLY_REC_OFF+3] = sizeof(struct lustre_capa), + [REPLY_REC_OFF+4] = sizeof(struct lustre_capa)}; + + int count, bufcount = 2, rc, replybufcount = 2; + int 
offset = REQ_REC_OFF + 3; __u64 bits; ENTRY; LASSERT(iattr != NULL); - if (ealen > 0) { - bufcount++; - if (ea2len > 0) - bufcount++; + if (mdc_exp_is_2_0_server(exp)) { + size[REQ_REC_OFF] = sizeof(struct mdt_rec_setattr); + size[REQ_REC_OFF + 1] = 0; /* capa */ + size[REQ_REC_OFF + 2] = 0; //sizeof (struct mdt_epoch); + size[REQ_REC_OFF + 3] = ealen; + size[REQ_REC_OFF + 4] = ea2len; + size[REQ_REC_OFF + 5] = sizeof(struct ldlm_request); + offset = REQ_REC_OFF + 5; + bufcount = 6; + replysize[REPLY_REC_OFF] = sizeof(struct mdt_body); + replybufcount = 6; + } else { + bufcount = 4; } bits = MDS_INODELOCK_UPDATE; @@ -131,9 +152,9 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *op_data, count = mdc_resource_get_unused(exp, &op_data->fid1, &cancels, LCK_EX, bits); if (exp_connect_cancelset(exp)) - bufcount = 5; + bufcount ++ ; req = mdc_prep_elc_req(exp, bufcount, size, - REQ_REC_OFF + 3, &cancels, count); + offset, &cancels, count); if (req == NULL) RETURN(-ENOMEM); @@ -151,8 +172,7 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *op_data, mdc_setattr_pack(req, REQ_REC_OFF, op_data, iattr, ea, ealen, ea2, ea2len); - size[REPLY_REC_OFF] = sizeof(struct mds_body); - ptlrpc_req_set_repsize(req, 2, size); + ptlrpc_req_set_repsize(req, replybufcount, replysize); rc = mdc_reint(req, rpc_lock, LUSTRE_IMP_FULL); *request = req; @@ -170,12 +190,21 @@ int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data, struct obd_device *obd = exp->exp_obd; struct ptlrpc_request *req; int level, bufcount = 3, rc; - int size[5] = { sizeof(struct ptlrpc_body), + int size[6] = { sizeof(struct ptlrpc_body), sizeof(struct mds_rec_create), op_data->namelen + 1, 0, sizeof(struct ldlm_request) }; + int offset = REQ_REC_OFF + 3; int count; ENTRY; + if (mdc_exp_is_2_0_server(exp)) { + size[REQ_REC_OFF] = sizeof(struct mdt_rec_create); + size[REQ_REC_OFF + 1] = 0; /* capa */ + size[REQ_REC_OFF + 2] = op_data->namelen + 1; + size[REQ_REC_OFF + 4] = 
sizeof(struct ldlm_request); + bufcount++; + offset ++; + } if (data && datalen) { size[bufcount] = datalen; bufcount++; @@ -183,10 +212,25 @@ int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data, count = mdc_resource_get_unused(exp, &op_data->fid1, &cancels, LCK_EX, MDS_INODELOCK_UPDATE); - if (exp_connect_cancelset(exp)) - bufcount = 5; + if (exp_connect_cancelset(exp)) { + if (mdc_exp_is_2_0_server(exp)) { + bufcount = 6; + } else { + bufcount = 5; + } + } + + if (mdc_exp_is_2_0_server(exp)) { + struct client_obd *cli = &obd->u.cli; + rc = mdc_fid_alloc(cli->cl_seq, (struct lu_fid*)&op_data->fid2); + if (rc) { + CERROR("fid allocation result: %d\n", rc); + RETURN(rc); + } + } + req = mdc_prep_elc_req(exp, bufcount, size, - REQ_REC_OFF + 3, &cancels, count); + offset, &cancels, count); if (req == NULL) RETURN(-ENOMEM); @@ -196,7 +240,8 @@ int mdc_create(struct obd_export *exp, struct mdc_op_data *op_data, gid, cap_effective, rdev); size[REPLY_REC_OFF] = sizeof(struct mds_body); - ptlrpc_req_set_repsize(req, 2, size); + size[REPLY_REC_OFF+1] = sizeof(struct ost_lvb); + ptlrpc_req_set_repsize(req, 3, size); level = LUSTRE_IMP_FULL; resend: @@ -220,12 +265,23 @@ int mdc_unlink(struct obd_export *exp, struct mdc_op_data *op_data, CFS_LIST_HEAD(cancels); struct obd_device *obd = class_exp2obd(exp); struct ptlrpc_request *req = *request; - int size[4] = { sizeof(struct ptlrpc_body), - sizeof(struct mds_rec_unlink), - op_data->namelen + 1, sizeof(struct ldlm_request) }; + int size[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [REQ_REC_OFF] = sizeof(struct mds_rec_unlink), + [REQ_REC_OFF + 1] = op_data->namelen + 1, + [REQ_REC_OFF + 2] = sizeof(struct ldlm_request) }; int count, rc, bufcount = 3; + int offset = REQ_REC_OFF + 2; ENTRY; + if (mdc_exp_is_2_0_server(exp)) { + size[REQ_REC_OFF] = sizeof(struct mdt_rec_unlink); + size[REQ_REC_OFF + 1] = 0 /* capa */; + size[REQ_REC_OFF + 2] = op_data->namelen + 1; + size[REQ_REC_OFF + 3] = 
sizeof(struct ldlm_request); + bufcount ++; + offset ++; + } + LASSERT(req == NULL); count = mdc_resource_get_unused(exp, &op_data->fid1, &cancels, LCK_EX, MDS_INODELOCK_UPDATE); @@ -233,9 +289,10 @@ int mdc_unlink(struct obd_export *exp, struct mdc_op_data *op_data, count += mdc_resource_get_unused(exp, &op_data->fid3, &cancels, LCK_EX, MDS_INODELOCK_FULL); if (exp_connect_cancelset(exp)) - bufcount = 4; + bufcount ++; + req = mdc_prep_elc_req(exp, bufcount, size, - REQ_REC_OFF + 2, &cancels, count); + offset, &cancels, count); if (req == NULL) RETURN(-ENOMEM); *request = req; @@ -259,20 +316,33 @@ int mdc_link(struct obd_export *exp, struct mdc_op_data *op_data, CFS_LIST_HEAD(cancels); struct obd_device *obd = exp->exp_obd; struct ptlrpc_request *req; - int size[4] = { sizeof(struct ptlrpc_body), - sizeof(struct mds_rec_link), - op_data->namelen + 1, sizeof(struct ldlm_request) }; + int size[6] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [REQ_REC_OFF] = sizeof(struct mds_rec_link), + [REQ_REC_OFF + 1] = op_data->namelen + 1, + [REQ_REC_OFF + 2] = sizeof(struct ldlm_request)}; int count, rc, bufcount = 3; + int offset = REQ_REC_OFF + 2; ENTRY; + if (mdc_exp_is_2_0_server(exp)) { + size[REQ_REC_OFF] = sizeof(struct mdt_rec_link); + size[REQ_REC_OFF + 1] = 0; /* capa */ + size[REQ_REC_OFF + 2] = 0; /* capa */ + size[REQ_REC_OFF + 3] = op_data->namelen + 1; + size[REQ_REC_OFF + 4] = sizeof(struct ldlm_request); + bufcount = 5; + offset += 2; + } + count = mdc_resource_get_unused(exp, &op_data->fid1, &cancels, LCK_EX, MDS_INODELOCK_UPDATE); count += mdc_resource_get_unused(exp, &op_data->fid2, &cancels, LCK_EX, MDS_INODELOCK_UPDATE); if (exp_connect_cancelset(exp)) - bufcount = 4; + bufcount++; + req = mdc_prep_elc_req(exp, bufcount, size, - REQ_REC_OFF + 2, &cancels, count); + offset, &cancels, count); if (req == NULL) RETURN(-ENOMEM); @@ -296,12 +366,26 @@ int mdc_rename(struct obd_export *exp, struct mdc_op_data *op_data, CFS_LIST_HEAD(cancels); 
struct obd_device *obd = exp->exp_obd; struct ptlrpc_request *req; - int size[5] = { sizeof(struct ptlrpc_body), - sizeof(struct mds_rec_rename), - oldlen + 1, newlen + 1, sizeof(struct ldlm_request) }; + int size[7] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [REQ_REC_OFF] = sizeof(struct mds_rec_rename), + [REQ_REC_OFF + 1] = oldlen + 1, + [REQ_REC_OFF + 2] = newlen + 1, + [REQ_REC_OFF + 3] = sizeof(struct ldlm_request) }; int count, rc, bufcount = 4; + int offset = REQ_REC_OFF + 3; ENTRY; + if (mdc_exp_is_2_0_server(exp)) { + size[REQ_REC_OFF] = sizeof(struct mdt_rec_rename); + size[REQ_REC_OFF + 1] = 0; /* capa */ + size[REQ_REC_OFF + 2] = 0; /* capa */ + size[REQ_REC_OFF + 3] = oldlen + 1; + size[REQ_REC_OFF + 4] = newlen + 1; + size[REQ_REC_OFF + 5] = sizeof(struct ldlm_request); + bufcount = 6; + offset += 2; + } + count = mdc_resource_get_unused(exp, &op_data->fid1, &cancels, LCK_EX, MDS_INODELOCK_UPDATE); count += mdc_resource_get_unused(exp, &op_data->fid2, &cancels, @@ -313,9 +397,10 @@ int mdc_rename(struct obd_export *exp, struct mdc_op_data *op_data, count += mdc_resource_get_unused(exp, &op_data->fid4, &cancels, LCK_EX, MDS_INODELOCK_FULL); if (exp_connect_cancelset(exp)) - bufcount = 5; + bufcount ++; + req = mdc_prep_elc_req(exp, bufcount, size, - REQ_REC_OFF + 3, &cancels, count); + offset, &cancels, count); if (req == NULL) RETURN(-ENOMEM); diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index 0d40904db96b54502a92918d9aaed741edf46f68..204cfce6ff075f1c74f586e28172c995b25d93ea 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -54,21 +54,24 @@ static int mdc_cleanup(struct obd_device *obd); extern int mds_queue_req(struct ptlrpc_request *); /* Helper that implements most of mdc_getstatus and signal_completed_replay. 
*/ /* XXX this should become mdc_get_info("key"), sending MDS_GET_INFO RPC */ -static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid, +static int send_getstatus(struct obd_export *exp, struct ll_fid *rootfid, int level, int msg_flags) { struct ptlrpc_request *req; struct mds_body *body; - int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; + int rc, size[3] = { sizeof(struct ptlrpc_body), + sizeof(*body), + sizeof (struct lustre_capa)}; ENTRY; - req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION, MDS_GETSTATUS, 2, size, + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, MDS_GETSTATUS, 2, size, NULL); if (!req) GOTO(out, rc = -ENOMEM); + req->rq_export = class_export_get(exp); req->rq_send_state = level; - ptlrpc_req_set_repsize(req, 2, size); + ptlrpc_req_set_repsize(req, 3, size); mdc_pack_req_body(req, REQ_REC_OFF, 0, NULL, 0, 0); lustre_msg_add_flags(req->rq_reqmsg, msg_flags); @@ -100,8 +103,7 @@ static int send_getstatus(struct obd_import *imp, struct ll_fid *rootfid, /* This should be mdc_get_info("rootfid") */ int mdc_getstatus(struct obd_export *exp, struct ll_fid *rootfid) { - return send_getstatus(class_exp2cliimp(exp), rootfid, LUSTRE_IMP_FULL, - 0); + return send_getstatus(exp, rootfid, LUSTRE_IMP_FULL, 0); } static @@ -111,12 +113,11 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size, struct obd_device *obddev = class_exp2obd(exp); struct mds_body *body; void *eadata; - int size[4] = { sizeof(struct ptlrpc_body), sizeof(*body) }; + int size[6] = { sizeof(struct ptlrpc_body), sizeof(*body) }; int bufcount = 2, rc; ENTRY; - + /* request message already built */ - if (ea_size != 0) { size[bufcount++] = ea_size; CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n", @@ -127,6 +128,10 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size, CDEBUG(D_INODE, "reserved %u bytes for ACL\n", acl_size); } + if (mdc_exp_is_2_0_server(exp)) { + bufcount = 6; + } + 
ptlrpc_req_set_repsize(req, bufcount, size); mdc_enter_request(&obddev->u.cli); @@ -154,7 +159,7 @@ int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size, RETURN (-EPROTO); } } - + if (body->valid & OBD_MD_FLMODEASIZE) { if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize) exp->exp_obd->u.cli.cl_max_mds_easize = @@ -185,6 +190,7 @@ int mdc_getattr(struct obd_export *exp, struct ll_fid *fid, if (!req) GOTO(out, rc = -ENOMEM); + req->rq_export = class_export_get(exp); mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, ea_size, MDS_BFLAG_EXT_FLAGS/*request "new" flags(bug 9486)*/); @@ -207,20 +213,31 @@ int mdc_getattr_name(struct obd_export *exp, struct ll_fid *fid, unsigned int ea_size, struct ptlrpc_request **request) { struct ptlrpc_request *req; - struct mds_body *body; - int rc, size[3] = { sizeof(struct ptlrpc_body), sizeof(*body), namelen}; + int rc, size[4] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [REQ_REC_OFF] = sizeof(struct mds_body), + [REQ_REC_OFF + 1] = namelen}; + int bufcount = 3; + int nameoffset = REQ_REC_OFF + 1; ENTRY; + if (mdc_exp_is_2_0_server(exp)) { + size[REQ_REC_OFF + 1] = 0; + size[REQ_REC_OFF + 2] = namelen; + bufcount ++; + nameoffset ++; + } + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, - MDS_GETATTR_NAME, 3, size, NULL); + MDS_GETATTR_NAME, bufcount, size, NULL); if (!req) GOTO(out, rc = -ENOMEM); + req->rq_export = class_export_get(exp); mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, ea_size, MDS_BFLAG_EXT_FLAGS/*request "new" flags(bug 9486)*/); LASSERT(strnlen(filename, namelen) == namelen - 1); - memcpy(lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, namelen), + memcpy(lustre_msg_buf(req->rq_reqmsg, nameoffset, namelen), filename, namelen); rc = mdc_getattr_common(exp, ea_size, 0, req); @@ -241,12 +258,24 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid, { struct obd_device *obddev = class_exp2obd(exp); struct ptlrpc_request *req; - int size[4] = { 
sizeof(struct ptlrpc_body), sizeof(struct mds_body) }; - // int size[3] = {sizeof(struct mds_body)}, bufcnt = 1; - int rc, xattr_namelen = 0, bufcnt = 2, offset; + int size[5] = { [MSG_PTLRPC_BODY_OFF] = sizeof(struct ptlrpc_body), + [REQ_REC_OFF] = sizeof(struct mds_body), + [REQ_REC_OFF + 1] = 0, /* capa */ + [REQ_REC_OFF + 2] = 0, /* name */ + [REQ_REC_OFF + 3] = 0 }; + int rc, xattr_namelen = 0, bufcnt = 2, offset = REQ_REC_OFF + 1; void *tmp; ENTRY; + if (mdc_exp_is_2_0_server(exp)) { + bufcnt++; + offset++; + if (opcode == MDS_SETXATTR) { + size[REQ_REC_OFF] = sizeof (struct mdt_rec_setxattr); + opcode = MDS_REINT; + } + } + if (xattr_name) { xattr_namelen = strlen(xattr_name) + 1; size[bufcnt++] = xattr_namelen; @@ -261,10 +290,26 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid, if (!req) GOTO(out, rc = -ENOMEM); - /* request data */ - mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, output_size, flags); - - offset = REQ_REC_OFF + 1; + req->rq_export = class_export_get(exp); + + if (opcode == MDS_REINT && mdc_exp_is_2_0_server(exp)) { + struct mdt_rec_setxattr *rec; + rec = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, + sizeof(struct mdt_rec_setxattr)); + rec->sx_opcode = REINT_SETXATTR; + rec->sx_fsuid = current->fsuid; + rec->sx_fsgid = current->fsgid; + rec->sx_cap = current->cap_effective; + rec->sx_suppgid1 = -1; + rec->sx_suppgid2 = -1; + rec->sx_fid = *((struct lu_fid*)fid); + rec->sx_valid = valid; + rec->sx_size = output_size; + rec->sx_flags = flags; + } else { + /* request data */ + mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, output_size, flags); + } if (xattr_name) { tmp = lustre_msg_buf(req->rq_reqmsg, offset++, xattr_namelen); @@ -275,28 +320,32 @@ int mdc_xattr_common(struct obd_export *exp, struct ll_fid *fid, memcpy(tmp, input, input_size); } - /* reply buffers */ - if (opcode == MDS_GETXATTR) { - size[REPLY_REC_OFF] = sizeof(struct mds_body); + size[REPLY_REC_OFF] = sizeof(struct mds_body); + if 
(mdc_exp_is_2_0_server(exp)) { bufcnt = 2; } else { - bufcnt = 1; + /* reply buffers */ + if (opcode == MDS_GETXATTR) { + bufcnt = 2; + } else { + bufcnt = 1; + } + } /* we do this even output_size is 0, because server is doing that */ size[bufcnt++] = output_size; - ptlrpc_req_set_repsize(req, bufcnt, size); /* make rpc */ - if (opcode == MDS_SETXATTR) + if (opcode == MDS_SETXATTR || opcode == MDS_REINT) mdc_get_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); else mdc_enter_request(&obddev->u.cli); rc = ptlrpc_queue_wait(req); - if (opcode == MDS_SETXATTR) + if (opcode == MDS_SETXATTR || opcode == MDS_REINT) mdc_put_rpc_lock(exp->exp_obd->u.cli.cl_rpc_lock, NULL); else mdc_exit_request(&obddev->u.cli); @@ -412,6 +461,7 @@ int mdc_req2lustre_md(struct ptlrpc_request *req, int offset, struct lustre_md *md) { int rc = 0; + int iop = mdc_req_is_2_0_server(req); ENTRY; LASSERT(md); @@ -450,22 +500,24 @@ int mdc_req2lustre_md(struct ptlrpc_request *req, int offset, if (rc < sizeof(*md->lsm)) { CERROR ("lsm size too small: rc < sizeof (*md->lsm) " - "(%d < "LPSZ")\n", rc, sizeof(*md->lsm)); + "(%d < %d)\n", rc, (int)sizeof(*md->lsm)); GOTO(err_out, rc = -EPROTO); } rc = 0; - offset++; - } - - if (md->body->valid & OBD_MD_FLDIREA) { + if (!iop) + offset++; + } else if (md->body->valid & OBD_MD_FLDIREA) { if(!S_ISDIR(md->body->mode)) { CERROR("OBD_MD_FLDIREA set, should be a directory, but " "is not\n"); GOTO(err_out, rc = -EPROTO); } - offset++; + if (!iop) + offset++; } + if (iop) + offset++; /* for ACL, it's possible that FLACL is set but aclsize is zero. 
* only when aclsize != 0 there's an actual segment for ACL in @@ -534,6 +586,7 @@ static void mdc_replay_open(struct ptlrpc_request *req) EXIT; return; } + DEBUG_REQ(D_ERROR, req, "mdc open data found"); och = mod->mod_och; if (och != NULL) { @@ -654,11 +707,11 @@ static void mdc_commit_close(struct ptlrpc_request *req) spin_unlock(&open_req->rq_lock); } -int mdc_close(struct obd_export *exp, struct obdo *oa, +int mdc_close(struct obd_export *exp, struct mdc_op_data *data, struct obdo *oa, struct obd_client_handle *och, struct ptlrpc_request **request) { struct obd_device *obd = class_exp2obd(exp); - int reqsize[2] = { sizeof(struct ptlrpc_body), + int reqsize[4] = { sizeof(struct ptlrpc_body), sizeof(struct mds_body) }; int rc, repsize[4] = { sizeof(struct ptlrpc_body), sizeof(struct mds_body), @@ -666,12 +719,20 @@ int mdc_close(struct obd_export *exp, struct obdo *oa, obd->u.cli.cl_max_mds_cookiesize }; struct ptlrpc_request *req; struct mdc_open_data *mod; + int bufcount = 2; ENTRY; + if (mdc_exp_is_2_0_server(exp)) { + reqsize[1] = sizeof(struct mdt_epoch); + reqsize[2] = sizeof(struct mdt_rec_create); + reqsize[3] = 0; /* capa */ + bufcount = 4; + } req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, - MDS_CLOSE, 2, reqsize, NULL); + MDS_CLOSE, bufcount, reqsize, NULL); if (req == NULL) GOTO(out, rc = -ENOMEM); + req->rq_export = class_export_get(exp); /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a * portal whose threads are not taking any DLM locks and are therefore @@ -692,12 +753,13 @@ int mdc_close(struct obd_export *exp, struct obdo *oa, GOTO(out, rc = -EIO); } mod->mod_close_req = req; + DEBUG_REQ(D_RPCTRACE, mod->mod_close_req, "close req"); DEBUG_REQ(D_RPCTRACE, mod->mod_open_req, "matched open"); } else { CDEBUG(D_RPCTRACE, "couldn't find open req; expecting error\n"); } - mdc_close_pack(req, REQ_REC_OFF, oa, oa->o_valid, och); + mdc_close_pack(req, REQ_REC_OFF, data, oa, oa->o_valid, och); ptlrpc_req_set_repsize(req, 
4, repsize); req->rq_commit_cb = mdc_commit_close; @@ -741,7 +803,8 @@ int mdc_close(struct obd_export *exp, struct obdo *oa, return rc; } -int mdc_done_writing(struct obd_export *exp, struct obdo *obdo) +int mdc_done_writing(struct obd_export *exp, struct mdc_op_data *data, + struct obdo *obdo) { struct ptlrpc_request *req; struct mds_body *body; @@ -753,8 +816,9 @@ int mdc_done_writing(struct obd_export *exp, struct obdo *obdo) if (req == NULL) RETURN(-ENOMEM); + req->rq_export = class_export_get(exp); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); - mdc_pack_fid(&body->fid1, obdo->o_id, 0, obdo->o_mode); + body->fid1 = data->fid1; body->size = obdo->o_size; body->blocks = obdo->o_blocks; body->flags = obdo->o_flags; @@ -785,6 +849,7 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *fid, __u64 offset, if (req == NULL) GOTO(out, rc = -ENOMEM); + req->rq_export = class_export_get(exp); req->rq_request_portal = MDS_READPAGE_PORTAL; ptlrpc_at_set_req_timeout(req); @@ -917,7 +982,7 @@ int mdc_set_info_async(struct obd_export *exp, obd_count keylen, exp->exp_obd->obd_name, imp->imp_initial_recov_bk); RETURN(0); } - if (KEY_IS("read-only")) { + if (KEY_IS(KEY_READONLY)) { struct ptlrpc_request *req; int size[3] = { sizeof(struct ptlrpc_body), keylen, vallen }; char *bufs[3] = { NULL, key, val }; @@ -940,6 +1005,7 @@ int mdc_set_info_async(struct obd_export *exp, obd_count keylen, if (req == NULL) RETURN(-ENOMEM); + req->rq_export = class_export_get(exp); ptlrpc_req_set_repsize(req, 1, NULL); if (set) { rc = 0; @@ -961,8 +1027,7 @@ int mdc_get_info(struct obd_export *exp, __u32 keylen, void *key, { int rc = -EINVAL; - if (keylen == strlen("max_easize") && - memcmp(key, "max_easize", strlen("max_easize")) == 0) { + if (KEY_IS(KEY_MAX_EASIZE)) { int mdsize, *max_easize; if (*vallen != sizeof(int)) @@ -982,19 +1047,30 @@ static int mdc_statfs(struct obd_device *obd, struct obd_statfs *osfs, { struct ptlrpc_request *req; struct obd_statfs *msfs; 
+ struct obd_import *imp = NULL; int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*msfs) }; ENTRY; + /*Since the request might also come from lprocfs, so we need + *sync this with client_disconnect_export Bug15684*/ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + RETURN(-ENODEV); + + /* We could possibly pass max_age in the request (as an absolute * timestamp or a "seconds.usec ago") so the target can avoid doing * extra calls into the filesystem if that isn't necessary (e.g. * during mount that would help a bit). Having relative timestamps * is not so great if request processing is slow, while absolute * timestamps are not ideal because they need time synchronization. */ - req = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_MDS_VERSION, - MDS_STATFS, 1, NULL, NULL); + req = ptlrpc_prep_req(imp, LUSTRE_MDS_VERSION, MDS_STATFS, 1, NULL, + NULL); if (!req) - RETURN(-ENOMEM); + GOTO(output, rc = -ENOMEM); ptlrpc_req_set_repsize(req, 2, size); @@ -1020,25 +1096,30 @@ static int mdc_statfs(struct obd_device *obd, struct obd_statfs *osfs, EXIT; out: ptlrpc_req_finished(req); - +output: + class_import_put(imp); return rc; } -static int mdc_pin(struct obd_export *exp, obd_id ino, __u32 gen, int type, +static int mdc_pin(struct obd_export *exp, struct ll_fid *fid, struct obd_client_handle *handle, int flag) { struct ptlrpc_request *req; struct mds_body *body; - int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; + int rc, size[3] = { sizeof(struct ptlrpc_body), sizeof(*body), 0 }; + int bufcount = 2; ENTRY; + if (mdc_exp_is_2_0_server(exp)) + bufcount = 3; req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, - MDS_PIN, 2, size, NULL); + MDS_PIN, bufcount, size, NULL); if (req == NULL) RETURN(-ENOMEM); - + + req->rq_export = class_export_get(exp); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); - mdc_pack_fid(&body->fid1, 
ino, gen, type); + body->fid1 = *fid; body->flags = flag; ptlrpc_req_set_repsize(req, 2, size); @@ -1088,6 +1169,7 @@ static int mdc_unpin(struct obd_export *exp, if (req == NULL) RETURN(-ENOMEM); + req->rq_export = class_export_get(exp); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); memcpy(&body->handle, &handle->och_fh, sizeof(body->handle)); body->flags = flag; @@ -1110,15 +1192,20 @@ int mdc_sync(struct obd_export *exp, struct ll_fid *fid, struct ptlrpc_request **request) { struct ptlrpc_request *req; - int size[2] = { sizeof(struct ptlrpc_body), sizeof(struct mds_body) }; + int size[3] = { sizeof(struct ptlrpc_body), sizeof(struct mds_body), 0 }; + int bufcount = 2; int rc; ENTRY; + + if (mdc_exp_is_2_0_server(exp)) + bufcount = 3; req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_MDS_VERSION, - MDS_SYNC, 2, size, NULL); + MDS_SYNC, bufcount, size, NULL); if (!req) RETURN(rc = -ENOMEM); + req->rq_export = class_export_get(exp); mdc_pack_req_body(req, REQ_REC_OFF, 0, fid, 0, 0); ptlrpc_req_set_repsize(req, 2, size); @@ -1234,7 +1321,7 @@ int mdc_init_ea_size(struct obd_export *mdc_exp, struct obd_export *lov_exp) int rc, size; ENTRY; - rc = obd_get_info(lov_exp, strlen(KEY_LOVDESC) + 1, KEY_LOVDESC, + rc = obd_get_info(lov_exp, sizeof(KEY_LOVDESC), KEY_LOVDESC, &valsize, &desc); if (rc) RETURN(rc); @@ -1281,11 +1368,12 @@ static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) class_destroy_import(imp); obd->u.cli.cl_import = NULL; } - break; - case OBD_CLEANUP_SELF_EXP: rc = obd_llog_finish(obd, 0); if (rc != 0) CERROR("failed to cleanup llogging subsystems\n"); + break; + case OBD_CLEANUP_SELF_EXP: + break; case OBD_CLEANUP_OBD: break; } @@ -1320,7 +1408,7 @@ static int mdc_llog_init(struct obd_device *obd, struct obd_device *tgt, &llog_client_ops); if (rc == 0) { ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); - ctxt->loc_imp = obd->u.cli.cl_import; + llog_initiator_connect(ctxt); llog_ctxt_put(ctxt); } @@ 
-1328,7 +1416,7 @@ static int mdc_llog_init(struct obd_device *obd, struct obd_device *tgt, &llog_client_ops); if (rc == 0) { ctxt = llog_get_context(obd, LLOG_LOVEA_REPL_CTXT); - ctxt->loc_imp = obd->u.cli.cl_import; + llog_initiator_connect(ctxt); llog_ctxt_put(ctxt); } @@ -1360,6 +1448,57 @@ static int mdc_process_config(struct obd_device *obd, obd_count len, void *buf) return(rc); } +static int mdc_fid_init(struct obd_export *exp) +{ + struct client_obd *cli; + char *prefix; + int rc; + ENTRY; + + cli = &exp->exp_obd->u.cli; + + OBD_ALLOC_PTR(cli->cl_seq); + if (cli->cl_seq == NULL) + RETURN(-ENOMEM); + + OBD_ALLOC(prefix, MAX_OBD_NAME + 5); + if (prefix == NULL) + GOTO(out_free_seq, rc = -ENOMEM); + + snprintf(prefix, MAX_OBD_NAME + 5, "srv-%s", exp->exp_obd->obd_name); + + /* Init client side sequence-manager */ + rc = seq_client_init(cli->cl_seq, exp, + LUSTRE_SEQ_METADATA, + LUSTRE_SEQ_MAX_WIDTH, + prefix); + OBD_FREE(prefix, MAX_OBD_NAME + 5); + if (rc) + GOTO(out_free_seq, rc); + + RETURN(rc); + +out_free_seq: + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + return rc; +} + +static int mdc_fid_fini(struct obd_export *exp) +{ + struct client_obd *cli = &exp->exp_obd->u.cli; + ENTRY; + + if (cli->cl_seq != NULL) { + LASSERT(cli->cl_seq->lcs_exp == exp); + seq_client_fini(cli->cl_seq); + OBD_FREE_PTR(cli->cl_seq); + cli->cl_seq = NULL; + } + + RETURN(0); +} + struct obd_ops mdc_obd_ops = { .o_owner = THIS_MODULE, .o_setup = mdc_setup, @@ -1369,6 +1508,8 @@ struct obd_ops mdc_obd_ops = { .o_del_conn = client_import_del_conn, .o_connect = client_connect_import, .o_disconnect = client_disconnect_export, + .o_fid_init = mdc_fid_init, + .o_fid_fini = mdc_fid_fini, .o_iocontrol = mdc_iocontrol, .o_set_info_async = mdc_set_info_async, .o_get_info = mdc_get_info, diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 072591a1252cff7c1c8cb38dfcf32069018fe897..ba680967ca0c02f3a4773f83812ea68557a72f7e 100644 --- a/lustre/mds/handler.c +++ 
b/lustre/mds/handler.c @@ -206,6 +206,7 @@ struct dentry *mds_fid2locked_dentry(struct obd_device *obd, struct ll_fid *fid, struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, struct vfsmount **mnt) { + struct obd_device *obd = container_of(mds, struct obd_device, u.mds); char fid_name[32]; unsigned long ino = fid->id; __u32 generation = fid->generation; @@ -222,7 +223,7 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, /* under ext3 this is neither supposed to return bad inodes nor NULL inodes. */ - result = ll_lookup_one_len(fid_name, mds->mds_fid_de, strlen(fid_name)); + result = mds_lookup(obd, fid_name, mds->mds_fid_de, strlen(fid_name)); if (IS_ERR(result)) RETURN(result); @@ -233,8 +234,6 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, if (inode->i_nlink == 0) { if (inode->i_mode == 0 && LTIME_S(inode->i_ctime) == 0 ) { - struct obd_device *obd = container_of(mds, struct - obd_device, u.mds); LCONSOLE_WARN("Found inode with zero nlink, mode and " "ctime -- this may indicate disk" "corruption (device %s, inode %lu, link:" @@ -327,8 +326,7 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, { struct obd_export *exp; struct mds_export_data *med; - struct mds_client_data *mcd = NULL; - lnet_nid_t *client_nid = (lnet_nid_t *)localdata; + struct lsd_client_data *lcd = NULL; int rc, abort_recovery; ENTRY; @@ -348,7 +346,7 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, * * There is a second race between adding the export to the list, * and filling in the client data below. Hence skipping the case - * of NULL mcd above. We should already be controlling multiple + * of NULL lcd above. We should already be controlling multiple * connects at the client, and we can't hold the spinlock over * memory allocations without risk of deadlocking. 
*/ @@ -363,21 +361,21 @@ static int mds_connect(struct lustre_handle *conn, struct obd_device *obd, if (rc) GOTO(out, rc); - OBD_ALLOC(mcd, sizeof(*mcd)); - if (!mcd) + OBD_ALLOC_PTR(lcd); + if (!lcd) GOTO(out, rc = -ENOMEM); - memcpy(mcd->mcd_uuid, cluuid, sizeof(mcd->mcd_uuid)); - med->med_mcd = mcd; + memcpy(lcd->lcd_uuid, cluuid, sizeof(lcd->lcd_uuid)); + med->med_lcd = lcd; - rc = mds_client_add(obd, exp, -1, *client_nid); + rc = mds_client_add(obd, exp, -1, localdata); GOTO(out, rc); out: if (rc) { - if (mcd) { - OBD_FREE(mcd, sizeof(*mcd)); - med->med_mcd = NULL; + if (lcd) { + OBD_FREE_PTR(lcd); + med->med_lcd = NULL; } class_disconnect(exp); } else { @@ -703,7 +701,6 @@ static int mds_getattr_internal(struct obd_device *obd, struct dentry *dentry, body = lustre_msg_buf(req->rq_repmsg, reply_off, sizeof(*body)); LASSERT(body != NULL); /* caller prepped reply */ - mds_pack_inode2fid(&body->fid1, inode); body->flags = reqbody->flags; /* copy MDS_BFLAG_EXT_FLAGS if present */ mds_pack_inode2body(body, inode); reply_off++; @@ -955,6 +952,8 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, } #endif + /* child_lockh() is only set in fixup_handle_for_resent_req() + * if MSG_RESENT is set */ if (lustre_handle_is_used(child_lockh)) { LASSERT(lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT); resent_req = 1; @@ -988,6 +987,8 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, struct ldlm_resource *res; DEBUG_REQ(D_DLMTRACE, req, "resent, not enqueuing new locks"); granted_lock = ldlm_handle2lock(child_lockh); + /* lock was granted in fixup_handle_for_resent_req() if + * MSG_RESENT is set */ LASSERTF(granted_lock != NULL, LPU64"/%u lockh "LPX64"\n", body->fid1.id, body->fid1.generation, child_lockh->cookie); @@ -997,7 +998,12 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, child_fid.id = res->lr_name.name[0]; child_fid.generation = res->lr_name.name[1]; dchild = mds_fid2dentry(&obd->u.mds, &child_fid, 
NULL); - LASSERT(!IS_ERR(dchild)); + if (IS_ERR(dchild)) { + rc = PTR_ERR(dchild); + LCONSOLE_WARN("Child "LPU64"/%u lookup error %d.", + child_fid.id, child_fid.generation, rc); + GOTO(cleanup, rc); + } LDLM_LOCK_PUT(granted_lock); } @@ -1179,7 +1185,6 @@ static int mds_sync(struct ptlrpc_request *req, int offset) body = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*body)); - mds_pack_inode2fid(&body->fid1, de->d_inode); mds_pack_inode2body(body, de->d_inode); l_dput(de); @@ -1356,7 +1361,7 @@ static int mds_set_info_rpc(struct obd_export *exp, struct ptlrpc_request *req) lustre_msg_set_status(req->rq_repmsg, 0); - if (KEY_IS("read-only")) { + if (KEY_IS(KEY_READONLY)) { if (val == NULL || vallen < sizeof(__u32)) { DEBUG_REQ(D_HA, req, "no set_info val"); RETURN(-EFAULT); @@ -1523,8 +1528,8 @@ int mds_handle(struct ptlrpc_request *req) /* sanity check: if the xid matches, the request must * be marked as a resent or replayed */ - if (req->rq_xid == le64_to_cpu(med->med_mcd->mcd_last_xid) || - req->rq_xid == le64_to_cpu(med->med_mcd->mcd_last_close_xid)) + if (req->rq_xid == le64_to_cpu(med->med_lcd->lcd_last_xid) || + req->rq_xid == le64_to_cpu(med->med_lcd->lcd_last_close_xid)) if (!(lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY))) { CERROR("rq_xid "LPU64" matches last_xid, " @@ -1650,7 +1655,7 @@ int mds_handle(struct ptlrpc_request *req) break; } opc = *opcp; - if (lustre_msg_swabbed(req->rq_reqmsg)) + if (lustre_req_need_swab(req)) __swab32s(&opc); DEBUG_REQ(D_INODE, req, "reint %d (%s)", opc, @@ -1822,12 +1827,11 @@ int mds_handle(struct ptlrpc_request *req) /* If we're DISCONNECTing, the mds_export_data is already freed */ if (!rc && lustre_msg_get_opc(req->rq_reqmsg) != MDS_DISCONNECT) { struct mds_export_data *med = &req->rq_export->exp_mds_data; - + /* I don't think last_xid is used for anyway, so I'm not sure if we need to care about last_close_xid here.*/ lustre_msg_set_last_xid(req->rq_repmsg, - 
le64_to_cpu(med->med_mcd->mcd_last_xid)); - + le64_to_cpu(med->med_lcd->lcd_last_xid)); target_committed_to_req(req); } @@ -1982,7 +1986,7 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) mds->mds_evict_ost_nids = 1; sprintf(ns_name, "mds-%s", obd->obd_uuid.uuid); - obd->obd_namespace = ldlm_namespace_new(ns_name, LDLM_NAMESPACE_SERVER, + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, LDLM_NAMESPACE_SERVER, LDLM_NAMESPACE_GREEDY); if (obd->obd_namespace == NULL) { mds_cleanup(obd); @@ -2080,8 +2084,11 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) obd->obd_replayable ? "enabled" : "disabled"); } + /* Reduce the initial timeout on an MDS because it doesn't need such + * a long timeout as an OST does. Adaptive timeouts will adjust this + * value appropriately. */ if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT) - ldlm_timeout = 6; + ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT; RETURN(0); @@ -2095,7 +2102,7 @@ err_fs: err_ns: lprocfs_free_obd_stats(obd); lprocfs_obd_cleanup(obd); - ldlm_namespace_free(obd->obd_namespace, 0); + ldlm_namespace_free(obd->obd_namespace, NULL, 0); obd->obd_namespace = NULL; err_ops: fsfilt_put_ops(obd->obd_fsops); @@ -2282,7 +2289,8 @@ static int mds_cleanup(struct obd_device *obd) server_put_mount(obd->obd_name, mds->mds_vfsmnt); obd->u.obt.obt_sb = NULL; - ldlm_namespace_free(obd->obd_namespace, obd->obd_force); + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; spin_lock_bh(&obd->obd_processing_task_lock); if (obd->obd_recovering) { @@ -2335,11 +2343,11 @@ static void fixup_handle_for_resent_req(struct ptlrpc_request *req, int offset, * and allow it. 
(It's probably an OPEN, for which we don't * send a lock */ if (req->rq_xid == - le64_to_cpu(exp->exp_mds_data.med_mcd->mcd_last_xid)) + le64_to_cpu(exp->exp_mds_data.med_lcd->lcd_last_xid)) return; if (req->rq_xid == - le64_to_cpu(exp->exp_mds_data.med_mcd->mcd_last_close_xid)) + le64_to_cpu(exp->exp_mds_data.med_lcd->lcd_last_close_xid)) return; /* This remote handle isn't enqueued, so we never received or diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index ba1562f43cc7dba8b2f62a7cf38fced1d23c25f7..c2aacd76a07deff4daaa34fae9a5928dbaf0f763 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -90,8 +90,8 @@ static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer, return -ENOMEM; if (obd->u.mds.mds_evict_ost_nids) { - rc = obd_set_info_async(mds->mds_osc_exp,strlen("evict_by_nid"), - "evict_by_nid", strlen(tmpbuf + 4) + 1, + rc = obd_set_info_async(mds->mds_osc_exp,sizeof(KEY_EVICT_BY_NID), + KEY_EVICT_BY_NID, strlen(tmpbuf + 4) + 1, tmpbuf + 4, set); if (rc) CERROR("Failed to evict nid %s from OSTs: rc %d\n", diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index ce27d6acc1279e1ec022ca1ed05b5dea47e4b003..a747d0b842fbbc1d69ede536eda8bfcac41732c9 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -52,15 +52,15 @@ static int mds_export_stats_init(struct obd_device *obd, struct obd_export *exp, - lnet_nid_t client_nid) + void *client_nid) { - int rc, num_stats, newnid; + int rc, num_stats, newnid = 0; rc = lprocfs_exp_setup(exp, client_nid, &newnid); if (rc) return rc; - if (client_nid && newnid) { + if (newnid) { struct nid_stat *tmp = exp->exp_nid_stats; LASSERT(tmp != NULL); @@ -92,7 +92,7 @@ static int mds_export_stats_init(struct obd_device *obd, * mds_init_server_data() callsite needs to be fixed. 
*/ int mds_client_add(struct obd_device *obd, struct obd_export *exp, - int cl_idx, lnet_nid_t client_nid) + int cl_idx, void *localdata) { struct mds_obd *mds = &obd->u.mds; struct mds_export_data *med = &exp->exp_mds_data; @@ -104,8 +104,8 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp, LASSERT(bitmap != NULL); LASSERTF(cl_idx > -2, "%d\n", cl_idx); - /* XXX if mcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ - if (!strcmp(med->med_mcd->mcd_uuid, obd->obd_uuid.uuid)) + /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (!strcmp(med->med_lcd->lcd_uuid, obd->obd_uuid.uuid)) RETURN(0); /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so @@ -134,13 +134,13 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp, } CDEBUG(D_INFO, "client at idx %d with UUID '%s' added\n", - cl_idx, med->med_mcd->mcd_uuid); + cl_idx, med->med_lcd->lcd_uuid); med->med_lr_idx = cl_idx; med->med_lr_off = le32_to_cpu(mds->mds_server_data->lsd_client_start) + (cl_idx * le16_to_cpu(mds->mds_server_data->lsd_client_size)); LASSERTF(med->med_lr_off > 0, "med_lr_off = %llu\n", med->med_lr_off); - mds_export_stats_init(obd, exp, client_nid); + mds_export_stats_init(obd, exp, localdata); if (new_client) { struct lvfs_run_ctxt saved; @@ -162,8 +162,8 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp, exp->exp_need_sync = 1; spin_unlock(&exp->exp_lock); } - rc = fsfilt_write_record(obd, file, med->med_mcd, - sizeof(*med->med_mcd), + rc = fsfilt_write_record(obd, file, med->med_lcd, + sizeof(*med->med_lcd), &off, rc /* sync if no cb */); fsfilt_commit(obd, file->f_dentry->d_inode, handle, 0); } @@ -172,9 +172,9 @@ int mds_client_add(struct obd_device *obd, struct obd_export *exp, if (rc) return rc; - CDEBUG(D_INFO, "wrote client mcd at idx %u off %llu (len %u)\n", + CDEBUG(D_INFO, "wrote client lcd at idx %u off %llu (len %u)\n", med->med_lr_idx, med->med_lr_off, - (unsigned 
int)sizeof(*med->med_mcd)); + (unsigned int)sizeof(*med->med_lcd)); } return 0; } @@ -184,21 +184,21 @@ int mds_client_free(struct obd_export *exp) struct mds_export_data *med = &exp->exp_mds_data; struct mds_obd *mds = &exp->exp_obd->u.mds; struct obd_device *obd = exp->exp_obd; - struct mds_client_data zero_mcd; + struct lsd_client_data zero_lcd; struct lvfs_run_ctxt saved; int rc; loff_t off; ENTRY; - if (!med->med_mcd) + if (!med->med_lcd) RETURN(0); - /* XXX if mcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ - if (!strcmp(med->med_mcd->mcd_uuid, obd->obd_uuid.uuid)) + /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (!strcmp(med->med_lcd->lcd_uuid, obd->obd_uuid.uuid)) GOTO(free, 0); CDEBUG(D_INFO, "freeing client at idx %u, offset %lld with UUID '%s'\n", - med->med_lr_idx, med->med_lr_off, med->med_mcd->mcd_uuid); + med->med_lr_idx, med->med_lr_off, med->med_lcd->lcd_uuid); LASSERT(mds->mds_client_bitmap != NULL); @@ -223,17 +223,17 @@ int mds_client_free(struct obd_export *exp) } if (!(exp->exp_flags & OBD_OPT_FAILOVER)) { - memset(&zero_mcd, 0, sizeof zero_mcd); + memset(&zero_lcd, 0, sizeof(zero_lcd)); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - rc = fsfilt_write_record(obd, mds->mds_rcvd_filp, &zero_mcd, - sizeof(zero_mcd), &off, + rc = fsfilt_write_record(obd, mds->mds_rcvd_filp, &zero_lcd, + sizeof(zero_lcd), &off, (!exp->exp_libclient || exp->exp_need_sync)); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); CDEBUG(rc == 0 ? 
D_INFO : D_ERROR, "zeroing out client %s idx %u in %s rc %d\n", - med->med_mcd->mcd_uuid, med->med_lr_idx, LAST_RCVD, rc); + med->med_lcd->lcd_uuid, med->med_lr_idx, LAST_RCVD, rc); } if (!test_and_clear_bit(med->med_lr_idx, mds->mds_client_bitmap)) { @@ -250,8 +250,8 @@ int mds_client_free(struct obd_export *exp) EXIT; free: - OBD_FREE(med->med_mcd, sizeof(*med->med_mcd)); - med->med_mcd = NULL; + OBD_FREE_PTR(med->med_lcd); + med->med_lcd = NULL; return 0; } @@ -269,7 +269,7 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) { struct mds_obd *mds = &obd->u.mds; struct lr_server_data *lsd; - struct mds_client_data *mcd = NULL; + struct lsd_client_data *lcd = NULL; loff_t off = 0; unsigned long last_rcvd_size = i_size_read(file->f_dentry->d_inode); __u64 mount_count; @@ -279,8 +279,8 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) /* ensure padding in the struct is the correct size */ LASSERT(offsetof(struct lr_server_data, lsd_padding) + sizeof(lsd->lsd_padding) == LR_SERVER_SIZE); - LASSERT(offsetof(struct mds_client_data, mcd_padding) + - sizeof(mcd->mcd_padding) == LR_CLIENT_SIZE); + LASSERT(offsetof(struct lsd_client_data, lcd_padding) + + sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE); OBD_ALLOC_WAIT(lsd, sizeof(*lsd)); if (!lsd) @@ -377,59 +377,59 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) struct obd_export *exp; struct mds_export_data *med; - if (!mcd) { - OBD_ALLOC_WAIT(mcd, sizeof(*mcd)); - if (!mcd) + if (!lcd) { + OBD_ALLOC_WAIT(lcd, sizeof(*lcd)); + if (!lcd) GOTO(err_client, rc = -ENOMEM); } /* Don't assume off is incremented properly by - * fsfilt_read_record(), in case sizeof(*mcd) + * fsfilt_read_record(), in case sizeof(*lcd) * isn't the same as lsd->lsd_client_size. 
*/ off = le32_to_cpu(lsd->lsd_client_start) + cl_idx * le16_to_cpu(lsd->lsd_client_size); - rc = fsfilt_read_record(obd, file, mcd, sizeof(*mcd), &off); + rc = fsfilt_read_record(obd, file, lcd, sizeof(*lcd), &off); if (rc) { CERROR("error reading MDS %s idx %d, off %llu: rc %d\n", LAST_RCVD, cl_idx, off, rc); break; /* read error shouldn't cause startup to fail */ } - if (mcd->mcd_uuid[0] == '\0') { + if (lcd->lcd_uuid[0] == '\0') { CDEBUG(D_INFO, "skipping zeroed client at offset %d\n", cl_idx); continue; } - last_transno = le64_to_cpu(mcd->mcd_last_transno) > - le64_to_cpu(mcd->mcd_last_close_transno) ? - le64_to_cpu(mcd->mcd_last_transno) : - le64_to_cpu(mcd->mcd_last_close_transno); + last_transno = le64_to_cpu(lcd->lcd_last_transno) > + le64_to_cpu(lcd->lcd_last_close_transno) ? + le64_to_cpu(lcd->lcd_last_transno) : + le64_to_cpu(lcd->lcd_last_close_transno); /* These exports are cleaned up by mds_disconnect(), so they * need to be set up like real exports as mds_connect() does. */ CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64 - " srv lr: "LPU64" lx: "LPU64"\n", mcd->mcd_uuid, cl_idx, + " srv lr: "LPU64" lx: "LPU64"\n", lcd->lcd_uuid, cl_idx, last_transno, le64_to_cpu(lsd->lsd_last_transno), - le64_to_cpu(mcd->mcd_last_xid)); + le64_to_cpu(lcd->lcd_last_xid)); - exp = class_new_export(obd, (struct obd_uuid *)mcd->mcd_uuid); + exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid); if (IS_ERR(exp)) { if (PTR_ERR(exp) == -EALREADY) { /* export already exists, zero out this one */ - mcd->mcd_uuid[0] = '\0'; + lcd->lcd_uuid[0] = '\0'; } else { GOTO(err_client, rc = PTR_ERR(exp)); } } else { med = &exp->exp_mds_data; - med->med_mcd = mcd; - rc = mds_client_add(obd, exp, cl_idx, 0); + med->med_lcd = lcd; + rc = mds_client_add(obd, exp, cl_idx, NULL); /* can't fail for existing client */ LASSERTF(rc == 0, "rc = %d\n", rc); - mcd = NULL; + lcd = NULL; spin_lock(&exp->exp_lock); exp->exp_replay_needed = 1; @@ -449,8 +449,8 @@ static int 
mds_init_server_data(struct obd_device *obd, struct file *file) mds->mds_last_transno = last_transno; } - if (mcd) - OBD_FREE(mcd, sizeof(*mcd)); + if (lcd) + OBD_FREE_PTR(lcd); obd->obd_last_committed = mds->mds_last_transno; diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index 6041da69801a0515be8aa40cab8f51b0fc45dafc..a0a890ea9323af021db55078bf5a14a551af54c4 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -9,22 +9,8 @@ #include <lustre_mds.h> #define MDT_ROCOMPAT_SUPP (OBD_ROCOMPAT_LOVOBJID) -#define MDT_INCOMPAT_SUPP (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR) - -/* Data stored per client in the last_rcvd file. In le32 order. */ -struct mds_client_data { - __u8 mcd_uuid[40]; /* client UUID */ - __u64 mcd_last_transno; /* last completed transaction ID */ - __u64 mcd_last_xid; /* xid for the last transaction */ - __u32 mcd_last_result; /* result from last RPC */ - __u32 mcd_last_data; /* per-op data (disposition for open &c.) */ - /* for MDS_CLOSE requests */ - __u64 mcd_last_close_transno; /* last completed transaction ID */ - __u64 mcd_last_close_xid; /* xid for the last transaction */ - __u32 mcd_last_close_result; /* result from last RPC */ - __u32 mcd_last_close_data; /* per-op data (disposition for open &c.) */ - __u8 mcd_padding[LR_CLIENT_SIZE - 88]; -}; +#define MDT_INCOMPAT_SUPP (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR | \ + OBD_INCOMPAT_FID) #define MDS_SERVICE_WATCHDOG_FACTOR 2000 @@ -41,6 +27,12 @@ static inline struct mds_obd *mds_req2mds(struct ptlrpc_request *req) return &req->rq_export->exp_obd->u.mds; } +static inline void mds_export_evict(struct obd_export *exp) +{ + class_fail_export(exp); + class_export_put(exp); +} + #ifdef __KERNEL__ /* Open counts for files. 
No longer atomic, must hold inode->i_sem */ #if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) @@ -106,18 +98,18 @@ static inline void mds_inode_unset_orphan(struct inode *inode) #define MDS_CHECK_RESENT(req, reconstruct) \ { \ if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_RESENT) { \ - struct mds_client_data *mcd = \ - req->rq_export->exp_mds_data.med_mcd; \ - if (le64_to_cpu(mcd->mcd_last_xid) == req->rq_xid) { \ + struct lsd_client_data *lcd = \ + req->rq_export->exp_mds_data.med_lcd; \ + if (le64_to_cpu(lcd->lcd_last_xid) == req->rq_xid) { \ reconstruct; \ - RETURN(le32_to_cpu(mcd->mcd_last_result)); \ + RETURN(le32_to_cpu(lcd->lcd_last_result)); \ } \ - if (le64_to_cpu(mcd->mcd_last_close_xid) == req->rq_xid) { \ + if (le64_to_cpu(lcd->lcd_last_close_xid) == req->rq_xid) { \ reconstruct; \ - RETURN(le32_to_cpu(mcd->mcd_last_close_result)); \ + RETURN(le32_to_cpu(lcd->lcd_last_close_result)); \ } \ DEBUG_REQ(D_HA, req, "no reply for RESENT req (have "LPD64")",\ - mcd->mcd_last_xid); \ + lcd->lcd_last_xid); \ } \ } @@ -135,7 +127,7 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, struct ptlrpc_request *req, int rc, __u32 op_data, int force_sync); void mds_reconstruct_generic(struct ptlrpc_request *req); -void mds_req_from_mcd(struct ptlrpc_request *req, struct mds_client_data *mcd); +void mds_req_from_lcd(struct ptlrpc_request *req, struct lsd_client_data *cd); int mds_get_parent_child_locked(struct obd_device *obd, struct mds_obd *mds, struct ll_fid *fid, struct lustre_handle *parent_lockh, @@ -165,6 +157,11 @@ int mds_get_parents_children_locked(struct obd_device *obd, struct lustre_handle *dlm_handles, int child_mode); +struct dentry *mds_lookup(struct obd_device *obd, + const char *fid_name, + struct dentry *dparent, + int fid_namelen); + void mds_shrink_reply(struct obd_device *obd, struct ptlrpc_request *req, struct mds_body *body, int md_off); int mds_get_cookie_size(struct obd_device *obd, struct lov_mds_md *lmm); @@ 
-235,7 +232,7 @@ int mds_join_file(struct mds_update_record *rec, struct ptlrpc_request *req, /* mds/mds_fs.c */ int mds_client_add(struct obd_device *obd, struct obd_export *exp, - int cl_off, lnet_nid_t client_nid); + int cl_off, void *localdata); int mds_client_free(struct obd_export *exp); int mds_obd_create(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md **ea, struct obd_trans_info *oti); @@ -254,7 +251,6 @@ int mds_get_md(struct obd_device *, struct inode *, void *md, int *size, int lock, int flags); int mds_pack_md(struct obd_device *, struct lustre_msg *, int offset, struct mds_body *, struct inode *, int lock, int flags); -void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode); void mds_pack_inode2body(struct mds_body *body, struct inode *inode); #endif int mds_pack_acl(struct mds_export_data *med, struct inode *inode, diff --git a/lustre/mds/mds_join.c b/lustre/mds/mds_join.c index af5ab2651c41d2ee457f26e057c27de822abaef9..7f967d481cd02ef58769319b8b3e17a103fb00ef 100644 --- a/lustre/mds/mds_join.c +++ b/lustre/mds/mds_join.c @@ -236,7 +236,6 @@ static void mds_finish_join(struct mds_obd *mds, struct ptlrpc_request *req, CDEBUG(D_INODE, "updating max_mdsize/max_cookiesize: %d/%d\n", mds->mds_max_mdsize, mds->mds_max_cookiesize); - mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); } @@ -260,7 +259,7 @@ static int mds_join_unlink_tail_inode(struct mds_update_record *rec, ldlm_lock_decref(lockh, LCK_EX); head_inode = dchild->d_inode; - mdc_pack_fid(&head_fid, head_inode->i_ino, head_inode->i_generation, + ll_pack_fid(&head_fid, head_inode->i_ino, head_inode->i_generation, head_inode->i_mode & S_IFMT); rc = mds_get_parents_children_locked(obd, mds, &join_rec->jr_fid, diff --git a/lustre/mds/mds_lib.c b/lustre/mds/mds_lib.c index 9a144a6a75d8fe0b51e44ff8ce3ed41a86f6f2b3..d39edb7e42a335f9a39beda965406bd83fd2934a 100644 --- a/lustre/mds/mds_lib.c +++ b/lustre/mds/mds_lib.c @@ -53,7 +53,7 @@ #include <lustre_lib.h> 
#include "mds_internal.h" -void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode) +static void mds_pack_inode2fid(struct ll_fid *fid, struct inode *inode) { fid->id = inode->i_ino; fid->generation = inode->i_generation; @@ -72,6 +72,7 @@ void mds_pack_inode2body(struct mds_body *b, struct inode *inode) b->valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLRDEV; + mds_pack_inode2fid(&b->fid1, inode); b->ino = inode->i_ino; b->atime = LTIME_S(inode->i_atime); b->mtime = LTIME_S(inode->i_mtime); @@ -413,7 +414,7 @@ int mds_update_unpack(struct ptlrpc_request *req, int offset, RETURN(-EFAULT); opcode = *opcodep; - if (lustre_msg_swabbed(req->rq_reqmsg)) + if (lustre_req_need_swab(req)) __swab32s(&opcode); if (opcode >= REINT_MAX || mds_unpackers[opcode] == NULL) { diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index f377629045709788118bfbe122a3500ed7ffe064..717b8e33de84312d3b359fab57790029cbe04148 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -140,7 +140,7 @@ void mds_lov_destroy_objids(struct obd_device *obd) } if (mds->mds_lov_objid_filp) { - rc = filp_close((struct file *)mds->mds_lov_objid_filp, 0); + rc = filp_close((struct file *)mds->mds_lov_objid_filp, NULL); mds->mds_lov_objid_filp = NULL; if (rc) CERROR("%s file won't close, rc=%d\n", LOV_OBJID, rc); @@ -235,7 +235,7 @@ out: int mds_lov_write_objids(struct obd_device *obd) { struct mds_obd *mds = &obd->u.mds; - int i, rc = 0; + int i = 0, rc = 0; ENTRY; if (cfs_bitmap_check_empty(mds->mds_lov_page_dirty)) @@ -252,8 +252,9 @@ int mds_lov_write_objids(struct obd_device *obd) /* check for particaly filled last page */ if (i == mds->mds_lov_objid_lastpage) - size = (mds->mds_lov_objid_lastidx + 1) * sizeof(obd_id); + size = (mds->mds_lov_objid_lastidx + 1) *sizeof(obd_id); + CDEBUG(D_INFO,"write %lld - %u\n", off, size); rc = fsfilt_write_record(obd, mds->mds_lov_objid_filp, data, size, &off, 0); if (rc < 0) @@ -352,7 +353,6 @@ static int 
mds_lov_set_one_nextid(struct obd_device * obd, __u32 idx, obd_id *id info.idx = idx; info.data = id; - rc = obd_set_info_async(mds->mds_osc_exp, sizeof(KEY_NEXT_ID), KEY_NEXT_ID, sizeof(info), &info, NULL); if (rc) @@ -897,7 +897,7 @@ out: CERROR("%s sync failed %d, deactivating\n", obd_uuid2str(uuid), rc); if (!obd->obd_stopping && mds->mds_osc_obd && - !mds->mds_osc_obd->obd_stopping && !watched->obd_stopping) + !mds->mds_osc_obd->obd_stopping && !watched->obd_stopping) obd_notify(mds->mds_osc_obd, watched, OBD_NOTIFY_INACTIVE, NULL); } else { diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index e879d0e8487a60530b0c8cb5a9929eac48fa7176..b51c1ebeec921abe584dbaaa783519fd8ab8ea09 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -439,8 +439,10 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, oinfo.oi_oa->o_fid = body->fid1.id; oinfo.oi_oa->o_generation = body->fid1.generation; oinfo.oi_oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER; + oinfo.oi_policy.l_extent.start = i_size_read(inode); + oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; - rc = obd_setattr_rqset(mds->mds_osc_exp, &oinfo, &oti); + rc = obd_punch_rqset(mds->mds_osc_exp, &oinfo, &oti); if (rc) { CERROR("error setting attrs for inode %lu: rc %d\n", inode->i_ino, rc); @@ -496,10 +498,11 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, struct lustre_handle *child_lockh) { struct mds_export_data *med = &req->rq_export->exp_mds_data; - struct mds_client_data *mcd = med->med_mcd; + struct lsd_client_data *lcd = med->med_lcd; struct mds_obd *mds = mds_req2mds(req); struct mds_file_data *mfd; - struct obd_device *obd = req->rq_export->exp_obd; + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; struct dentry *parent, *dchild; struct ldlm_reply *rep; struct mds_body *body; @@ -513,8 +516,8 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, body = lustre_msg_buf(req->rq_repmsg, 
DLM_REPLY_REC_OFF, sizeof(*body)); /* copy rc, transno and disp; steal locks */ - mds_req_from_mcd(req, mcd); - intent_set_disposition(rep, le32_to_cpu(mcd->mcd_last_data)); + mds_req_from_lcd(req, lcd); + intent_set_disposition(rep, le32_to_cpu(lcd->lcd_last_data)); /* Only replay if create or open actually happened. */ if (!intent_disposition(rep, DISP_OPEN_CREATE | DISP_OPEN_OPEN) ) { @@ -523,10 +526,30 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, } parent = mds_fid2dentry(mds, rec->ur_fid1, NULL); - LASSERT(!IS_ERR(parent)); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + LCONSOLE_WARN("Parent "LPU64"/%u lookup error %d." + " Evicting client %s with export %s.\n", + rec->ur_fid1->id, rec->ur_fid1->generation, rc, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp)); + mds_export_evict(exp); + EXIT; + return; + } - dchild = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1); - LASSERT(!IS_ERR(dchild)); + dchild = mds_lookup(obd, rec->ur_name, parent, rec->ur_namelen - 1); + if (IS_ERR(dchild)) { + rc = PTR_ERR(dchild); + LCONSOLE_WARN("Child "LPU64"/%u lookup error %d." 
+ " Evicting client %s with export %s.\n", + rec->ur_fid1->id, rec->ur_fid1->generation, rc, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp)); + mds_export_evict(exp); + EXIT; + return; + } if (!dchild->d_inode) GOTO(out_dput, 0); /* child not present to open */ @@ -540,7 +563,6 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, GOTO(out_dput, 0); } - mds_pack_inode2fid(&body->fid1, dchild->d_inode); mds_pack_inode2body(body, dchild->d_inode); if (S_ISREG(dchild->d_inode->i_mode)) { rc = mds_pack_md(obd, req->rq_repmsg, DLM_REPLY_REC_OFF + 1, @@ -617,11 +639,15 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, CERROR("Re-opened file \n"); mfd = mds_dentry_open(dchild, mds->mds_vfsmnt, rec->ur_flags & ~MDS_OPEN_TRUNC, req); - if (!mfd) { - CERROR("mds: out of memory\n"); - GOTO(out_dput, req->rq_status = -ENOMEM); + mntput(mds->mds_vfsmnt); + if (IS_ERR(mfd)) { + req->rq_status = PTR_ERR(mfd); + mfd = NULL; + CERROR("%s: opening inode "LPU64" failed: rc %d\n", + req->rq_export->exp_obd->obd_name, + (__u64)dchild->d_inode->i_ino, req->rq_status); + GOTO(out_dput, req->rq_status); } - put_child = 0; } else { body->handle.cookie = mfd->mfd_handle.h_cookie; CDEBUG(D_INODE, "resend mfd %p, cookie "LPX64"\n", mfd, @@ -754,6 +780,7 @@ static int mds_open_by_fid(struct ptlrpc_request *req, struct ll_fid *fid, struct mds_body *body, int flags, struct mds_update_record *rec,struct ldlm_reply *rep) { + struct obd_device *obd = req->rq_export->exp_obd; struct mds_obd *mds = mds_req2mds(req); struct dentry *dchild; char fidname[LL_FID_NAMELEN]; @@ -762,7 +789,7 @@ static int mds_open_by_fid(struct ptlrpc_request *req, struct ll_fid *fid, ENTRY; fidlen = ll_fid2str(fidname, fid->id, fid->generation); - dchild = ll_lookup_one_len(fidname, mds->mds_pending_dir, fidlen); + dchild = mds_lookup(obd, fidname, mds->mds_pending_dir, fidlen); if (IS_ERR(dchild)) { rc = PTR_ERR(dchild); CERROR("error looking up %s in 
PENDING: rc = %d\n",fidname, rc); @@ -783,7 +810,6 @@ static int mds_open_by_fid(struct ptlrpc_request *req, struct ll_fid *fid, RETURN(PTR_ERR(dchild)); } - mds_pack_inode2fid(&body->fid1, dchild->d_inode); mds_pack_inode2body(body, dchild->d_inode); intent_set_disposition(rep, DISP_LOOKUP_EXECD); intent_set_disposition(rep, DISP_LOOKUP_POS); @@ -989,8 +1015,8 @@ int mds_open(struct mds_update_record *rec, int offset, * refer to bug 13030. */ dchild = mds_fid2dentry(mds, rec->ur_fid1, NULL); } else { - dchild = ll_lookup_one_len(rec->ur_name, dparent, - rec->ur_namelen - 1); + dchild = mds_lookup(obd, rec->ur_name, dparent, + rec->ur_namelen - 1); } if (IS_ERR(dchild)) { rc = PTR_ERR(dchild); @@ -1093,7 +1119,6 @@ int mds_open(struct mds_update_record *rec, int offset, dchild->d_inode->i_ino, dchild->d_inode->i_generation); found_child: - mds_pack_inode2fid(&body->fid1, dchild->d_inode); mds_pack_inode2body(body, dchild->d_inode); if (S_ISREG(dchild->d_inode->i_mode)) { @@ -1278,8 +1303,7 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset, inode->i_nlink, mds_orphan_open_count(inode)); last_orphan = mds_orphan_open_dec_test(inode) && - mds_inode_is_orphan(inode); - MDS_UP_WRITE_ORPHAN_SEM(inode); + mds_inode_is_orphan(inode); /* this is half of the actual "close" */ if (mfd->mfd_mode & FMODE_WRITE) { @@ -1288,10 +1312,15 @@ int mds_mfd_close(struct ptlrpc_request *req, int offset, } else if (mfd->mfd_mode & MDS_FMODE_EXEC) { mds_allow_write_access(inode); } + /* here writecount change also needs protection from orphan write sem. + * so drop orphan write sem after mds_put_write_access, bz 12888. 
*/ + MDS_UP_WRITE_ORPHAN_SEM(inode); if (last_orphan && unlink_orphan) { int stripe_count = 0; - LASSERT(rc == 0); /* mds_put_write_access must have succeeded */ + /* mds_put_write_access must have succeeded */ + LASSERTF(rc == 0, "inode %lu/%u: rc %d", + inode->i_ino, inode->i_generation, rc); CDEBUG(D_INODE, "destroying orphan object %s\n", fidname); @@ -1492,7 +1521,6 @@ int mds_close(struct ptlrpc_request *req, int offset) sizeof(*body)); LASSERT(body != NULL); - mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); mds_pack_md(obd, req->rq_repmsg, REPLY_REC_OFF + 1, body, inode, MDS_PACK_MD_LOCK, 0); diff --git a/lustre/mds/mds_reint.c b/lustre/mds/mds_reint.c index f366f1282d95c5569ee1c4cd9512b2148419335b..8ed9e65e3c3bf93e4c2e5451638eed5cb3f33448 100644 --- a/lustre/mds/mds_reint.c +++ b/lustre/mds/mds_reint.c @@ -60,6 +60,38 @@ struct mds_logcancel_data { struct llog_cookie mlcd_cookies[0]; }; +/** lookup child dentry in parent dentry according to the name. + * if dentry is found, delete "lustre_mdt_attrs" EA (with name "lma") + * if it exists by checking OBD_INCOMPAT_FID. + */ +struct dentry *mds_lookup(struct obd_device *obd, const char *fid_name, + struct dentry *dparent, int fid_namelen) +{ + struct dentry *dchild; + struct lr_server_data *lsd = obd->u.mds.mds_server_data; + ENTRY; + + dchild = ll_lookup_one_len(fid_name, dparent, fid_namelen); + if (!IS_ERR(dchild) && + unlikely((lsd->lsd_feature_incompat & OBD_INCOMPAT_FID) || + OBD_FAIL_CHECK(OBD_FAIL_MDS_REMOVE_COMMON_EA))) { + struct inode *inode = dchild->d_inode; + void *handle; + if (inode != NULL) { + handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR, + NULL); + if (!IS_ERR(handle)) { + LOCK_INODE_MUTEX(inode); + fsfilt_set_md(obd, inode, handle, NULL, 0, + "lma"); + /* result is ignored.
*/ + UNLOCK_INODE_MUTEX(inode); + fsfilt_commit(obd, inode, handle, 0); + } + } + } + RETURN(dchild); +} static void mds_cancel_cookies_cb(struct obd_device *obd, __u64 transno, void *cb_data, int error) @@ -108,7 +140,7 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, int force_sync) { struct mds_export_data *med = &req->rq_export->exp_mds_data; - struct mds_client_data *mcd = med->med_mcd; + struct lsd_client_data *lcd = med->med_lcd; struct obd_device *obd = req->rq_export->exp_obd; __u64 transno, prev_transno; int err; @@ -167,20 +199,20 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, req->rq_transno = transno; lustre_msg_set_transno(req->rq_repmsg, transno); if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) { - prev_transno = le64_to_cpu(mcd->mcd_last_close_transno); - mcd->mcd_last_close_transno = cpu_to_le64(transno); - mcd->mcd_last_close_xid = cpu_to_le64(req->rq_xid); - mcd->mcd_last_close_result = cpu_to_le32(rc); - mcd->mcd_last_close_data = cpu_to_le32(op_data); + prev_transno = le64_to_cpu(lcd->lcd_last_close_transno); + lcd->lcd_last_close_transno = cpu_to_le64(transno); + lcd->lcd_last_close_xid = cpu_to_le64(req->rq_xid); + lcd->lcd_last_close_result = cpu_to_le32(rc); + lcd->lcd_last_close_data = cpu_to_le32(op_data); } else { - prev_transno = le64_to_cpu(mcd->mcd_last_transno); + prev_transno = le64_to_cpu(lcd->lcd_last_transno); if (((lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY)) == 0) || (transno > prev_transno)) { - mcd->mcd_last_transno = cpu_to_le64(transno); - mcd->mcd_last_xid = cpu_to_le64(req->rq_xid); - mcd->mcd_last_result = cpu_to_le32(rc); - mcd->mcd_last_data = cpu_to_le32(op_data); + lcd->lcd_last_transno = cpu_to_le64(transno); + lcd->lcd_last_xid = cpu_to_le64(req->rq_xid); + lcd->lcd_last_result = cpu_to_le32(rc); + lcd->lcd_last_data = cpu_to_le32(op_data); } } /* update the server data to not lose the greatest transno.
Bug 11125 */ @@ -198,8 +230,8 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, handle, mds_commit_cb, NULL); - err = fsfilt_write_record(obd, mds->mds_rcvd_filp, mcd, - sizeof(*mcd), &off, + err = fsfilt_write_record(obd, mds->mds_rcvd_filp, lcd, + sizeof(*lcd), &off, force_sync | exp->exp_need_sync); if (force_sync) mds_commit_cb(obd, transno, NULL, err); @@ -213,7 +245,7 @@ int mds_finish_transno(struct mds_obd *mds, struct inode *inode, void *handle, DEBUG_REQ(log_pri, req, "wrote trans #"LPU64" rc %d client %s at idx %u: err = %d", - transno, rc, mcd->mcd_uuid, med->med_lr_idx, err); + transno, rc, lcd->lcd_uuid, med->med_lr_idx, err); err = mds_lov_write_objids(obd); if (err) { @@ -380,17 +412,17 @@ void mds_steal_ack_locks(struct ptlrpc_request *req) spin_unlock(&exp->exp_lock); } -void mds_req_from_mcd(struct ptlrpc_request *req, struct mds_client_data *mcd) +void mds_req_from_lcd(struct ptlrpc_request *req, struct lsd_client_data *lcd) { if (lustre_msg_get_opc(req->rq_reqmsg) == MDS_CLOSE) { - req->rq_transno = le64_to_cpu(mcd->mcd_last_close_transno); + req->rq_transno = le64_to_cpu(lcd->lcd_last_close_transno); lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); - req->rq_status = le32_to_cpu(mcd->mcd_last_close_result); + req->rq_status = le32_to_cpu(lcd->lcd_last_close_result); lustre_msg_set_status(req->rq_repmsg, req->rq_status); } else { - req->rq_transno = le64_to_cpu(mcd->mcd_last_transno); + req->rq_transno = le64_to_cpu(lcd->lcd_last_transno); lustre_msg_set_transno(req->rq_repmsg, req->rq_transno); - req->rq_status = le32_to_cpu(mcd->mcd_last_result); + req->rq_status = le32_to_cpu(lcd->lcd_last_result); lustre_msg_set_status(req->rq_repmsg, req->rq_status); } DEBUG_REQ(D_RPCTRACE, req, "restoring transno "LPD64"/status %d", @@ -402,21 +434,29 @@ void mds_req_from_mcd(struct ptlrpc_request *req, struct mds_client_data *mcd) static void reconstruct_reint_setattr(struct mds_update_record *rec, int offset, 
struct ptlrpc_request *req) { - struct mds_export_data *med = &req->rq_export->exp_mds_data; - struct mds_obd *obd = &req->rq_export->exp_obd->u.mds; + struct obd_export *exp = req->rq_export; + struct mds_export_data *med = &exp->exp_mds_data; + struct mds_obd *obd = &exp->exp_obd->u.mds; struct dentry *de; struct mds_body *body; - mds_req_from_mcd(req, med->med_mcd); + mds_req_from_lcd(req, med->med_lcd); de = mds_fid2dentry(obd, rec->ur_fid1, NULL); if (IS_ERR(de)) { - LASSERT(PTR_ERR(de) == req->rq_status); + int rc; + rc = PTR_ERR(de); + LCONSOLE_WARN("FID "LPU64"/%u lookup error %d." + " Evicting client %s with export %s.\n", + rec->ur_fid1->id, rec->ur_fid1->generation, rc, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp)); + mds_export_evict(exp); + EXIT; return; } body = lustre_msg_buf(req->rq_repmsg, offset, sizeof(*body)); - mds_pack_inode2fid(&body->fid1, de->d_inode); mds_pack_inode2body(body, de->d_inode); /* Don't return OST-specific attributes if we didn't just set them */ @@ -659,7 +699,6 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, } body = lustre_msg_buf(req->rq_repmsg, offset, sizeof(*body)); - mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); /* don't return OST-specific attributes if we didn't just set them. 
*/ @@ -737,23 +776,45 @@ static int mds_reint_setattr(struct mds_update_record *rec, int offset, static void reconstruct_reint_create(struct mds_update_record *rec, int offset, struct ptlrpc_request *req) { - struct mds_export_data *med = &req->rq_export->exp_mds_data; - struct mds_obd *obd = &req->rq_export->exp_obd->u.mds; + struct obd_export *exp = req->rq_export; + struct mds_export_data *med = &exp->exp_mds_data; + struct mds_obd *obd = &exp->exp_obd->u.mds; struct dentry *parent, *child; struct mds_body *body; + int rc; - mds_req_from_mcd(req, med->med_mcd); + mds_req_from_lcd(req, med->med_lcd); if (req->rq_status) return; parent = mds_fid2dentry(obd, rec->ur_fid1, NULL); - LASSERT(!IS_ERR(parent)); - child = ll_lookup_one_len(rec->ur_name, parent, rec->ur_namelen - 1); - LASSERT(!IS_ERR(child)); + if (IS_ERR(parent)) { + rc = PTR_ERR(parent); + LCONSOLE_WARN("Parent "LPU64"/%u lookup error %d." + " Evicting client %s with export %s.\n", + rec->ur_fid1->id, rec->ur_fid1->generation, rc, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp)); + mds_export_evict(exp); + EXIT; + return; + } + child = mds_lookup(exp->exp_obd, rec->ur_name, parent, + rec->ur_namelen - 1); + if (IS_ERR(child)) { + rc = PTR_ERR(child); + LCONSOLE_WARN("Child "LPU64"/%u lookup error %d." 
+ " Evicting client %s with export %s.\n", + rec->ur_fid1->id, rec->ur_fid1->generation, rc, + obd_uuid2str(&exp->exp_client_uuid), + obd_export_nid2str(exp)); + mds_export_evict(exp); + EXIT; + return; + } body = lustre_msg_buf(req->rq_repmsg, offset, sizeof(*body)); - mds_pack_inode2fid(&body->fid1, child->d_inode); mds_pack_inode2body(body, child->d_inode); l_dput(parent); @@ -813,7 +874,7 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, ldlm_lock_dump_handle(D_OTHER, &lockh); - dchild = ll_lookup_one_len(rec->ur_name, dparent, rec->ur_namelen - 1); + dchild = mds_lookup(obd, rec->ur_name, dparent, rec->ur_namelen - 1); if (IS_ERR(dchild)) { rc = PTR_ERR(dchild); CDEBUG(D_DENTRY, "child lookup error %d\n", rc); @@ -954,7 +1015,6 @@ static int mds_reint_create(struct mds_update_record *rec, int offset, } body = lustre_msg_buf(req->rq_repmsg, offset, sizeof(*body)); - mds_pack_inode2fid(&body->fid1, inode); mds_pack_inode2body(body, inode); } EXIT; @@ -1368,7 +1428,7 @@ int mds_get_parent_child_locked(struct obd_device *obd, struct mds_obd *mds, cleanup_phase = 1; /* parent dentry */ /* Step 2: Lookup child (without DLM lock, to get resource name) */ - *dchildp = ll_lookup_one_len(name, *dparentp, namelen - 1); + *dchildp = mds_lookup(obd, name, *dparentp, namelen - 1); if (IS_ERR(*dchildp)) { rc = PTR_ERR(*dchildp); CDEBUG(D_INODE, "child lookup error %d\n", rc); @@ -1451,7 +1511,7 @@ void mds_reconstruct_generic(struct ptlrpc_request *req) { struct mds_export_data *med = &req->rq_export->exp_mds_data; - mds_req_from_mcd(req, med->med_mcd); + mds_req_from_lcd(req, med->med_lcd); } /* If we are unlinking an open file/dir (i.e. 
creating an orphan) then @@ -1666,7 +1726,6 @@ static int mds_reint_unlink(struct mds_update_record *rec, int offset, LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); cleanup_phase = 5; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */ } else if (S_ISREG(child_inode->i_mode)) { - mds_pack_inode2fid(&body->fid1, child_inode); mds_pack_inode2body(body, child_inode); mds_pack_md(obd, req->rq_repmsg, offset + 1, body, child_inode, MDS_PACK_MD_LOCK, 0); @@ -1759,8 +1818,8 @@ cleanup: rc = mds_finish_transno(mds, dparent ? dparent->d_inode : NULL, handle, req, rc, 0, 0); if (!rc) - (void)obd_set_info_async(mds->mds_osc_exp, strlen("unlinked"), - "unlinked", 0, NULL, NULL); + (void)obd_set_info_async(mds->mds_osc_exp, sizeof(KEY_UNLINKED), + KEY_UNLINKED, 0, NULL, NULL); switch(cleanup_phase) { case 5: /* pending_dir semaphore */ UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); @@ -1878,7 +1937,7 @@ static int mds_reint_link(struct mds_update_record *rec, int offset, } /* Step 3: Lookup the child */ - dchild = ll_lookup_one_len(rec->ur_name, de_tgt_dir, rec->ur_namelen-1); + dchild = mds_lookup(obd, rec->ur_name, de_tgt_dir, rec->ur_namelen-1); if (IS_ERR(dchild)) { rc = PTR_ERR(dchild); if (rc != -EPERM && rc != -EACCES && rc != -ENAMETOOLONG) @@ -2017,7 +2076,7 @@ int mds_get_parents_children_locked(struct obd_device *obd, p2_res_id.name[1] = (*de_tgtdirp)->d_inode->i_generation; /* Step 3: Lookup the source child entry */ - *de_oldp = ll_lookup_one_len(old_name, *de_srcdirp, old_len - 1); + *de_oldp = mds_lookup(obd, old_name, *de_srcdirp, old_len - 1); if (IS_ERR(*de_oldp)) { rc = PTR_ERR(*de_oldp); CDEBUG(D_INODE, "old child lookup error (%.*s): rc %d\n", @@ -2041,7 +2100,7 @@ int mds_get_parents_children_locked(struct obd_device *obd, /* Step 4: Lookup the target child entry */ if (!new_name) GOTO(retry_locks, rc); - *de_newp = ll_lookup_one_len(new_name, *de_tgtdirp, new_len - 1); + *de_newp = mds_lookup(obd, new_name, *de_tgtdirp, new_len - 1); if 
(IS_ERR(*de_newp)) { rc = PTR_ERR(*de_newp); CDEBUG(D_DENTRY, "new child lookup error (%.*s): rc %d\n", @@ -2052,8 +2111,14 @@ int mds_get_parents_children_locked(struct obd_device *obd, cleanup_phase = 4; /* target dentry */ inode = (*de_newp)->d_inode; - if (inode != NULL) + if (inode != NULL) { + if (is_bad_inode(inode)) { + CERROR("bad inode returned %lu/%u\n", + inode->i_ino, inode->i_generation); + GOTO(cleanup, rc = -ENOENT); + } inode = igrab(inode); + } if (inode == NULL) goto retry_locks; @@ -2067,8 +2132,6 @@ retry_locks: maxres_tgt = &p2_res_id; cleanup_phase = 4; /* target dentry */ - if (c1_res_id.name[0] != 0 && res_gt(&c1_res_id, &p1_res_id,NULL,NULL)) - maxres_src = &c1_res_id; if (c2_res_id.name[0] != 0 && res_gt(&c2_res_id, &p2_res_id,NULL,NULL)) maxres_tgt = &c2_res_id; @@ -2105,6 +2168,11 @@ retry_locks: if (!new_name) GOTO(cleanup, rc); + + /* Safe to skip check for child res being all zero */ + if (res_gt(&c1_res_id, maxres_src, NULL, NULL)) + maxres_src = &c1_res_id; + /* Step 6b: Re-lookup target child to verify it hasn't changed */ rc = mds_verify_child(obd, &p2_res_id, &dlm_handles[1], *de_tgtdirp, parent_mode, &c2_res_id, &dlm_handles[3], de_newp, @@ -2242,7 +2310,6 @@ static int mds_reint_rename(struct mds_update_record *rec, int offset, LOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); cleanup_phase = 4; /* UNLOCK_INODE_MUTEX(mds->mds_pending_dir->d_inode); */ } else if (S_ISREG(new_inode->i_mode)) { - mds_pack_inode2fid(&body->fid1, new_inode); mds_pack_inode2body(body, new_inode); mds_pack_md(obd, req->rq_repmsg, offset + 1, body, new_inode, MDS_PACK_MD_LOCK, 0); diff --git a/lustre/mgc/libmgc.c b/lustre/mgc/libmgc.c index b72e8bbc6267ce45bbcd4d56ee8f3fd99000d667..0d5924abf74a4d6deffa20f9c7b5e9fbaf3b7150 100644 --- a/lustre/mgc/libmgc.c +++ b/lustre/mgc/libmgc.c @@ -73,12 +73,12 @@ static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) switch (stage) { case OBD_CLEANUP_EARLY: case OBD_CLEANUP_EXPORTS: - break; 
- case OBD_CLEANUP_SELF_EXP: rc = obd_llog_finish(obd, 0); if (rc != 0) CERROR("failed to cleanup llogging subsystems\n"); break; + case OBD_CLEANUP_SELF_EXP: + break; case OBD_CLEANUP_OBD: break; } @@ -111,7 +111,7 @@ static int mgc_llog_init(struct obd_device *obd, struct obd_device *tgt, &llog_client_ops); if (rc == 0) { ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); - ctxt->loc_imp = obd->u.cli.cl_import; + llog_initiator_connect(ctxt); llog_ctxt_put(ctxt); } diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index e1ab586eb1c925e9fe80138b949207bf631e2c2d..660229daa3937dfbc758a9d663d84089f8b60704 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -469,12 +469,12 @@ static int mgc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) spin_unlock(&config_list_lock); cfs_waitq_signal(&rq_waitq); } - break; - case OBD_CLEANUP_SELF_EXP: rc = obd_llog_finish(obd, 0); if (rc != 0) CERROR("failed to cleanup llogging subsystems\n"); break; + case OBD_CLEANUP_SELF_EXP: + break; case OBD_CLEANUP_OBD: break; } @@ -828,7 +828,7 @@ int mgc_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(0); } /* FIXME move this to mgc_process_config */ - if (KEY_IS("register_target")) { + if (KEY_IS(KEY_REGISTER_TARGET)) { struct mgs_target_info *mti; if (vallen != sizeof(struct mgs_target_info)) RETURN(-EINVAL); @@ -838,7 +838,7 @@ int mgc_set_info_async(struct obd_export *exp, obd_count keylen, rc = mgc_target_register(exp, mti); RETURN(rc); } - if (KEY_IS("set_fs")) { + if (KEY_IS(KEY_SET_FS)) { struct super_block *sb = (struct super_block *)val; struct lustre_sb_info *lsi; if (vallen != sizeof(struct super_block)) @@ -850,7 +850,7 @@ int mgc_set_info_async(struct obd_export *exp, obd_count keylen, } RETURN(rc); } - if (KEY_IS("clear_fs")) { + if (KEY_IS(KEY_CLEAR_FS)) { if (vallen != 0) RETURN(-EINVAL); rc = mgc_fs_cleanup(exp->exp_obd); @@ -921,7 +921,7 @@ static int mgc_llog_init(struct obd_device *obd, struct 
obd_device *tgt, &llog_client_ops); if (rc == 0) { ctxt = llog_get_context(obd, LLOG_CONFIG_REPL_CTXT); - ctxt->loc_imp = obd->u.cli.cl_import; + llog_initiator_connect(ctxt); llog_ctxt_put(ctxt); } diff --git a/lustre/mgs/mgs_fs.c b/lustre/mgs/mgs_fs.c index 51abb0e03fc44a7828818d3786c7c5e6baa5788a..4e8033d6f5ee876fafec8cd5c7ffbf81bb4451ce 100644 --- a/lustre/mgs/mgs_fs.c +++ b/lustre/mgs/mgs_fs.c @@ -45,9 +45,9 @@ static int mgs_export_stats_init(struct obd_device *obd, struct obd_export *exp) { - int rc, num_stats, newnid; + int rc, num_stats, newnid = 0; - rc = lprocfs_exp_setup(exp, 0, &newnid); + rc = lprocfs_exp_setup(exp, NULL, &newnid); if (rc) return rc; diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c index 6b7ba3ad496b54bca7ce0447b2b2a00fd4bc5552..c67e48f2fd0d33e5af45a8a645de5e50ebf7ff7e 100644 --- a/lustre/mgs/mgs_handler.c +++ b/lustre/mgs/mgs_handler.c @@ -145,7 +145,7 @@ static int mgs_setup(struct obd_device *obd, obd_count len, void *buf) GOTO(err_put, rc = PTR_ERR(obd->obd_fsops)); /* namespace for mgs llog */ - obd->obd_namespace = ldlm_namespace_new("MGS", LDLM_NAMESPACE_SERVER, + obd->obd_namespace = ldlm_namespace_new(obd, "MGS", LDLM_NAMESPACE_SERVER, LDLM_NAMESPACE_MODEST); if (obd->obd_namespace == NULL) GOTO(err_ops, rc = -ENOMEM); @@ -212,7 +212,7 @@ err_fs: /* No extra cleanup needed for llog_init_commit_thread() */ mgs_fs_cleanup(obd); err_ns: - ldlm_namespace_free(obd->obd_namespace, 0); + ldlm_namespace_free(obd->obd_namespace, NULL, 0); obd->obd_namespace = NULL; err_ops: fsfilt_put_ops(obd->obd_fsops); @@ -241,17 +241,6 @@ static int mgs_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) RETURN(rc); } -static int mgs_ldlm_nsfree(void *data) -{ - struct ldlm_namespace *ns = (struct ldlm_namespace *)data; - int rc; - ENTRY; - - ptlrpc_daemonize("ll_mgs_nsfree"); - rc = ldlm_namespace_free(ns, 1 /* obd_force should always be on */); - RETURN(rc); -} - static int mgs_cleanup(struct obd_device *obd) { 
struct mgs_obd *mgs = &obd->u.mgs; @@ -271,12 +260,8 @@ static int mgs_cleanup(struct obd_device *obd) server_put_mount(obd->obd_name, mgs->mgs_vfsmnt); mgs->mgs_sb = NULL; - /* Free the namespace in it's own thread, so that if the - ldlm_cancel_handler put the last mgs obd ref, we won't - deadlock here. */ - cfs_kernel_thread(mgs_ldlm_nsfree, obd->obd_namespace, - CLONE_VM | CLONE_FILES); - + ldlm_namespace_free(obd->obd_namespace, NULL, 1); + obd->obd_namespace = NULL; fsfilt_put_ops(obd->obd_fsops); LCONSOLE_INFO("%s has stopped.\n", obd->obd_name); diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c index 8917d28b92c5552ecf2a71937977f8cd7632ba33..2134bd8a5a1e60322c3c59eee09b686d9b95a75f 100644 --- a/lustre/mgs/mgs_llog.c +++ b/lustre/mgs/mgs_llog.c @@ -476,12 +476,14 @@ int mgs_set_index(struct obd_device *obd, struct mgs_target_info *mti) INDEX_MAP_SIZE * 8); RETURN(-ERANGE); } - + if (test_bit(mti->mti_stripe_index, imap)) { - if (mti->mti_flags & LDD_F_VIRGIN) { + if ((mti->mti_flags & LDD_F_VIRGIN) && + !(mti->mti_flags & LDD_F_WRITECONF)) { LCONSOLE_ERROR_MSG(0x140, "Server %s requested index " "%d, but that index is already in " - "use\n", mti->mti_svname, + "use. 
Use --writeconf to force\n", + mti->mti_svname, mti->mti_stripe_index); RETURN(-EADDRINUSE); } else { diff --git a/lustre/obdclass/class_hash.c b/lustre/obdclass/class_hash.c index c5785642517089791f5ba2d51088572c864c415e..880cd2bdec748e59b68a64819a48c7b14ba3bffb 100644 --- a/lustre/obdclass/class_hash.c +++ b/lustre/obdclass/class_hash.c @@ -669,7 +669,7 @@ void nidstats_refcount_put(struct hlist_node * actual_hnode) data = hlist_entry(actual_hnode, struct nid_stat, nid_hash); data->nid_exp_ref_count--; - + EXIT; } /*******************************************************************************/ diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 0ada5c989da2246bbe479588e1833720461e10ba..ef4cc0ecdb6658c44eac91ebf7e55b253c43b565 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -490,7 +490,7 @@ int obd_init_checks(void) } if ((u64val & ~CFS_PAGE_MASK) >= CFS_PAGE_SIZE) { CWARN("mask failed: u64val "LPU64" >= %lu\n", u64val, - CFS_PAGE_SIZE); + (unsigned long)CFS_PAGE_SIZE); ret = -EINVAL; } @@ -610,6 +610,7 @@ static void cleanup_obdclass(void) class_handle_cleanup(); class_exit_uuidlist(); + obd_zombie_impexp_stop(); memory_leaked = obd_memory_sum(); pages_leaked = obd_pages_sum(); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index 63d52c26b54f4332b4b6b37baf3f7c3a72c66760..9e9a1a2f11da2f18c6f7318449b81f66c8eefaba 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -45,9 +45,7 @@ cfs_mem_cache_t *import_cachep; struct list_head obd_zombie_imports; struct list_head obd_zombie_exports; spinlock_t obd_zombie_impexp_lock; -void (*obd_zombie_impexp_notify)(void) = NULL; -EXPORT_SYMBOL(obd_zombie_impexp_notify); - +static void obd_zombie_impexp_notify(void); int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); @@ -70,8 +68,14 @@ EXPORT_SYMBOL(obd_device_alloc); static void obd_device_free(struct obd_device *obd) { LASSERT(obd != NULL); - LASSERTF(obd->obd_magic == 
OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", - obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic " + "%08x != %08x\n", obd, obd->obd_magic, OBD_DEVICE_MAGIC); + if (obd->obd_namespace != NULL) { + CERROR("obd %p: namespace %p was not properly cleaned up " + "(obd_force=%d)!\n", + obd, obd->obd_namespace, obd->obd_force); + LBUG(); + } OBD_SLAB_FREE_PTR(obd, obd_device_cachep); } EXPORT_SYMBOL(obd_device_free); @@ -100,7 +104,7 @@ struct obd_type *class_get_type(const char *name) #ifdef CONFIG_KMOD if (!type) { const char *modname = name; - if (strcmp(modname, LUSTRE_MDT_NAME) == 0) + if (strcmp(modname, LUSTRE_MDT_NAME) == 0) modname = LUSTRE_MDS_NAME; if (!request_module(modname)) { CDEBUG(D_INFO, "Loaded module '%s'\n", modname); @@ -200,7 +204,7 @@ int class_unregister_type(const char *name) RETURN(-EBUSY); } - if (type->typ_procroot) + if (type->typ_procroot) lprocfs_remove(&type->typ_procroot); spin_lock(&obd_types_lock); @@ -233,7 +237,7 @@ struct obd_device *class_newdev(const char *type_name, const char *name) } newdev = obd_device_alloc(); - if (newdev == NULL) { + if (newdev == NULL) { class_put_type(type); RETURN(ERR_PTR(-ENOMEM)); } @@ -268,13 +272,13 @@ struct obd_device *class_newdev(const char *type_name, const char *name) } } spin_unlock(&obd_dev_lock); - + if (result == NULL && i >= class_devno_max()) { CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", class_devno_max()); result = ERR_PTR(-EOVERFLOW); } - + if (IS_ERR(result)) { obd_device_free(newdev); class_put_type(type); @@ -519,7 +523,7 @@ int obd_init_caches(void) LASSERT(obd_device_cachep == NULL); obd_device_cachep = cfs_mem_cache_create("ll_obd_dev_cache", - sizeof(struct obd_device), + sizeof(struct obd_device), 0, 0); if (!obd_device_cachep) GOTO(out, -ENOMEM); @@ -613,13 +617,12 @@ void __class_export_put(struct obd_export *exp) CDEBUG(D_IOCTL, "final put %p/%s\n", exp, exp->exp_client_uuid.uuid); - + 
spin_lock(&obd_zombie_impexp_lock); list_add(&exp->exp_obd_chain, &obd_zombie_exports); spin_unlock(&obd_zombie_impexp_lock); - if (obd_zombie_impexp_notify != NULL) - obd_zombie_impexp_notify(); + obd_zombie_impexp_notify(); } } EXPORT_SYMBOL(__class_export_put); @@ -640,6 +643,7 @@ void class_export_destroy(struct obd_export *exp) ptlrpc_put_connection_superhack(exp->exp_connection); LASSERT(list_empty(&exp->exp_outstanding_replies)); + LASSERT(list_empty(&exp->exp_req_replay_queue)); obd_destroy_export(exp); OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle); @@ -664,6 +668,7 @@ struct obd_export *class_new_export(struct obd_device *obd, atomic_set(&export->exp_rpc_count, 0); export->exp_obd = obd; CFS_INIT_LIST_HEAD(&export->exp_outstanding_replies); + CFS_INIT_LIST_HEAD(&export->exp_req_replay_queue); /* XXX this should be in LDLM init */ CFS_INIT_LIST_HEAD(&export->exp_ldlm_data.led_held_locks); spin_lock_init(&export->exp_ldlm_data.led_lock); @@ -679,7 +684,7 @@ struct obd_export *class_new_export(struct obd_device *obd, obd_init_export(export); if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { - rc = lustre_hash_additem_unique(obd->obd_uuid_hash_body, cluuid, + rc = lustre_hash_additem_unique(obd->obd_uuid_hash_body, cluuid, &export->exp_uuid_hash); if (rc != 0) { CWARN("%s: denying duplicate export for %s\n", @@ -710,7 +715,7 @@ void class_unlink_export(struct obd_export *exp) spin_lock(&exp->exp_obd->obd_dev_lock); /* delete an uuid-export hashitem from hashtables */ if (!hlist_unhashed(&exp->exp_uuid_hash)) { - lustre_hash_delitem(exp->exp_obd->obd_uuid_hash_body, + lustre_hash_delitem(exp->exp_obd->obd_uuid_hash_body, &exp->exp_client_uuid, &exp->exp_uuid_hash); } list_del_init(&exp->exp_obd_chain); @@ -753,22 +758,22 @@ void class_import_put(struct obd_import *import) if (atomic_dec_and_test(&import->imp_refcount)) { CDEBUG(D_INFO, "final put import %p\n", import); - + spin_lock(&obd_zombie_impexp_lock); list_add(&import->imp_zombie_chain, 
&obd_zombie_imports); spin_unlock(&obd_zombie_impexp_lock); - if (obd_zombie_impexp_notify != NULL) - obd_zombie_impexp_notify(); + obd_zombie_impexp_notify(); } EXIT; } +EXPORT_SYMBOL(class_import_put); void class_import_destroy(struct obd_import *import) { ENTRY; - + CDEBUG(D_IOCTL, "destroying import %p\n", import); LASSERT(atomic_read(&import->imp_refcount) == 0); @@ -789,7 +794,6 @@ void class_import_destroy(struct obd_import *import) OBD_FREE_RCU(import, sizeof(*import), &import->imp_handle); EXIT; } -EXPORT_SYMBOL(class_import_put); static void init_imp_at(struct imp_at *at) { int i; @@ -830,6 +834,10 @@ struct obd_import *class_new_import(struct obd_device *obd) class_handle_hash(&imp->imp_handle, import_handle_addref); init_imp_at(&imp->imp_at); +/* b1_8 supports both v1 & v2. but HEAD only supports v2. + * So let's use v2. + */ +#define HAVE_DEFAULT_V2_CONNECT 1 #ifdef HAVE_DEFAULT_V2_CONNECT /* the default magic is V2, will be used in connect RPC, and * then adjusted according to the flags in request/reply. 
*/ @@ -1205,7 +1213,7 @@ char *obd_export_nid2str(struct obd_export *exp) { if (exp->exp_connection != NULL) return libcfs_nid2str(exp->exp_connection->c_peer.nid); - + return "(no nid)"; } EXPORT_SYMBOL(obd_export_nid2str); @@ -1228,7 +1236,7 @@ int obd_export_evict_by_nid(struct obd_device *obd, char *nid) "nid %s found, wanted nid %s, requested nid %s\n", obd_export_nid2str(doomed_exp), libcfs_nid2str(nid_key), nid); - + exports_evicted++; CDEBUG(D_HA, "%s: evict NID '%s' (%s) #%d at adminstrative request\n", obd->obd_name, nid, doomed_exp->exp_client_uuid.uuid, @@ -1256,7 +1264,7 @@ int obd_export_evict_by_uuid(struct obd_device *obd, char *uuid) return exports_evicted; } - doomed_exp = lustre_hash_get_object_by_key(obd->obd_uuid_hash_body, + doomed_exp = lustre_hash_get_object_by_key(obd->obd_uuid_hash_body, &doomed); if (doomed_exp == NULL) { @@ -1274,11 +1282,11 @@ int obd_export_evict_by_uuid(struct obd_device *obd, char *uuid) } EXPORT_SYMBOL(obd_export_evict_by_uuid); -void obd_zombie_impexp_cull(void) +void obd_zombie_impexp_cull(void) { struct obd_import *import; struct obd_export *export; - + do { spin_lock (&obd_zombie_impexp_lock); @@ -1289,7 +1297,7 @@ void obd_zombie_impexp_cull(void) imp_zombie_chain); list_del(&import->imp_zombie_chain); } - + export = NULL; if (!list_empty(&obd_zombie_exports)) { export = list_entry(obd_zombie_exports.next, @@ -1299,7 +1307,7 @@ void obd_zombie_impexp_cull(void) } spin_unlock(&obd_zombie_impexp_lock); - + if (import != NULL) class_import_destroy(import); @@ -1308,11 +1316,121 @@ void obd_zombie_impexp_cull(void) } while (import != NULL || export != NULL); } -EXPORT_SYMBOL(obd_zombie_impexp_cull); -void obd_zombie_impexp_init(void) +static struct completion obd_zombie_start; +static struct completion obd_zombie_stop; +static unsigned long obd_zombie_flags; +static cfs_waitq_t obd_zombie_waitq; + +enum { + OBD_ZOMBIE_STOP = 1 +}; + +int obd_zombi_impexp_check(void *arg) +{ + int rc; + + 
spin_lock(&obd_zombie_impexp_lock); + rc = list_empty(&obd_zombie_imports) && + list_empty(&obd_zombie_exports) && + !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + + spin_unlock(&obd_zombie_impexp_lock); + + RETURN(rc); +} + +static void obd_zombie_impexp_notify(void) +{ + cfs_waitq_signal(&obd_zombie_waitq); +} + +#ifdef __KERNEL__ + +static int obd_zombie_impexp_thread(void *unused) +{ + int rc; + + if ((rc = cfs_daemonize_ctxt("obd_zombid"))) { + complete(&obd_zombie_start); + RETURN(rc); + } + + complete(&obd_zombie_start); + + while(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { + struct l_wait_info lwi = { 0 }; + + l_wait_event(obd_zombie_waitq, !obd_zombi_impexp_check(NULL), &lwi); + + obd_zombie_impexp_cull(); + } + + complete(&obd_zombie_stop); + + RETURN(0); +} + +#else /* ! KERNEL */ + +static atomic_t zombi_recur = ATOMIC_INIT(0); +static void *obd_zombi_impexp_work_cb; +static void *obd_zombi_impexp_idle_cb; + +int obd_zombi_impexp_kill(void *arg) { + int rc = 0; + + if (atomic_inc_return(&zombi_recur) == 1) { + obd_zombie_impexp_cull(); + rc = 1; + } + atomic_dec(&zombi_recur); + return rc; +} + +#endif + +int obd_zombie_impexp_init(void) +{ + int rc; + CFS_INIT_LIST_HEAD(&obd_zombie_imports); CFS_INIT_LIST_HEAD(&obd_zombie_exports); spin_lock_init(&obd_zombie_impexp_lock); + init_completion(&obd_zombie_start); + init_completion(&obd_zombie_stop); + cfs_waitq_init(&obd_zombie_waitq); + +#ifdef __KERNEL__ + rc = cfs_kernel_thread(obd_zombie_impexp_thread, NULL, 0); + if (rc < 0) + RETURN(rc); + + wait_for_completion(&obd_zombie_start); +#else + + obd_zombi_impexp_work_cb = + liblustre_register_wait_callback("obd_zombi_impexp_kill", + &obd_zombi_impexp_kill, NULL); + + obd_zombi_impexp_idle_cb = + liblustre_register_idle_callback("obd_zombi_impexp_check", + &obd_zombi_impexp_check, NULL); + rc = 0; + +#endif + RETURN(rc); +} + +void obd_zombie_impexp_stop(void) +{ + set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + obd_zombie_impexp_notify(); +#ifdef 
__KERNEL__ + wait_for_completion(&obd_zombie_stop); +#else + liblustre_deregister_wait_callback(obd_zombi_impexp_work_cb); + liblustre_deregister_idle_callback(obd_zombi_impexp_idle_cb); +#endif } diff --git a/lustre/obdclass/llog.c b/lustre/obdclass/llog.c index 977b489367f4a9b38351ce9bccab8af634398e84..b95e30acb119484de7ca2cd8c42b7efe261eae93 100644 --- a/lustre/obdclass/llog.c +++ b/lustre/obdclass/llog.c @@ -42,6 +42,7 @@ #include <obd_class.h> #include <lustre_log.h> #include <libcfs/list.h> +#include "llog_internal.h" /* Allocate a new log or catalog handle */ struct llog_handle *llog_alloc_handle(void) @@ -204,22 +205,30 @@ int llog_close(struct llog_handle *loghandle) } EXPORT_SYMBOL(llog_close); -int llog_process(struct llog_handle *loghandle, llog_cb_t cb, - void *data, void *catdata) +static int llog_process_thread(void *arg) { - struct llog_log_hdr *llh = loghandle->lgh_hdr; - struct llog_process_cat_data *cd = catdata; - char *buf; - __u64 cur_offset = LLOG_CHUNK_SIZE, last_offset; - int rc = 0, index = 1, last_index; - int saved_index = 0, last_called_index = 0; - ENTRY; + struct llog_process_info *lpi = (struct llog_process_info *)arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + char *buf; + __u64 cur_offset = LLOG_CHUNK_SIZE; + __u64 last_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0, last_called_index = 0; LASSERT(llh); OBD_ALLOC(buf, LLOG_CHUNK_SIZE); - if (!buf) - RETURN(-ENOMEM); + if (!buf) { + lpi->lpi_rc = -ENOMEM; +#ifdef __KERNEL__ + complete(&lpi->lpi_completion); +#endif + return 0; + } + + cfs_daemonize_ctxt("llog_process_thread"); if (cd != NULL) { last_called_index = cd->first_idx; @@ -267,7 +276,7 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", rec->lrh_type, rec->lrh_index); - + if (rec->lrh_index == 0) GOTO(out, 0); /* no more 
records */ @@ -284,18 +293,19 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, continue; } - CDEBUG(D_OTHER, + CDEBUG(D_OTHER, "lrh_index: %d lrh_len: %d (%d remains)\n", rec->lrh_index, rec->lrh_len, (int)(buf + LLOG_CHUNK_SIZE - (char *)rec)); - loghandle->lgh_cur_idx = rec->lrh_index; + loghandle->lgh_cur_idx = rec->lrh_index; loghandle->lgh_cur_offset = (char *)rec - (char *)buf + - last_offset; + last_offset; /* if set, process the callback on this record */ if (ext2_test_bit(index, llh->llh_bitmap)) { - rc = cb(loghandle, rec, data); + rc = lpi->lpi_cb(loghandle, rec, + lpi->lpi_cbdata); last_called_index = index; if (rc == LLOG_PROC_BREAK) { CDEBUG(D_HA, "recovery from log: "LPX64 @@ -304,7 +314,8 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, loghandle->lgh_id.lgl_ogen); GOTO(out, rc); } else if (rc == LLOG_DEL_RECORD) { - llog_cancel_rec(loghandle, rec->lrh_index); + llog_cancel_rec(loghandle, + rec->lrh_index); rc = 0; } if (rc) @@ -325,6 +336,44 @@ int llog_process(struct llog_handle *loghandle, llog_cb_t cb, cd->last_idx = last_called_index; if (buf) OBD_FREE(buf, LLOG_CHUNK_SIZE); + lpi->lpi_rc = rc; +#ifdef __KERNEL__ + complete(&lpi->lpi_completion); +#endif + return 0; +} + +int llog_process(struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_process_info *lpi; + int rc; + ENTRY; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + RETURN(-ENOMEM); + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + +#ifdef __KERNEL__ + init_completion(&lpi->lpi_completion); + rc = cfs_kernel_thread(llog_process_thread, lpi, CLONE_VM | CLONE_FILES); + if (rc < 0) { + CERROR("cannot start thread: %d\n", rc); + OBD_FREE_PTR(lpi); + RETURN(rc); + } + wait_for_completion(&lpi->lpi_completion); +#else + llog_process_thread(lpi); +#endif + rc = lpi->lpi_rc; + OBD_FREE_PTR(lpi); RETURN(rc); } 
EXPORT_SYMBOL(llog_process); diff --git a/lustre/obdclass/llog_internal.h b/lustre/obdclass/llog_internal.h index d4be19c2820ce86cfe9d869b8c05fc53cf9d62b4..82bb2e3e992a1d034d34301938f761e0c90a4a18 100644 --- a/lustre/obdclass/llog_internal.h +++ b/lustre/obdclass/llog_internal.h @@ -1,6 +1,17 @@ #ifndef __LLOG_INTERNAL_H__ #define __LLOG_INTERNAL_H__ +#include <lustre_log.h> + +struct llog_process_info { + struct llog_handle *lpi_loghandle; + llog_cb_t lpi_cb; + void *lpi_cbdata; + void *lpi_catdata; + int lpi_rc; + struct completion lpi_completion; +}; + int llog_put_cat_list(struct obd_device *obd, struct obd_device *disk_obd, char *name, int count, struct llog_catid *idarray); int llog_cat_id2handle(struct llog_handle *cathandle, struct llog_handle **res, diff --git a/lustre/obdclass/llog_obd.c b/lustre/obdclass/llog_obd.c index c9e9a95d1ae99e150b6fa384fabdc7d0864082f6..599e6fcbf20c85884dd69f3eb8f32f0d8a634c44 100644 --- a/lustre/obdclass/llog_obd.c +++ b/lustre/obdclass/llog_obd.c @@ -56,6 +56,10 @@ static void llog_ctxt_destroy(struct llog_ctxt *ctxt) { if (ctxt->loc_exp) class_export_put(ctxt->loc_exp); + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } OBD_FREE(ctxt, sizeof(*ctxt)); return; } @@ -147,8 +151,12 @@ int llog_setup(struct obd_device *obd, int index, struct obd_device *disk_obd, ctxt->loc_logops = op; sema_init(&ctxt->loc_sem, 1); - if (op->lop_setup) - rc = op->lop_setup(obd, index, disk_obd, count, logid); + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(obd, index, disk_obd, count, logid); + } if (rc) { llog_ctxt_destroy(ctxt); diff --git a/lustre/obdclass/llog_swab.c b/lustre/obdclass/llog_swab.c index 4f45df02cd7328a688472b119561f74b1e70ff2a..af9809bc741348b69f7adfffe7448af8b9e4b0a2 100644 --- a/lustre/obdclass/llog_swab.c +++ b/lustre/obdclass/llog_swab.c @@ -85,6 +85,14 @@ void lustre_swab_ll_fid(struct ll_fid *fid) } 
EXPORT_SYMBOL(lustre_swab_ll_fid); +void lustre_swab_lu_fid(struct lu_fid *fid) +{ + __swab64s(&fid->f_seq); + __swab32s(&fid->f_oid); + __swab32s(&fid->f_ver); +} +EXPORT_SYMBOL(lustre_swab_lu_fid); + void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct llog_rec_tail *tail) { __swab32s(&rec->lrh_len); diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 99edb9d51abf749002b580dba990d04085595fc2..209728d8d28b5b946ac5951bafefdb28a7b4074f 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -69,7 +69,7 @@ struct proc_dir_entry *lprocfs_srch(struct proc_dir_entry *head, temp = temp->next; } - LPROCFS_ENTRY(); + LPROCFS_EXIT(); return NULL; } @@ -172,7 +172,7 @@ out: static ssize_t lprocfs_fops_write(struct file *f, const char __user *buf, size_t size, loff_t *ppos) { struct proc_dir_entry *dp = PDE(f->f_dentry->d_inode); - int rc = 0; + int rc = -EIO; LPROCFS_ENTRY(); if (!dp->deleted && dp->write_proc) @@ -204,7 +204,6 @@ int lprocfs_evict_client_release(struct inode *inode, struct file *f) atomic_dec(&obd->obd_evict_inprogress); wake_up(&obd->obd_evict_inprogress_waitq); - LPROCFS_EXIT(); return 0; } @@ -360,11 +359,13 @@ int lprocfs_wr_uint(struct file *file, const char *buffer, unsigned long count, void *data) { unsigned *p = data; - char dummy[MAX_STRING_SIZE + 1], *end; + char dummy[MAX_STRING_SIZE + 1] = { '\0' }, *end; unsigned long tmp; - dummy[MAX_STRING_SIZE] = '\0'; - if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + if (count >= sizeof(dummy) || count == 0) + return -EINVAL; + + if (copy_from_user(dummy, buffer, count)) return -EFAULT; tmp = simple_strtoul(dummy, &end, 0); @@ -753,10 +754,8 @@ int lprocfs_obd_cleanup(struct obd_device *obd) return 0; } -void lprocfs_free_client_stats(void *obj, void *data) +static void lprocfs_free_client_stats(struct nid_stat *client_stat) { - struct nid_stat *client_stat = obj; - CDEBUG(D_CONFIG, "stat %p - data %p/%p/%p\n", client_stat, 
client_stat->nid_proc, client_stat->nid_stats, client_stat->nid_brw_stats); @@ -765,7 +764,6 @@ void lprocfs_free_client_stats(void *obj, void *data) client_stat->nid_exp_ref_count); hlist_del_init(&client_stat->nid_hash); - list_del(&client_stat->nid_list); if (client_stat->nid_proc) lprocfs_remove(&client_stat->nid_proc); @@ -787,10 +785,12 @@ void lprocfs_free_per_client_stats(struct obd_device *obd) ENTRY; /* we need extra list - because hash_exit called to early */ + /* no locking needed because all clients are dead */ while(!list_empty(&obd->obd_nid_stats)) { stat = list_entry(obd->obd_nid_stats.next, struct nid_stat, nid_list); - lprocfs_free_client_stats(stat, NULL); + list_del_init(&stat->nid_list); + lprocfs_free_client_stats(stat); } EXIT; @@ -800,9 +800,8 @@ struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) { struct lprocfs_stats *stats; - struct lprocfs_percpu *percpu; unsigned int percpusize; - unsigned int i; + unsigned int i, j; unsigned int num_cpu; if (num == 0) @@ -825,12 +824,20 @@ struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, stats->ls_flags = 0; } - percpusize = offsetof(typeof(*percpu), lp_cntr[num]); + percpusize = offsetof(struct lprocfs_percpu, lp_cntr[num]); if (num_cpu > 1) percpusize = L1_CACHE_ALIGN(percpusize); - stats->ls_percpu_size = num_cpu * percpusize; - OBD_ALLOC(stats->ls_percpu[0], stats->ls_percpu_size); + for (i = 0; i < num_cpu; i++) { + OBD_ALLOC(stats->ls_percpu[i], percpusize); + if (stats->ls_percpu[i] == NULL) { + for (j = 0; j < i; j++) { + OBD_FREE(stats->ls_percpu[j], percpusize); + stats->ls_percpu[j] = NULL; + } + break; + } + } if (stats->ls_percpu[0] == NULL) { OBD_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_cpu])); @@ -838,10 +845,6 @@ struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, } stats->ls_num = num; - for (i = 1; i < num_cpu; i++) - stats->ls_percpu[i] = (void *)(stats->ls_percpu[i - 1]) + - percpusize; - return stats; } @@ -849,6
+852,8 @@ void lprocfs_free_stats(struct lprocfs_stats **statsh) { struct lprocfs_stats *stats = *statsh; unsigned int num_cpu; + unsigned int percpusize; + unsigned int i; if (!stats || (stats->ls_num == 0)) return; @@ -858,7 +863,11 @@ void lprocfs_free_stats(struct lprocfs_stats **statsh) else num_cpu = num_possible_cpus(); - OBD_FREE(stats->ls_percpu[0], stats->ls_percpu_size); + percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]); + if (num_cpu > 1) + percpusize = L1_CACHE_ALIGN(percpusize); + for (i = 0; i < num_cpu; i++) + OBD_FREE(stats->ls_percpu[i], percpusize); OBD_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_cpu])); } @@ -1085,6 +1094,8 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd); @@ -1101,6 +1112,8 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw); LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw_async); LPROCFS_OBD_OP_INIT(num_private_stats, stats, prep_async_page); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reget_short_lock); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, release_short_lock); LPROCFS_OBD_OP_INIT(num_private_stats, stats, queue_async_io); LPROCFS_OBD_OP_INIT(num_private_stats, stats, queue_group_io); LPROCFS_OBD_OP_INIT(num_private_stats, stats, trigger_group_io); @@ -1246,24 +1259,29 @@ EXPORT_SYMBOL(lprocfs_nid_stats_clear_read); void lprocfs_nid_stats_clear_write_cb(void *obj, void *data) { - struct nid_stat *client_stat = obj; + 
struct nid_stat *stat = obj; int i; - if(client_stat->nid_exp_ref_count == 1) { - hlist_del_init(&client_stat->nid_hash); - lprocfs_free_client_stats(client_stat, data); - OBD_FREE(client_stat, sizeof(struct nid_stat)); + /* object has only hash + iterate_all references. + * add/delete blocked by hash bucket lock */ + CDEBUG(D_INFO,"refcnt %d\n", stat->nid_exp_ref_count); + if(stat->nid_exp_ref_count == 2) { + hlist_del_init(&stat->nid_hash); + stat->nid_exp_ref_count--; + spin_lock(&stat->nid_obd->obd_nid_lock); + list_del_init(&stat->nid_list); + spin_unlock(&stat->nid_obd->obd_nid_lock); + list_add(&stat->nid_list, data); EXIT; return; } /* we has reference to object - only clear data*/ - if (client_stat->nid_stats) { - lprocfs_clear_stats(client_stat->nid_stats); - } + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); - if (client_stat->nid_brw_stats) { + if (stat->nid_brw_stats) { for (i = 0; i < BRW_LAST; i++) - lprocfs_oh_clear(&client_stat->nid_brw_stats->hist[i]); + lprocfs_oh_clear(&stat->nid_brw_stats->hist[i]); } EXIT; return; @@ -1274,16 +1292,23 @@ int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, unsigned long count, void *data) { struct obd_device *obd = (struct obd_device *)data; - + struct nid_stat *client_stat; + CFS_LIST_HEAD(free_list); lustre_hash_iterate_all(obd->obd_nid_stats_hash_body, - lprocfs_free_client_stats, NULL); + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } return count; } EXPORT_SYMBOL(lprocfs_nid_stats_clear_write); -int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t nid, int *newnid) +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) { int rc = 0; struct nid_stat *tmp = NULL, *tmp1; @@ -1296,7 +1321,10 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t nid, 
int *newnid) !exp->exp_obd->obd_nid_stats_hash_body) RETURN(-EINVAL); - if (!nid) + /* do not test against zero; as Eric says: + * You may only test nid against another nid, or LNET_NID_ANY. Anything else is + * nonsense. */ + if (!nid || *nid == LNET_NID_ANY) RETURN(0); obd = exp->exp_obd; @@ -1307,26 +1335,31 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t nid, int *newnid) if (tmp == NULL) RETURN(-ENOMEM); - tmp->nid = nid; + tmp->nid = *nid; tmp->nid_obd = exp->exp_obd; tmp->nid_exp_ref_count = 1; /* need live in hash after destroy export */ - tmp1= lustre_hash_findadd_unique(obd->obd_nid_stats_hash_body, &nid, + /* guard against concurrent list adds; no locking needed on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&tmp->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + tmp1= lustre_hash_findadd_unique(obd->obd_nid_stats_hash_body, nid, &tmp->nid_hash); CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", - tmp1, libcfs_nid2str(nid), tmp->nid_exp_ref_count); + tmp1, libcfs_nid2str(*nid), tmp->nid_exp_ref_count); if (tmp1 != tmp) { exp->exp_nid_stats = tmp1; GOTO(destroy_new, rc = 0); } /* not found - create */ - tmp->nid_proc = proc_mkdir(libcfs_nid2str(nid), + tmp->nid_proc = proc_mkdir(libcfs_nid2str(*nid), obd->obd_proc_exports_entry); if (!tmp->nid_proc) { CERROR("Error making export directory for" - " nid %s\n", libcfs_nid2str(nid)); - lustre_hash_delitem(obd->obd_nid_stats_hash_body, &nid, + " nid %s\n", libcfs_nid2str(*nid)); + lustre_hash_delitem(obd->obd_nid_stats_hash_body, nid, &tmp->nid_hash); GOTO(destroy_new, rc = -ENOMEM); } @@ -1336,19 +1369,18 @@ int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t nid, int *newnid) if (rc) CWARN("Error adding the uuid file\n"); - /* protect competitive add to list, not need locking on destroy */ - spin_lock(&obd->nid_lock); - list_add(&tmp->nid_list, &obd->obd_nid_stats); - spin_unlock(&obd->nid_lock); - exp->exp_nid_stats = tmp; *newnid = 1; RETURN(rc);
destroy_new: + spin_lock(&obd->obd_nid_lock); + list_del(&tmp->nid_list); + spin_unlock(&obd->obd_nid_lock); OBD_FREE(tmp, sizeof(struct nid_stat)); RETURN(rc); } + int lprocfs_exp_cleanup(struct obd_export *exp) { struct nid_stat *stat = exp->exp_nid_stats; @@ -1477,7 +1509,7 @@ int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count, __u64 whole, frac = 0, units; unsigned frac_d = 1; - if (count > (sizeof(kernbuf) - 1) ) + if (count > (sizeof(kernbuf) - 1)) return -EINVAL; if (copy_from_user(kernbuf, buffer, count)) diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 54cdc8f73fa97963e2ab18e0d631fe9614af4415..6a2b6dbbe97ea8200b89437ee77de24db9a6b62a 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -34,6 +34,7 @@ #include <obd_class.h> #include <obd.h> #endif +#include <lustre_disk.h> #include <lustre_log.h> #include <lprocfs_status.h> #include <libcfs/list.h> @@ -188,10 +189,14 @@ int class_attach(struct lustre_cfg *lcfg) LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, "%p obd_name %s != %s\n", obd, obd->obd_name, name); + rwlock_init(&obd->obd_pool_lock); + obd->obd_pool_limit = 0; + obd->obd_pool_slv = 0; + CFS_INIT_LIST_HEAD(&obd->obd_exports); CFS_INIT_LIST_HEAD(&obd->obd_exports_timed); CFS_INIT_LIST_HEAD(&obd->obd_nid_stats); - spin_lock_init(&obd->nid_lock); + spin_lock_init(&obd->obd_nid_lock); spin_lock_init(&obd->obd_dev_lock); sema_init(&obd->obd_dev_sem, 1); spin_lock_init(&obd->obd_osfs_lock); @@ -214,8 +219,8 @@ int class_attach(struct lustre_cfg *lcfg) len = strlen(uuid); if (len >= sizeof(obd->obd_uuid)) { - CERROR("uuid must be < "LPSZ" bytes long\n", - sizeof(obd->obd_uuid)); + CERROR("uuid must be < %d bytes long\n", + (int)sizeof(obd->obd_uuid)); GOTO(out, rc = -EINVAL); } memcpy(obd->obd_uuid.uuid, uuid, len); @@ -355,10 +360,10 @@ int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) obd->obd_name, obd->obd_uuid.uuid); class_decref(obd); - + /* 
not strictly necessary, but cleans up eagerly */ obd_zombie_impexp_cull(); - + RETURN(0); } @@ -858,7 +863,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, ENTRY; if (lcfg->lcfg_command != LCFG_PARAM) { - CERROR("Unknown command: %d\n", lcfg->lcfg_command); + CERROR("Unknown command: %x\n", lcfg->lcfg_command); RETURN(-EINVAL); } @@ -904,8 +909,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, if (!matched) { CERROR("%s: unknown param %s\n", (char *)lustre_cfg_string(lcfg, 0), key); - rc = -EINVAL; - /* continue parsing other params */ + /* rc = -EINVAL; continue parsing other params */ } else { LCONSOLE_INFO("%s.%.*s: set parameter %.*s=%s\n", (char *)lustre_cfg_string(lcfg, 0), @@ -1001,6 +1005,26 @@ static int class_config_llog_handler(struct llog_handle * handle, break; } + /** + * For interop mode between 1.8 and 2.0: + * skip "lmv" configuration which exists since 2.0. + */ + { + char *devname = lustre_cfg_string(lcfg, 0); + char *typename = lustre_cfg_string(lcfg, 1); + + if (devname && strlen(devname) >= strlen("clilmv")) + devname += strlen(devname) - strlen("clilmv"); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "lmv") == 0) || + (devname && strcmp(devname, "clilmv") == 0)) { + CWARN("skipping 'lmv' config: cmd=%x,%s:%s\n", + lcfg->lcfg_command, devname, typename); + GOTO(out, rc = 0); + } + } + if ((clli->cfg_flags & CFG_F_EXCLUDE) && (lcfg->lcfg_command == LCFG_LOV_ADD_OBD)) /* Add inactive instead */ diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 8e42a9a9ad75ff11c42ae587d8393cb8d58493f3..642278d8b8ddd18d30a85a9a4f6665afced0fa9e 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -593,7 +593,7 @@ static int lustre_start_mgc(struct super_block *sb) recov_bk++; CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk); rc = obd_set_info_async(obd->obd_self_export, - strlen(KEY_INIT_RECOV_BACKUP), + sizeof(KEY_INIT_RECOV_BACKUP),
KEY_INIT_RECOV_BACKUP, sizeof(recov_bk), &recov_bk, NULL); GOTO(out, rc = 0); @@ -694,7 +694,7 @@ static int lustre_start_mgc(struct super_block *sb) /* Try all connections, but only once. */ recov_bk = 1; rc = obd_set_info_async(obd->obd_self_export, - strlen(KEY_INIT_RECOV_BACKUP), + sizeof(KEY_INIT_RECOV_BACKUP), KEY_INIT_RECOV_BACKUP, sizeof(recov_bk), &recov_bk, NULL); if (rc) @@ -704,7 +704,8 @@ static int lustre_start_mgc(struct super_block *sb) OBD_ALLOC_PTR(data); if (data == NULL) GOTO(out, rc = -ENOMEM); - data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT; + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FID; data->ocd_version = LUSTRE_VERSION_CODE; /* We connect to the MGS at setup, and don't disconnect until cleanup */ rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), data, NULL); @@ -744,8 +745,8 @@ static int lustre_stop_mgc(struct super_block *sb) obd = lsi->lsi_mgc; if (!obd) RETURN(-ENOENT); - lsi->lsi_mgc = NULL; + mutex_down(&mgc_start_lock); if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { /* This is not fatal, every client that stops @@ -813,7 +814,7 @@ static int server_mgc_set_fs(struct obd_device *mgc, struct super_block *sb) /* cl_mgc_sem in mgc insures we sleep if the mgc_fs is busy */ rc = obd_set_info_async(mgc->obd_self_export, - strlen("set_fs"), "set_fs", + sizeof(KEY_SET_FS), KEY_SET_FS, sizeof(*sb), sb, NULL); if (rc) { CERROR("can't set_fs %d\n", rc); @@ -830,7 +831,7 @@ static int server_mgc_clear_fs(struct obd_device *mgc) CDEBUG(D_MOUNT, "Unassign mgc disk\n"); rc = obd_set_info_async(mgc->obd_self_export, - strlen("clear_fs"), "clear_fs", + sizeof(KEY_CLEAR_FS), KEY_CLEAR_FS, 0, NULL, NULL); RETURN(rc); } @@ -958,7 +959,7 @@ int server_register_target(struct super_block *sb) /* Register the target */ /* FIXME use mgc_process_config instead */ rc = obd_set_info_async(mgc->u.cli.cl_mgc_mgsexp, - strlen("register_target"), "register_target", + sizeof(KEY_REGISTER_TARGET), 
KEY_REGISTER_TARGET, sizeof(*mti), mti, NULL); if (rc) GOTO(out, rc); @@ -1354,7 +1355,8 @@ static void server_put_super(struct super_block *sb) CDEBUG(D_MOUNT, "server put_super %s\n", tmpname); /* Stop the target */ - if (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd)) { + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOSVC) && + (IS_MDT(lsi->lsi_ldd) || IS_OST(lsi->lsi_ldd))) { struct lustre_profile *lprof = NULL; /* tell the mgc to drop the config log */ @@ -1391,7 +1393,9 @@ static void server_put_super(struct super_block *sb) /* stop the mgc before the mgs so the connection gets cleaned up */ lustre_stop_mgc(sb); - server_stop_mgs(sb); + /* if MDS start with --nomgs, don't stop MGS then */ + if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) + server_stop_mgs(sb); } /* Clean the mgc and sb */ @@ -1579,7 +1583,7 @@ static int server_fill_super(struct super_block *sb) } /* start MGS before MGC */ - if (IS_MGS(lsi->lsi_ldd)) { + if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) { rc = server_start_mgs(sb); if (rc) GOTO(out_mnt, rc); @@ -1816,6 +1820,9 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) } else if (strncmp(s1, "nosvc", 5) == 0) { lmd->lmd_flags |= LMD_FLG_NOSVC; clear++; + } else if (strncmp(s1, "nomgs", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOMGS; + clear++; /* ost exclusion list */ } else if (strncmp(s1, "exclude=", 8) == 0) { rc = lmd_make_exclusion(lmd, s1 + 7); @@ -1832,8 +1839,11 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) must be the last one. 
*/ *s1 = '\0'; break; + } else if (strncmp(s1, "loop=", 5) == 0) { + clear++; } + /* Find next opt */ s2 = strchr(s1, ','); if (s2 == NULL) { @@ -2008,7 +2018,8 @@ struct file_system_type lustre_fs_type = { .name = "lustre", .get_sb = lustre_get_sb, .kill_sb = lustre_kill_super, - .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV, + .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV | + LL_RENAME_DOES_D_MOVE, }; #else diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index f8f9826b87878478531622e60b13eac9ffdef47f..b822c56da38c606230222f4585412486a81a817d 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -465,7 +465,7 @@ static int echo_setup(struct obd_device *obd, obd_count len, void *buf) spin_lock_init(&obd->u.echo.eo_lock); obd->u.echo.eo_lastino = ECHO_INIT_OBJID; - obd->obd_namespace = ldlm_namespace_new("echo-tgt", + obd->obd_namespace = ldlm_namespace_new(obd, "echo-tgt", LDLM_NAMESPACE_SERVER, LDLM_NAMESPACE_GREEDY); if (obd->obd_namespace == NULL) { @@ -510,7 +510,8 @@ static int echo_cleanup(struct obd_device *obd) set_current_state (TASK_UNINTERRUPTIBLE); cfs_schedule_timeout (CFS_TASK_UNINT, cfs_time_seconds(1)); - ldlm_namespace_free(obd->obd_namespace, obd->obd_force); + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; leaked = atomic_read(&obd->u.echo.eo_prep); if (leaked != 0) diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index e46f8f4fd183c45bffcb010d5240eec97df89e39..b1dc7f567bac55a90056faa4a0189417e0ca22c0 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -582,90 +582,6 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, return (rc); } -#ifdef __KERNEL__ -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -#include <linux/iobuf.h> - -static int echo_client_ubrw(struct obd_device *obd, int rw, - struct obdo *oa, struct lov_stripe_md *lsm, - obd_off offset, obd_size count, char *buffer, - struct 
obd_trans_info *oti) -{ - struct echo_client_obd *ec = &obd->u.echo_client; - struct obd_info oinfo = { { { 0 } } }; - obd_count npages; - struct brw_page *pga; - struct brw_page *pgp; - obd_off off; - struct kiobuf *kiobuf; - int i; - int rc; - - LASSERT (rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); - - /* NB: for now, only whole pages, page aligned */ - - if (count <= 0 || - ((long)buffer & (~CFS_PAGE_MASK)) != 0 || - (count & (~CFS_PAGE_MASK)) != 0 || - (lsm != NULL && lsm->lsm_object_id != oa->o_id)) - return (-EINVAL); - - /* XXX think again with misaligned I/O */ - npages = count >> CFS_PAGE_SHIFT; - - OBD_ALLOC(pga, npages * sizeof(*pga)); - if (pga == NULL) - return (-ENOMEM); - - rc = alloc_kiovec (1, &kiobuf); - if (rc != 0) - goto out_1; - - rc = map_user_kiobuf ((rw == OBD_BRW_READ) ? READ : WRITE, - kiobuf, (unsigned long)buffer, count); - if (rc != 0) - goto out_2; - - LASSERT (kiobuf->offset == 0); - LASSERT (kiobuf->nr_pages == npages); - - for (i = 0, off = offset, pgp = pga; - i < npages; - i++, off += CFS_PAGE_SIZE, pgp++) { - pgp->off = off; - pgp->pg = kiobuf->maplist[i]; - pgp->count = CFS_PAGE_SIZE; - pgp->flag = 0; - } - - oinfo.oi_oa = oa; - oinfo.oi_md = lsm; - rc = obd_brw(rw, ec->ec_exp, &oinfo, npages, pga, oti); - - // if (rw == OBD_BRW_READ) - // mark_dirty_kiobuf (kiobuf, count); - - unmap_kiobuf (kiobuf); - out_2: - free_kiovec (1, &kiobuf); - out_1: - OBD_FREE(pga, npages * sizeof(*pga)); - return (rc); -} -#else -static int echo_client_ubrw(struct obd_device *obd, int rw, - struct obdo *oa, struct lov_stripe_md *lsm, - obd_off offset, obd_size count, char *buffer, - struct obd_trans_info *oti) -{ - /* echo_client_ubrw() needs to be ported on 2.6 yet */ - LBUG(); - return 0; -} -#endif -#endif - struct echo_async_state; #define EAP_MAGIC 79277927 @@ -1020,18 +936,9 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp, switch((long)data->ioc_pbuf1) { case 1: - if (data->ioc_pbuf2 == NULL) { // NULL user data pointer - rc = 
echo_client_kbrw(obd, rw, &data->ioc_obdo1, - eco->eco_lsm, data->ioc_offset, - data->ioc_count, &dummy_oti); - } else { -#ifdef __KERNEL__ - rc = echo_client_ubrw(obd, rw, &data->ioc_obdo1, - eco->eco_lsm, data->ioc_offset, - data->ioc_count, data->ioc_pbuf2, - &dummy_oti); -#endif - } + rc = echo_client_kbrw(obd, rw, &data->ioc_obdo1, + eco->eco_lsm, data->ioc_offset, + data->ioc_count, &dummy_oti); break; case 2: rc = echo_client_async_page(ec->ec_exp, rw, &data->ioc_obdo1, diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 6ebdc58747bd026282bc90d49b722527ba260a0a..333c6a202d7723e0b7b5a32195c0f24b7c12de4a 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -78,7 +78,7 @@ int filter_finish_transno(struct obd_export *exp, struct obd_trans_info *oti, { struct filter_obd *filter = &exp->exp_obd->u.filter; struct filter_export_data *fed = &exp->exp_filter_data; - struct filter_client_data *fcd = fed->fed_fcd; + struct lsd_client_data *lcd = fed->fed_lcd; __u64 last_rcvd; loff_t off; int err, log_pri = D_RPCTRACE; @@ -105,10 +105,10 @@ int filter_finish_transno(struct obd_export *exp, struct obd_trans_info *oti, cpu_to_le64(last_rcvd); spin_unlock(&filter->fo_translock); } - fcd->fcd_last_rcvd = cpu_to_le64(last_rcvd); + lcd->lcd_last_transno = cpu_to_le64(last_rcvd); /* could get xid from oti, if it's ever needed */ - fcd->fcd_last_xid = 0; + lcd->lcd_last_xid = 0; off = fed->fed_lr_off; if (off <= 0) { @@ -124,7 +124,7 @@ int filter_finish_transno(struct obd_export *exp, struct obd_trans_info *oti, NULL); err = fsfilt_write_record(exp->exp_obd, filter->fo_rcvd_filp, - fcd, sizeof(*fcd), &off, + lcd, sizeof(*lcd), &off, force_sync | exp->exp_need_sync); if (force_sync) filter_commit_cb(exp->exp_obd, last_rcvd, NULL, err); @@ -136,7 +136,7 @@ int filter_finish_transno(struct obd_export *exp, struct obd_trans_info *oti, } CDEBUG(log_pri, "wrote trans "LPU64" for client %s at #%d: err = %d\n", - last_rcvd, fcd->fcd_uuid, 
fed->fed_lr_idx, err); + last_rcvd, lcd->lcd_uuid, fed->fed_lr_idx, err); RETURN(rc); } @@ -182,7 +182,7 @@ static int lprocfs_init_rw_stats(struct obd_device *obd, plus the procfs overhead :( */ static int filter_export_stats_init(struct obd_device *obd, struct obd_export *exp, - lnet_nid_t client_nid) + void *client_nid) { struct filter_export_data *fed = &exp->exp_filter_data; struct proc_dir_entry *brw_entry; @@ -198,7 +198,7 @@ static int filter_export_stats_init(struct obd_device *obd, if (rc) RETURN(rc); - if (client_nid && newnid) { + if (newnid) { struct nid_stat *tmp = exp->exp_nid_stats; LASSERT(tmp != NULL); @@ -234,7 +234,7 @@ static int filter_export_stats_init(struct obd_device *obd, * Otherwise, we have just read the data from the last_rcvd file and * we know its offset. */ static int filter_client_add(struct obd_device *obd, struct obd_export *exp, - int cl_idx, lnet_nid_t client_nid) + int cl_idx) { struct filter_obd *filter = &obd->u.filter; struct filter_export_data *fed = &exp->exp_filter_data; @@ -247,7 +247,7 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp, LASSERTF(cl_idx > -2, "%d\n", cl_idx); /* Self-export */ - if (strcmp(fed->fed_fcd->fcd_uuid, obd->obd_uuid.uuid) == 0) + if (strcmp(fed->fed_lcd->lcd_uuid, obd->obd_uuid.uuid) == 0) RETURN(0); /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so @@ -280,7 +280,7 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp, LASSERTF(fed->fed_lr_off > 0, "fed_lr_off = %llu\n", fed->fed_lr_off); CDEBUG(D_INFO, "client at index %d (%llu) with UUID '%s' added\n", - fed->fed_lr_idx, fed->fed_lr_off, fed->fed_fcd->fcd_uuid); + fed->fed_lr_idx, fed->fed_lr_off, fed->fed_lcd->lcd_uuid); if (new_client) { struct lvfs_run_ctxt saved; @@ -288,8 +288,8 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp, void *handle; int rc; - CDEBUG(D_INFO, "writing client fcd at idx %u (%llu) (len %u)\n", - 
fed->fed_lr_idx,off,(unsigned int)sizeof(*fed->fed_fcd)); + CDEBUG(D_INFO, "writing client lcd at idx %u (%llu) (len %u)\n", + fed->fed_lr_idx,off,(unsigned int)sizeof(*fed->fed_lcd)); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); /* Transaction needed to fix bug 1403 */ @@ -308,8 +308,8 @@ static int filter_client_add(struct obd_device *obd, struct obd_export *exp, spin_unlock(&exp->exp_lock); } rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, - fed->fed_fcd, - sizeof(*fed->fed_fcd), + fed->fed_lcd, + sizeof(*fed->fed_lcd), &off, rc /* sync if no cb */); fsfilt_commit(obd, filter->fo_rcvd_filp->f_dentry->d_inode, @@ -331,21 +331,21 @@ static int filter_client_free(struct obd_export *exp) struct filter_export_data *fed = &exp->exp_filter_data; struct filter_obd *filter = &exp->exp_obd->u.filter; struct obd_device *obd = exp->exp_obd; - struct filter_client_data zero_fcd; + struct lsd_client_data zero_lcd; struct lvfs_run_ctxt saved; int rc; loff_t off; ENTRY; - if (fed->fed_fcd == NULL) + if (fed->fed_lcd == NULL) RETURN(0); - /* XXX if fcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ - if (strcmp(fed->fed_fcd->fcd_uuid, obd->obd_uuid.uuid ) == 0) + /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ + if (strcmp(fed->fed_lcd->lcd_uuid, obd->obd_uuid.uuid ) == 0) GOTO(free, 0); CDEBUG(D_INFO, "freeing client at idx %u, offset %lld with UUID '%s'\n", - fed->fed_lr_idx, off, fed->fed_fcd->fcd_uuid); + fed->fed_lr_idx, off, fed->fed_lcd->lcd_uuid); LASSERT(filter->fo_last_rcvd_slots != NULL); @@ -368,10 +368,10 @@ static int filter_client_free(struct obd_export *exp) } if (!(exp->exp_flags & OBD_OPT_FAILOVER)) { - memset(&zero_fcd, 0, sizeof zero_fcd); + memset(&zero_lcd, 0, sizeof(zero_lcd)); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, &zero_fcd, - sizeof(zero_fcd), &off, + rc = fsfilt_write_record(obd, filter->fo_rcvd_filp, &zero_lcd, + sizeof(zero_lcd), &off, 
(!exp->exp_libclient || exp->exp_need_sync)); @@ -384,7 +384,7 @@ static int filter_client_free(struct obd_export *exp) CDEBUG(rc == 0 ? D_INFO : D_ERROR, "zeroing out client %s at idx %u (%llu) in %s rc %d\n", - fed->fed_fcd->fcd_uuid, fed->fed_lr_idx, fed->fed_lr_off, + fed->fed_lcd->lcd_uuid, fed->fed_lr_idx, fed->fed_lr_off, LAST_RCVD, rc); } @@ -396,8 +396,8 @@ static int filter_client_free(struct obd_export *exp) EXIT; free: - OBD_FREE(fed->fed_fcd, sizeof(*fed->fed_fcd)); - fed->fed_fcd = NULL; + OBD_FREE_PTR(fed->fed_lcd); + fed->fed_lcd = NULL; return 0; } @@ -648,7 +648,7 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) { struct filter_obd *filter = &obd->u.filter; struct lr_server_data *fsd; - struct filter_client_data *fcd = NULL; + struct lsd_client_data *lcd = NULL; struct inode *inode = filp->f_dentry->d_inode; unsigned long last_rcvd_size = i_size_read(inode); __u64 mount_count; @@ -659,8 +659,8 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) /* ensure padding in the struct is the correct size */ CLASSERT (offsetof(struct lr_server_data, lsd_padding) + sizeof(fsd->lsd_padding) == LR_SERVER_SIZE); - CLASSERT (offsetof(struct filter_client_data, fcd_padding) + - sizeof(fcd->fcd_padding) == LR_CLIENT_SIZE); + CLASSERT (offsetof(struct lsd_client_data, lcd_padding) + + sizeof(lcd->lcd_padding) == LR_CLIENT_SIZE); OBD_ALLOC(fsd, sizeof(*fsd)); if (!fsd) @@ -751,57 +751,57 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) struct obd_export *exp; struct filter_export_data *fed; - if (!fcd) { - OBD_ALLOC(fcd, sizeof(*fcd)); - if (!fcd) + if (!lcd) { + OBD_ALLOC_PTR(lcd); + if (!lcd) GOTO(err_client, rc = -ENOMEM); } /* Don't assume off is incremented properly by - * fsfilt_read_record(), in case sizeof(*fcd) + * fsfilt_read_record(), in case sizeof(*lcd) * isn't the same as fsd->lsd_client_size. 
*/ off = le32_to_cpu(fsd->lsd_client_start) + cl_idx * le16_to_cpu(fsd->lsd_client_size); - rc = fsfilt_read_record(obd, filp, fcd, sizeof(*fcd), &off); + rc = fsfilt_read_record(obd, filp, lcd, sizeof(*lcd), &off); if (rc) { CERROR("error reading FILT %s idx %d off %llu: rc %d\n", LAST_RCVD, cl_idx, off, rc); break; /* read error shouldn't cause startup to fail */ } - if (fcd->fcd_uuid[0] == '\0') { + if (lcd->lcd_uuid[0] == '\0') { CDEBUG(D_INFO, "skipping zeroed client at offset %d\n", cl_idx); continue; } - last_rcvd = le64_to_cpu(fcd->fcd_last_rcvd); + last_rcvd = le64_to_cpu(lcd->lcd_last_transno); /* These exports are cleaned up by filter_disconnect(), so they * need to be set up like real exports as filter_connect() does. */ - exp = class_new_export(obd, (struct obd_uuid *)fcd->fcd_uuid); + exp = class_new_export(obd, (struct obd_uuid *)lcd->lcd_uuid); CDEBUG(D_HA, "RCVRNG CLIENT uuid: %s idx: %d lr: "LPU64 - " srv lr: "LPU64"\n", fcd->fcd_uuid, cl_idx, + " srv lr: "LPU64"\n", lcd->lcd_uuid, cl_idx, last_rcvd, le64_to_cpu(fsd->lsd_last_transno)); if (IS_ERR(exp)) { if (PTR_ERR(exp) == -EALREADY) { /* export already exists, zero out this one */ CERROR("Zeroing out duplicate export due to " "bug 10479.\n"); - fcd->fcd_uuid[0] = '\0'; + lcd->lcd_uuid[0] = '\0'; } else { GOTO(err_client, rc = PTR_ERR(exp)); } } else { fed = &exp->exp_filter_data; - fed->fed_fcd = fcd; - filter_export_stats_init(obd, exp, 0); - rc = filter_client_add(obd, exp, cl_idx, 0); + fed->fed_lcd = lcd; + filter_export_stats_init(obd, exp, NULL); + rc = filter_client_add(obd, exp, cl_idx); /* can't fail for existing client */ LASSERTF(rc == 0, "rc = %d\n", rc); - fcd = NULL; + lcd = NULL; spin_lock(&exp->exp_lock); exp->exp_replay_needed = 1; @@ -821,8 +821,8 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) fsd->lsd_last_transno = cpu_to_le64(last_rcvd); } - if (fcd) - OBD_FREE(fcd, sizeof(*fcd)); + if (lcd) + OBD_FREE_PTR(lcd); obd->obd_last_committed = 
le64_to_cpu(fsd->lsd_last_transno); @@ -1684,6 +1684,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, __u8 *uuid_ptr; char *str, *label; char ns_name[48]; + request_queue_t *q; int rc; ENTRY; @@ -1765,7 +1766,7 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, filter->fo_fmd_max_age = FILTER_FMD_MAX_AGE_DEFAULT; sprintf(ns_name, "filter-%s", obd->obd_uuid.uuid); - obd->obd_namespace = ldlm_namespace_new(ns_name, LDLM_NAMESPACE_SERVER, + obd->obd_namespace = ldlm_namespace_new(obd, ns_name, LDLM_NAMESPACE_SERVER, LDLM_NAMESPACE_GREEDY); if (obd->obd_namespace == NULL) GOTO(err_post, rc = -ENOMEM); @@ -1786,6 +1787,15 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, if (rc) GOTO(err_post, rc); + q = bdev_get_queue(mnt->mnt_sb->s_bdev); + if (q->max_sectors < q->max_hw_sectors && + q->max_sectors < PTLRPC_MAX_BRW_SIZE >> 9) + LCONSOLE_INFO("%s: underlying device %s should be tuned " + "for larger I/O requests: max_sectors = %u " + "could be up to max_hw_sectors=%u\n", + obd->obd_name, mnt->mnt_sb->s_id, + q->max_sectors, q->max_hw_sectors); + uuid_ptr = fsfilt_uuid(obd, obd->u.obt.obt_sb); if (uuid_ptr != NULL) { class_uuid_unparse(uuid_ptr, &uuid); @@ -1949,9 +1959,9 @@ static int filter_llog_finish(struct obd_device *obd, int count) int rc = 0, rc2 = 0; ENTRY; - if (obd->u.filter.fo_lcm) { + if (obd->u.filter.fo_lcm) { llog_cleanup_commit_master((struct llog_commit_master *) - obd->u.filter.fo_lcm, 0); + obd->u.filter.fo_lcm, 1); OBD_FREE(obd->u.filter.fo_lcm, sizeof(struct llog_commit_master)); obd->u.filter.fo_lcm = NULL; @@ -2016,16 +2026,16 @@ static int filter_cleanup(struct obd_device *obd) lquota_cleanup(filter_quota_interface_ref, obd); - ldlm_namespace_free(obd->obd_namespace, obd->obd_force); + ldlm_namespace_free(obd->obd_namespace, NULL, obd->obd_force); + obd->obd_namespace = NULL; if (obd->u.obt.obt_sb == NULL) RETURN(0); filter_post(obd); - 
shrink_dcache_parent(obd->u.obt.obt_sb->s_root); - LL_DQUOT_OFF(obd->u.obt.obt_sb); + shrink_dcache_sb(obd->u.obt.obt_sb); server_put_mount(obd->obd_name, filter->fo_vfsmnt); obd->u.obt.obt_sb = NULL; @@ -2159,8 +2169,7 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd, { struct obd_export *exp; struct filter_export_data *fed; - struct filter_client_data *fcd = NULL; - lnet_nid_t *client_nid = (lnet_nid_t *)localdata; + struct lsd_client_data *lcd = NULL; int rc; ENTRY; @@ -2179,29 +2188,29 @@ static int filter_connect(struct lustre_handle *conn, struct obd_device *obd, if (rc) GOTO(cleanup, rc); - filter_export_stats_init(obd, exp, *client_nid); + filter_export_stats_init(obd, exp, localdata); if (!obd->obd_replayable) GOTO(cleanup, rc = 0); - OBD_ALLOC(fcd, sizeof(*fcd)); - if (!fcd) { + OBD_ALLOC_PTR(lcd); + if (!lcd) { CERROR("filter: out of memory for client data\n"); GOTO(cleanup, rc = -ENOMEM); } - memcpy(fcd->fcd_uuid, cluuid, sizeof(fcd->fcd_uuid)); - fed->fed_fcd = fcd; + memcpy(lcd->lcd_uuid, cluuid, sizeof(lcd->lcd_uuid)); + fed->fed_lcd = lcd; - rc = filter_client_add(obd, exp, -1, *client_nid); + rc = filter_client_add(obd, exp, -1); GOTO(cleanup, rc); cleanup: if (rc) { - if (fcd) { - OBD_FREE(fcd, sizeof(*fcd)); - fed->fed_fcd = NULL; + if (lcd) { + OBD_FREE_PTR(lcd); + fed->fed_lcd = NULL; } class_disconnect(exp); } else { @@ -3459,7 +3468,7 @@ static int filter_get_info(struct obd_export *exp, __u32 keylen, RETURN(-EINVAL); } - if (KEY_IS("blocksize")) { + if (KEY_IS(KEY_BLOCKSIZE)) { __u32 *blocksize = val; if (blocksize) { if (*vallen < sizeof(*blocksize)) @@ -3470,7 +3479,7 @@ static int filter_get_info(struct obd_export *exp, __u32 keylen, RETURN(0); } - if (KEY_IS("blocksize_bits")) { + if (KEY_IS(KEY_BLOCKSIZE_BITS)) { __u32 *blocksize_bits = val; if (blocksize_bits) { if (*vallen < sizeof(*blocksize_bits)) @@ -3481,7 +3490,7 @@ static int filter_get_info(struct obd_export *exp, __u32 keylen, RETURN(0); } - if 
(KEY_IS("last_id")) { + if (KEY_IS(KEY_LAST_ID)) { obd_id *last_id = val; /* FIXME: object groups */ if (last_id) { diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h index 27611c94de98f3b25cc1f54d0ede524fba0633c3..4620d0dde34947af35adc917b48557725f857ccd 100644 --- a/lustre/obdfilter/filter_internal.h +++ b/lustre/obdfilter/filter_internal.h @@ -32,25 +32,11 @@ extern struct file_operations filter_per_export_stats_fops; extern struct file_operations filter_per_nid_stats_fops; -/* Data stored per client in the last_rcvd file. In le32 order. */ -struct filter_client_data { - __u8 fcd_uuid[40]; /* client UUID */ - __u64 fcd_last_rcvd; /* last completed transaction ID */ - __u64 fcd_last_xid; /* client RPC xid for the last transaction */ - __u8 fcd_padding[LR_CLIENT_SIZE - 56]; -}; - /* Limit the returned fields marked valid to those that we actually might set */ #define FILTER_VALID_FLAGS (OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLGENER |\ OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | OBD_MD_FLBLKSZ|\ OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME) -struct filter_fid { - struct ll_fid ff_fid; /* ff_fid.f_type == file stripe number */ - __u64 ff_objid; - __u64 ff_group; -}; - /* per-client-per-object persistent state (LRU) */ struct filter_mod_data { struct list_head fmd_list; /* linked to fed_mod_list */ diff --git a/lustre/obdfilter/filter_lvb.c b/lustre/obdfilter/filter_lvb.c index cb0eb9ce0da71c3749fcbf31b8d4cb534b8a2219..678119b2b1bbd23cb63d4d47fa5e5b49da81fcfe 100644 --- a/lustre/obdfilter/filter_lvb.c +++ b/lustre/obdfilter/filter_lvb.c @@ -107,7 +107,7 @@ out_dentry: * * If 'increase_only' is true, don't allow values to move backwards. 
*/ -static int filter_lvbo_update(struct ldlm_resource *res, struct lustre_msg *m, +static int filter_lvbo_update(struct ldlm_resource *res, struct ptlrpc_request *r, int buf_idx, int increase_only) { int rc = 0; @@ -131,11 +131,12 @@ static int filter_lvbo_update(struct ldlm_resource *res, struct lustre_msg *m, } /* Update the LVB from the network message */ - if (m != NULL) { + if (r != NULL) { struct ost_lvb *new; - new = lustre_swab_buf(m, buf_idx, sizeof(*new), - lustre_swab_ost_lvb); + /* XXX update always from reply buffer */ + new = lustre_swab_repbuf(r, buf_idx, sizeof(*new), + lustre_swab_ost_lvb); if (new == NULL) { CERROR("lustre_swab_buf failed\n"); goto disk_update; diff --git a/lustre/osc/cache.c b/lustre/osc/cache.c index c144d96e887125f896c48e81e4a7d39ce3b5877a..4f4ddd91f496c0246ce79d9bd1f9d205009bf739 100644 --- a/lustre/osc/cache.c +++ b/lustre/osc/cache.c @@ -137,16 +137,39 @@ int cache_add_extent(struct lustre_cache *cache, struct ldlm_res_id *res, RETURN(0); } +static void cache_extent_removal_get(struct page_removal_cb_element *element) +{ + atomic_inc(&element->prce_refcnt); +} + +static void cache_extent_removal_put(struct page_removal_cb_element *element) +{ + if(atomic_dec_and_test(&element->prce_refcnt)) + OBD_FREE_PTR(element); +} + static int cache_extent_removal_event(struct lustre_cache *cache, void *data, int discard) { struct page *page = data; + struct list_head *iter; struct page_removal_cb_element *element; - list_for_each_entry(element, &cache->lc_page_removal_callback_list, - prce_list) { + read_lock(&cache->lc_page_removal_cb_lock); + iter = cache->lc_page_removal_callback_list.next; + while(iter != &cache->lc_page_removal_callback_list) { + element = list_entry(iter, struct page_removal_cb_element, prce_list); + cache_extent_removal_get(element); + read_unlock(&cache->lc_page_removal_cb_lock); + element->prce_callback(page, discard); + + read_lock(&cache->lc_page_removal_cb_lock); + iter = iter->next; + 
cache_extent_removal_put(element); } + read_unlock(&cache->lc_page_removal_cb_lock); + return 0; } @@ -166,12 +189,17 @@ int cache_add_extent_removal_cb(struct lustre_cache *cache, if (!func_cb) return 0; - OBD_ALLOC(element, sizeof(*element)); + + OBD_ALLOC_PTR(element); if (!element) return -ENOMEM; element->prce_callback = func_cb; + atomic_set(&element->prce_refcnt, 1); + + write_lock(&cache->lc_page_removal_cb_lock); list_add_tail(&element->prce_list, &cache->lc_page_removal_callback_list); + write_unlock(&cache->lc_page_removal_cb_lock); cache->lc_pin_extent_cb = pin_cb; return 0; @@ -187,17 +215,21 @@ int cache_del_extent_removal_cb(struct lustre_cache *cache, int found = 0; struct page_removal_cb_element *element, *t; + write_lock(&cache->lc_page_removal_cb_lock); list_for_each_entry_safe(element, t, &cache->lc_page_removal_callback_list, prce_list) { if (element->prce_callback == func_cb) { list_del(&element->prce_list); - OBD_FREE(element, sizeof(*element)); + write_unlock(&cache->lc_page_removal_cb_lock); found = 1; + cache_extent_removal_put(element); + write_lock(&cache->lc_page_removal_cb_lock); /* We continue iterating the list in case this function was registered more than once */ } } + write_unlock(&cache->lc_page_removal_cb_lock); if (list_empty(&cache->lc_page_removal_callback_list)) cache->lc_pin_extent_cb = NULL; @@ -357,6 +389,7 @@ struct lustre_cache *cache_create(struct obd_device *obd) spin_lock_init(&cache->lc_locks_list_lock); CFS_INIT_LIST_HEAD(&cache->lc_locks_list); CFS_INIT_LIST_HEAD(&cache->lc_page_removal_callback_list); + rwlock_init(&cache->lc_page_removal_cb_lock); cache->lc_obd = obd; out: diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index c26d2e3494e3fe45175f6abd8c3930422d1b0fba..b82bf620851ec2f6860d8c302361c21e36afa65b 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -87,7 +87,8 @@ static int osc_interpret_create(struct ptlrpc_request *req, void *data, int rc) DEBUG_REQ(D_INODE, req, "Got 
EGAIN - resend \n"); break; case -ENOSPC: - case -EROFS: { + case -EROFS: + case -EFBIG: { oscc->oscc_flags |= OSCC_FLAG_NOSPC; if (body && rc == -ENOSPC) { oscc->oscc_grow_count = OST_MIN_PRECREATE; @@ -132,8 +133,14 @@ static int oscc_internal_create(struct osc_creator *oscc) LASSERT_SPIN_LOCKED(&oscc->oscc_lock); + if (oscc->oscc_flags & OSCC_FLAG_CREATING || + oscc->oscc_flags & OSCC_FLAG_RECOVERING) { + spin_unlock(&oscc->oscc_lock); + RETURN(0); + } + if (oscc->oscc_grow_count < OST_MAX_PRECREATE && - !(oscc->oscc_flags & (OSCC_FLAG_LOW | OSCC_FLAG_RECOVERING)) && + ((oscc->oscc_flags & OSCC_FLAG_LOW) == 0) && (__s64)(oscc->oscc_last_id - oscc->oscc_next_id) <= (oscc->oscc_grow_count / 4 + 1)) { oscc->oscc_flags |= OSCC_FLAG_LOW; @@ -143,11 +150,6 @@ static int oscc_internal_create(struct osc_creator *oscc) if (oscc->oscc_grow_count > OST_MAX_PRECREATE / 2) oscc->oscc_grow_count = OST_MAX_PRECREATE / 2; - if (oscc->oscc_flags & OSCC_FLAG_CREATING || - oscc->oscc_flags & OSCC_FLAG_RECOVERING) { - spin_unlock(&oscc->oscc_lock); - RETURN(0); - } oscc->oscc_flags |= OSCC_FLAG_CREATING; spin_unlock(&oscc->oscc_lock); @@ -322,6 +324,9 @@ int osc_create(struct obd_export *exp, struct obdo *oa, RETURN(0); } oscc->oscc_flags |= OSCC_FLAG_SYNC_IN_PROGRESS; + /* seting flag LOW we prevent extra grow precreate size + * and enforce use last assigned size */ + oscc->oscc_flags |= OSCC_FLAG_LOW; spin_unlock(&oscc->oscc_lock); CDEBUG(D_HA, "%s: oscc recovery started - delete to "LPU64"\n", oscc->oscc_obd->obd_name, oscc->oscc_next_id - 1); diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index 2f0f41b7d7af46edce496d172d1e44e73a6f0c33..b26fbf5d9ff0fa057b2318bca46c1cb2f985382f 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -93,5 +93,10 @@ static inline int osc_should_resend(int resend, struct client_obd *cli) atomic_read(&cli->cl_resends) > resend : 1; } +static inline int osc_exp_is_2_0_server(struct obd_export *exp) { + 
LASSERT(exp); + return !!(exp->exp_connect_flags & OBD_CONNECT_FID); +} + #endif /* OSC_INTERNAL_H */ diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index c96ca76752401cdacc25a7eb8213fa35c63edb75..a9a6f9c04f1e598d4d9933b216495d8c1aa2f095 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -63,6 +63,7 @@ static quota_interface_t *quota_interface = NULL; extern quota_interface_t osc_quota_interface; static void osc_release_ppga(struct brw_page **ppga, obd_count count); +static int brw_interpret(struct ptlrpc_request *request, void *data, int rc); int osc_cleanup(struct obd_device *obd); static quota_interface_t *quota_interface; @@ -319,12 +320,17 @@ static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo, { struct ptlrpc_request *req; struct ost_body *body; - int size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; + int size[3] = { sizeof(struct ptlrpc_body), sizeof(*body), 0 }; + int bufcount = 2; struct osc_async_args *aa; ENTRY; + if (osc_exp_is_2_0_server(exp)) { + bufcount = 3; + } + req = ptlrpc_prep_req(class_exp2cliimp(exp), LUSTRE_OST_VERSION, - OST_SETATTR, 2, size, NULL); + OST_SETATTR, bufcount, size, NULL); if (!req) RETURN(-ENOMEM); @@ -548,16 +554,18 @@ static int osc_sync(struct obd_export *exp, struct obdo *oa, /* Find and cancel locally locks matched by @mode in the resource found by * @objid. Found locks are added into @cancel list. Returns the amount of * locks added to @cancels list. 
*/ -static int osc_resource_get_unused(struct obd_export *exp, __u64 objid, +static int osc_resource_get_unused(struct obd_export *exp, struct obdo *oa, struct list_head *cancels, ldlm_mode_t mode, int lock_flags) { struct ldlm_namespace *ns = exp->exp_obd->obd_namespace; - struct ldlm_res_id res_id = { .name = { objid } }; - struct ldlm_resource *res = ldlm_resource_get(ns, NULL, res_id, 0, 0); + struct ldlm_res_id res_id; + struct ldlm_resource *res; int count; ENTRY; + osc_build_res_name(oa->o_id, oa->o_gr, &res_id); + res = ldlm_resource_get(ns, NULL, res_id, 0, 0); if (res == NULL) RETURN(0); @@ -625,7 +633,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa, LASSERT(oa->o_id != 0); - count = osc_resource_get_unused(exp, oa->o_id, &cancels, LCK_PW, + count = osc_resource_get_unused(exp, oa, &cancels, LCK_PW, LDLM_FL_DISCARD_DATA); if (exp_connect_cancelset(exp)) bufcount = 3; @@ -814,7 +822,7 @@ static void osc_update_grant(struct client_obd *cli, struct ost_body *body) CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant); if (body->oa.o_valid & OBD_MD_FLGRANT) cli->cl_avail_grant += body->oa.o_grant; - /* waiters are woken in brw_interpret_oap */ + /* waiters are woken in brw_interpret */ client_obd_list_unlock(&cli->cl_loi_list_lock); } @@ -870,7 +878,7 @@ static int check_write_rcs(struct ptlrpc_request *req, CERROR("Missing/short RC vector on BRW_WRITE reply\n"); return(-EPROTO); } - if (lustre_msg_swabbed(req->rq_repmsg)) + if (lustre_rep_need_swab(req)) for (i = 0; i < niocount; i++) __swab32s(&remote_rcs[i]); @@ -1425,33 +1433,6 @@ int osc_brw_redo_request(struct ptlrpc_request *request, RETURN(0); } -static int brw_interpret(struct ptlrpc_request *request, void *data, int rc) -{ - struct osc_brw_async_args *aa = data; - int i; - ENTRY; - - rc = osc_brw_fini_request(request, rc); - CDEBUG(D_INODE, "request %p aa %p rc %d\n", request, aa, rc); - if (osc_recoverable_error(rc)) { - rc = osc_brw_redo_request(request, aa); - if (rc 
== 0) - RETURN(0); - } - client_obd_list_lock(&aa->aa_cli->cl_loi_list_lock); - if (lustre_msg_get_opc(request->rq_reqmsg) == OST_WRITE) - aa->aa_cli->cl_w_in_flight--; - else - aa->aa_cli->cl_r_in_flight--; - - for (i = 0; i < aa->aa_page_count; i++) - osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1); - client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock); - osc_release_ppga(aa->aa_ppga, aa->aa_page_count); - - RETURN(rc); -} - static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *lsm, obd_count page_count, struct brw_page **pga, struct ptlrpc_request_set *set) @@ -1487,6 +1468,7 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa, cli->cl_w_in_flight); ptlrpc_lprocfs_brw(request, OST_WRITE, aa->aa_requested_nob); } + LASSERT(list_empty(&aa->aa_oaps)); if (rc == 0) { request->rq_interpret_reply = brw_interpret; @@ -1497,10 +1479,12 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa, else cli->cl_w_in_flight++; client_obd_list_unlock(&cli->cl_loi_list_lock); + OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DIO_PAUSE, 3); } else if (cmd == OBD_BRW_WRITE) { client_obd_list_lock(&cli->cl_loi_list_lock); for (i = 0; i < page_count; i++) osc_release_write_grant(cli, pga[i], 0); + osc_wake_cache_waiters(cli); client_obd_list_unlock(&cli->cl_loi_list_lock); } @@ -1957,10 +1941,9 @@ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa, EXIT; } -static int brw_interpret_oap(struct ptlrpc_request *request, void *data, int rc) +static int brw_interpret(struct ptlrpc_request *request, void *data, int rc) { struct osc_brw_async_args *aa = data; - struct osc_async_page *oap, *tmp; struct client_obd *cli; ENTRY; @@ -1983,19 +1966,24 @@ static int brw_interpret_oap(struct ptlrpc_request *request, void *data, int rc) else cli->cl_r_in_flight--; - /* the caller may re-use the oap after the completion call so - * we need to clean it up a little */ - list_for_each_entry_safe(oap, 
tmp, &aa->aa_oaps, oap_rpc_item) { - list_del_init(&oap->oap_rpc_item); - osc_ap_completion(cli, aa->aa_oa, oap, 1, rc); + if (!list_empty(&aa->aa_oaps)) { /* from osc_send_oap_rpc() */ + struct osc_async_page *oap, *tmp; + /* the caller may re-use the oap after the completion call so + * we need to clean it up a little */ + list_for_each_entry_safe(oap, tmp, &aa->aa_oaps, oap_rpc_item) { + list_del_init(&oap->oap_rpc_item); + osc_ap_completion(cli, aa->aa_oa, oap, 1, rc); + } + OBDO_FREE(aa->aa_oa); + } else { /* from async_internal() */ + int i; + for (i = 0; i < aa->aa_page_count; i++) + osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1); } - osc_wake_cache_waiters(cli); osc_check_rpcs(cli); client_obd_list_unlock(&cli->cl_loi_list_lock); - OBDO_FREE(aa->aa_oa); - osc_release_ppga(aa->aa_ppga, aa->aa_page_count); RETURN(rc); } @@ -2080,6 +2068,17 @@ out: /* the loi lock is held across this function but it's allowed to release * and reacquire it during its work */ +/** + * prepare pages for ASYNC io and put pages in send queue. + * + * \param cli - + * \param loi - + * \param cmd - OBD_BRW_* macroses + * \param lop - pending pages + * + * \return zero if pages successfully add to send queue. + * \return not zere if error occurring. + */ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, int cmd, struct loi_oap_pages *lop) { @@ -2153,12 +2152,14 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, /* * Page submitted for IO has to be locked. Either by * ->ap_make_ready() or by higher layers. - * - * XXX nikita: this assertion should be adjusted when lustre - * starts using PG_writeback for pages being written out. 
*/ #if defined(__KERNEL__) && defined(__linux__) - LASSERT(PageLocked(oap->oap_page)); + if(!(PageLocked(oap->oap_page) && + (CheckWriteback(oap->oap_page, cmd) || oap->oap_oig !=NULL))) { + CDEBUG(D_PAGE, "page %p lost wb %lx/%x\n", + oap->oap_page, (long)oap->oap_page->flags, oap->oap_async_flags); + LBUG(); + } #endif /* If there is a gap at the start of this page, it can't merge * with any previous page, so we'll hand the network a @@ -2282,7 +2283,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight", page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight); - req->rq_interpret_reply = brw_interpret_oap; + req->rq_interpret_reply = brw_interpret; ptlrpcd_add_req(req); RETURN(1); } @@ -2465,6 +2466,35 @@ static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi, RETURN(-EDQUOT); } +static int osc_reget_short_lock(struct obd_export *exp, + struct lov_stripe_md *lsm, + void **res, int rw, + obd_off start, obd_off end, + void **cookie) +{ + struct osc_async_page *oap = *res; + int rc; + + ENTRY; + + spin_lock(&oap->oap_lock); + rc = ldlm_lock_fast_match(oap->oap_ldlm_lock, rw, + start, end, cookie); + spin_unlock(&oap->oap_lock); + + RETURN(rc); +} + +static int osc_release_short_lock(struct obd_export *exp, + struct lov_stripe_md *lsm, obd_off end, + void *cookie, int rw) +{ + ENTRY; + ldlm_lock_fast_release(cookie, rw); + /* no error could have happened at this layer */ + RETURN(0); +} + int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, struct lov_oinfo *loi, cfs_page_t *page, obd_off offset, struct obd_async_page_ops *ops, @@ -2474,7 +2504,7 @@ int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, struct osc_async_page *oap; struct ldlm_res_id oid = {{0}}; int rc = 0; - + ENTRY; if (!page) @@ -2502,7 +2532,7 @@ int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, /* If the page was marked 
as notcacheable - don't add to any locks */ if (!nocache) { - oid.name[0] = loi->loi_id; + osc_build_res_name(loi->loi_id, loi->loi_gr, &oid); /* This is the only place where we can call cache_add_extent without oap_lock, because this page is locked now, and the lock we are adding it to is referenced, so cannot lose @@ -2881,9 +2911,10 @@ static void osc_set_data_with_check(struct lustre_handle *lockh, void *data, static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, ldlm_iterator_t replace, void *data) { - struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; + struct ldlm_res_id res_id; struct obd_device *obd = class_exp2obd(exp); + osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_gr, &res_id); ldlm_resource_iterate(obd->obd_namespace, &res_id, replace, data); return 0; } @@ -2967,7 +2998,7 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, struct ldlm_enqueue_info *einfo, struct ptlrpc_request_set *rqset) { - struct ldlm_res_id res_id = { .name = {oinfo->oi_md->lsm_object_id} }; + struct ldlm_res_id res_id; struct obd_device *obd = exp->exp_obd; struct ldlm_reply *rep; struct ptlrpc_request *req = NULL; @@ -2976,6 +3007,8 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, int rc; ENTRY; + osc_build_res_name(oinfo->oi_md->lsm_object_id, + oinfo->oi_md->lsm_object_gr, &res_id); /* Filesystem lock extents are extended to page boundaries so that * dealing with the page cache is a little smoother. 
*/ oinfo->oi_policy.l_extent.start -= @@ -3083,12 +3116,14 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm, __u32 type, ldlm_policy_data_t *policy, __u32 mode, int *flags, void *data, struct lustre_handle *lockh) { - struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; + struct ldlm_res_id res_id; struct obd_device *obd = exp->exp_obd; int lflags = *flags; ldlm_mode_t rc; ENTRY; + osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_gr, &res_id); + OBD_FAIL_RETURN(OBD_FAIL_OSC_MATCH, -EIO); /* Filesystem lock extents are extended to page boundaries so that @@ -3134,19 +3169,30 @@ static int osc_cancel_unused(struct obd_export *exp, struct lov_stripe_md *lsm, int flags, void *opaque) { struct obd_device *obd = class_exp2obd(exp); - struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; + struct ldlm_res_id res_id, *resp = NULL; + + if (lsm != NULL) { + resp = osc_build_res_name(lsm->lsm_object_id, + lsm->lsm_object_gr, &res_id); + } + + return ldlm_cli_cancel_unused(obd->obd_namespace, resp, flags, opaque); - return ldlm_cli_cancel_unused(obd->obd_namespace, &res_id, flags, - opaque); } static int osc_join_lru(struct obd_export *exp, struct lov_stripe_md *lsm, int join) { struct obd_device *obd = class_exp2obd(exp); - struct ldlm_res_id res_id = { .name = {lsm->lsm_object_id} }; + struct ldlm_res_id res_id, *resp = NULL; + + if (lsm != NULL) { + resp = osc_build_res_name(lsm->lsm_object_id, + lsm->lsm_object_gr, &res_id); + } + + return ldlm_cli_join_lru(obd->obd_namespace, resp, join); - return ldlm_cli_join_lru(obd->obd_namespace, &res_id, join); } static int osc_statfs_interpret(struct ptlrpc_request *req, @@ -3213,17 +3259,29 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs, { struct obd_statfs *msfs; struct ptlrpc_request *req; + struct obd_import *imp = NULL; int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(*osfs) }; ENTRY; + /*Since the request might also come from lprocfs, so we need + 
*sync this with client_disconnect_export Bug15684*/ + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) + imp = class_import_get(obd->u.cli.cl_import); + up_read(&obd->u.cli.cl_sem); + if (!imp) + RETURN(-ENODEV); + /* We could possibly pass max_age in the request (as an absolute * timestamp or a "seconds.usec ago") so the target can avoid doing * extra calls into the filesystem if that isn't necessary (e.g. * during mount that would help a bit). Having relative timestamps * is not so great if request processing is slow, while absolute * timestamps are not ideal because they need time synchronization. */ - req = ptlrpc_prep_req(obd->u.cli.cl_import, LUSTRE_OST_VERSION, + req = ptlrpc_prep_req(imp, LUSTRE_OST_VERSION, OST_STATFS, 1, NULL, NULL); + + class_import_put(imp); if (!req) RETURN(-ENOMEM); @@ -3416,12 +3474,12 @@ static int osc_get_info(struct obd_export *exp, obd_count keylen, if (!vallen || !val) RETURN(-EFAULT); - if (KEY_IS("lock_to_stripe")) { + if (KEY_IS(KEY_LOCK_TO_STRIPE)) { __u32 *stripe = val; *vallen = sizeof(*stripe); *stripe = 0; RETURN(0); - } else if (KEY_IS("last_id")) { + } else if (KEY_IS(KEY_LAST_ID)) { struct ptlrpc_request *req; obd_id *reply; char *bufs[2] = { NULL, key }; @@ -3505,7 +3563,7 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(0); } - if (KEY_IS("unlinked")) { + if (KEY_IS(KEY_UNLINKED)) { struct osc_creator *oscc = &obd->u.cli.cl_oscc; spin_lock(&oscc->oscc_lock); oscc->oscc_flags &= ~OSCC_FLAG_NOSPC; @@ -3525,7 +3583,7 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(0); } - if (KEY_IS("checksum")) { + if (KEY_IS(KEY_CHECKSUM)) { if (vallen != sizeof(int)) RETURN(-EINVAL); exp->exp_obd->u.cli.cl_checksum = (*(int *)val) ? 
1 : 0; @@ -3719,6 +3777,7 @@ static int osc_import_event(struct obd_device *obd, oscc->oscc_flags &= ~OSCC_FLAG_NOSPC; spin_unlock(&oscc->oscc_lock); } + CDEBUG(D_INFO, "notify server \n"); rc = obd_notify_observer(obd, obd, OBD_NOTIFY_ACTIVE, NULL); break; } @@ -3768,7 +3827,7 @@ int osc_setup(struct obd_device *obd, obd_count len, void *buf) oscc_init(obd); /* We need to allocate a few requests more, because - brw_interpret_oap tries to create new requests before freeing + brw_interpret tries to create new requests before freeing previous ones. Ideally we want to have 2x max_rpcs_in_flight reserved, but I afraid that might be too much wasted RAM in fact, so 2 is just my guess and still should work. */ @@ -3813,13 +3872,13 @@ static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) class_destroy_import(imp); obd->u.cli.cl_import = NULL; } - break; - } - case OBD_CLEANUP_SELF_EXP: rc = obd_llog_finish(obd, 0); if (rc != 0) CERROR("failed to cleanup llogging subsystems\n"); break; + } + case OBD_CLEANUP_SELF_EXP: + break; case OBD_CLEANUP_OBD: break; } @@ -3923,6 +3982,8 @@ struct obd_ops osc_obd_ops = { .o_brw = osc_brw, .o_brw_async = osc_brw_async, .o_prep_async_page = osc_prep_async_page, + .o_reget_short_lock = osc_reget_short_lock, + .o_release_short_lock = osc_release_short_lock, .o_queue_async_io = osc_queue_async_io, .o_set_async_flags = osc_set_async_flags, .o_queue_group_io = osc_queue_group_io, diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index e191bf2e872746af9f26dc2da129337fa85390cd..f2a43eafd932a0aaa1849c467d6750a4443ffc76 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -725,7 +725,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) CERROR("Missing/short niobuf\n"); GOTO(out, rc = -EFAULT); } - if (lustre_msg_swabbed(req->rq_reqmsg)) { /* swab remaining niobufs */ + if (lustre_req_need_swab(req)) { + /* swab remaining niobufs */ for (i = 1; i < 
niocount; i++) lustre_swab_niobuf_remote (&remote_nb[i]); } @@ -967,7 +968,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) if (exp->exp_failed) GOTO(out, rc = -ENOTCONN); - swab = lustre_msg_swabbed(req->rq_reqmsg); + swab = lustre_req_need_swab(req); body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body), lustre_swab_ost_body); if (body == NULL) { @@ -975,7 +976,6 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) GOTO(out, rc = -EFAULT); } - lustre_set_req_swabbed(req, REQ_REC_OFF + 1); objcount = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF + 1) / sizeof(*ioo); if (objcount == 0) { @@ -987,6 +987,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) GOTO(out, rc = -EFAULT); } + lustre_set_req_swabbed(req, REQ_REC_OFF + 1); ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, objcount * sizeof(*ioo)); LASSERT (ioo != NULL); @@ -1293,7 +1294,7 @@ static int ost_set_info(struct obd_export *exp, struct ptlrpc_request *req) if (vallen) val = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, 0); - if (KEY_IS("evict_by_nid")) { + if (KEY_IS(KEY_EVICT_BY_NID)) { if (val && vallen) obd_export_evict_by_nid(exp->exp_obd, val); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index b7b22eca7db3b98047bbcad1b4a642a1daf49bf8..1f2a4cb02511885dc7e025d8b2ad8f363a0bda44 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -258,9 +258,9 @@ static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req) time_t now = cfs_time_current_sec(); LASSERT(req->rq_import); - + st = lustre_msg_get_service_time(req->rq_repmsg); - + /* Network latency is total time less server processing time */ nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/; if (st > now - req->rq_sent + 2 /* rounding */) @@ -281,15 +281,16 @@ static int unpack_reply(struct ptlrpc_request *req) { int rc; - /* Clear reply swab mask; we may have already swabbed an early reply */ 
req->rq_rep_swab_mask = 0; - rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received); - if (rc) { + if (rc < 0) { DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); return(-EPROTO); } + if (rc > 0) + lustre_set_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); if (rc) { DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); @@ -547,7 +548,6 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode, atomic_set(&request->rq_refcount, 1); lustre_msg_set_opc(request->rq_reqmsg, opcode); - lustre_msghdr_set_flags(request->rq_reqmsg, imp->imp_msghdr_flags); RETURN(request); } @@ -851,14 +851,9 @@ static int after_reply(struct ptlrpc_request *req) RETURN(rc); } } else { - /* Let's look if server send slv. Do it only for RPC with + /* Let's look if server sent slv. Do it only for RPC with * rc == 0. */ - if (imp->imp_obd->obd_namespace) { - /* Disconnect rpc is sent when namespace is already - * destroyed. Let's check this and will not try update - * pool. */ - ldlm_cli_update_pool(req); - } + ldlm_cli_update_pool(req); } /* Store transno in reqmsg for replay. 
*/ @@ -939,7 +934,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) lustre_msg_set_status(req->rq_reqmsg, cfs_curproc_pid()); CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc" - " %s:%s:%d:"LPU64":%s:%d\n", cfs_curproc_comm(), + " %s:%s:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(), imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, libcfs_nid2str(imp->imp_connection->c_peer.nid), @@ -1171,7 +1166,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) req->rq_phase = RQ_PHASE_COMPLETE; CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:nid:" - "opc %s:%s:%d:"LPU64":%s:%d\n", cfs_curproc_comm(), + "opc %s:%s:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(), imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, libcfs_nid2str(imp->imp_connection->c_peer.nid), @@ -1778,7 +1773,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) lustre_msg_set_status(req->rq_reqmsg, cfs_curproc_pid()); LASSERT(imp->imp_obd != NULL); CDEBUG(D_RPCTRACE, "Sending RPC pname:cluuid:pid:xid:nid:opc " - "%s:%s:%d:"LPU64":%s:%d\n", cfs_curproc_comm(), + "%s:%s:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(), imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, libcfs_nid2str(imp->imp_connection->c_peer.nid), @@ -1885,7 +1880,7 @@ restart: } CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:nid:opc " - "%s:%s:%d:"LPU64":%s:%d\n", cfs_curproc_comm(), + "%s:%s:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(), imp->imp_obd->obd_uuid.uuid, lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, libcfs_nid2str(imp->imp_connection->c_peer.nid), diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index fc2c87b8e02e2d3ffdb27ede5c4d4aca33f0de6c..3125de847eeff234875a1010303c75033b16a1f1 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -199,7 +199,14 @@ void ptlrpc_invalidate_import(struct obd_import *imp) atomic_inc(&imp->imp_inval_count); - if (!imp->imp_invalid) + /* + * If this is 
an invalid MGC connection, then don't bother + * waiting for imp_inflight to drop to 0. + */ + if (imp->imp_invalid && imp->imp_recon_bk && !imp->imp_obd->obd_no_recov) + goto out; + + if (!imp->imp_invalid || imp->imp_obd->obd_no_recov) ptlrpc_deactivate_import(imp); LASSERT(imp->imp_invalid); @@ -230,8 +237,9 @@ void ptlrpc_invalidate_import(struct obd_import *imp) LASSERT(atomic_read(&imp->imp_inflight) == 0); } + out: obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); - + atomic_dec(&imp->imp_inval_count); cfs_waitq_signal(&imp->imp_recovery_waitq); } @@ -488,7 +496,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) /* Don't retry if connect fails */ rc = 0; obd_set_info_async(obd->obd_self_export, - strlen(KEY_INIT_RECOV), + sizeof(KEY_INIT_RECOV), KEY_INIT_RECOV, sizeof(rc), &rc, NULL); } @@ -655,7 +663,14 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, *lustre_msg_get_handle(request->rq_repmsg); IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); - ptlrpc_activate_import(imp); + spin_lock(&imp->imp_lock); + if (imp->imp_invalid) { + spin_unlock(&imp->imp_lock); + ptlrpc_activate_import(imp); + } else { + spin_unlock(&imp->imp_lock); + } + GOTO(finish, rc = 0); } else { spin_unlock(&imp->imp_lock); diff --git a/lustre/ptlrpc/llog_client.c b/lustre/ptlrpc/llog_client.c index 27578a4f5eb15a0a143bf53347b0314b13fa4778..ff5fddd7bde9b9aeab545f97cc5a7e9f4ab11601 100644 --- a/lustre/ptlrpc/llog_client.c +++ b/lustre/ptlrpc/llog_client.c @@ -43,6 +43,31 @@ #include <lustre_net.h> #include <libcfs/list.h> +#define LLOG_CLIENT_ENTRY(ctxt, imp) do { \ + mutex_down(&ctxt->loc_sem); \ + if (ctxt->loc_imp) { \ + imp = class_import_get(ctxt->loc_imp); \ + } else { \ + CERROR("ctxt->loc_imp == NULL for context idx %d." \ + "Unable to complete MDS/OSS recovery," \ + "but I'll try again next time. 
Not fatal.\n", \ + ctxt->loc_idx); \ + imp = NULL; \ + mutex_up(&ctxt->loc_sem); \ + return (-EINVAL); \ + } \ + mutex_up(&ctxt->loc_sem); \ +} while(0) + +#define LLOG_CLIENT_EXIT(ctxt, imp) do { \ + mutex_down(&ctxt->loc_sem); \ + if (ctxt->loc_imp != imp) \ + CWARN("loc_imp has changed from %p to %p", \ + ctxt->loc_imp, imp); \ + class_import_put(imp); \ + mutex_up(&ctxt->loc_sem); \ +} while(0) + /* This is a callback from the llog_* functions. * Assumes caller has already pushed us into the kernel context. */ static int llog_client_create(struct llog_ctxt *ctxt, struct llog_handle **res, @@ -59,18 +84,11 @@ static int llog_client_create(struct llog_ctxt *ctxt, struct llog_handle **res, int rc; ENTRY; - if (ctxt->loc_imp == NULL) { - /* This used to be an assert; bug 6200 */ - CERROR("ctxt->loc_imp == NULL for context idx %d. Unable to " - "complete MDS/OSS recovery, but I'll try again next " - "time. Not fatal.\n", ctxt->loc_idx); - RETURN(-EINVAL); - } - imp = ctxt->loc_imp; + LLOG_CLIENT_ENTRY(ctxt, imp); handle = llog_alloc_handle(); if (handle == NULL) - RETURN(-ENOMEM); + GOTO(out, rc = -ENOMEM); *res = handle; memset(&req_body, 0, sizeof(req_body)); @@ -107,6 +125,7 @@ static int llog_client_create(struct llog_ctxt *ctxt, struct llog_handle **res, out: if (req) ptlrpc_req_finished(req); + LLOG_CLIENT_EXIT(ctxt, imp); RETURN(rc); err_free: @@ -116,17 +135,18 @@ err_free: static int llog_client_destroy(struct llog_handle *loghandle) { - struct obd_import *imp = loghandle->lgh_ctxt->loc_imp; + struct obd_import *imp; struct ptlrpc_request *req = NULL; struct llogd_body *body; int size[] = { sizeof(struct ptlrpc_body), sizeof(*body) }; int rc; ENTRY; + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, LLOG_ORIGIN_HANDLE_DESTROY, 2, size, NULL); if (!req) - RETURN(-ENOMEM); + GOTO(out, rc = -ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); body->lgd_logid = loghandle->lgh_id; @@ -136,6 
+156,8 @@ static int llog_client_destroy(struct llog_handle *loghandle) rc = ptlrpc_queue_wait(req); ptlrpc_req_finished(req); +out: + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); RETURN(rc); } @@ -144,7 +166,7 @@ static int llog_client_next_block(struct llog_handle *loghandle, int *cur_idx, int next_idx, __u64 *cur_offset, void *buf, int len) { - struct obd_import *imp = loghandle->lgh_ctxt->loc_imp; + struct obd_import *imp; struct ptlrpc_request *req = NULL; struct llogd_body *body; void * ptr; @@ -152,10 +174,11 @@ static int llog_client_next_block(struct llog_handle *loghandle, int rc; ENTRY; + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, LLOG_ORIGIN_HANDLE_NEXT_BLOCK, 2, size, NULL); if (!req) - GOTO(out, rc = -ENOMEM); + GOTO(out, rc =-ENOMEM); body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); body->lgd_logid = loghandle->lgh_id; @@ -194,13 +217,14 @@ static int llog_client_next_block(struct llog_handle *loghandle, out: if (req) ptlrpc_req_finished(req); + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); RETURN(rc); } static int llog_client_prev_block(struct llog_handle *loghandle, int prev_idx, void *buf, int len) { - struct obd_import *imp = loghandle->lgh_ctxt->loc_imp; + struct obd_import *imp; struct ptlrpc_request *req = NULL; struct llogd_body *body; void * ptr; @@ -208,6 +232,7 @@ static int llog_client_prev_block(struct llog_handle *loghandle, int rc; ENTRY; + LLOG_CLIENT_ENTRY(loghandle->lgh_ctxt, imp); req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, LLOG_ORIGIN_HANDLE_PREV_BLOCK, 2, size, NULL); if (!req) @@ -244,12 +269,13 @@ static int llog_client_prev_block(struct llog_handle *loghandle, out: if (req) ptlrpc_req_finished(req); + LLOG_CLIENT_EXIT(loghandle->lgh_ctxt, imp); RETURN(rc); } static int llog_client_read_header(struct llog_handle *handle) { - struct obd_import *imp = handle->lgh_ctxt->loc_imp; + struct obd_import *imp; struct ptlrpc_request *req = NULL; struct llogd_body *body; 
struct llog_log_hdr *hdr; @@ -259,6 +285,7 @@ static int llog_client_read_header(struct llog_handle *handle) int rc; ENTRY; + LLOG_CLIENT_ENTRY(handle->lgh_ctxt, imp); req = ptlrpc_prep_req(imp, LUSTRE_LOG_VERSION, LLOG_ORIGIN_HANDLE_READ_HEADER, 2, size, NULL); if (!req) @@ -301,6 +328,7 @@ static int llog_client_read_header(struct llog_handle *handle) out: if (req) ptlrpc_req_finished(req); + LLOG_CLIENT_EXIT(handle->lgh_ctxt, imp); RETURN(rc); } diff --git a/lustre/ptlrpc/llog_net.c b/lustre/ptlrpc/llog_net.c index 3779ed5a52c1e12eb5e590a2783e2db0ec2a50eb..8d955d3f9c5a15a1690eaa8abb7a00b27192fbd0 100644 --- a/lustre/ptlrpc/llog_net.c +++ b/lustre/ptlrpc/llog_net.c @@ -147,20 +147,18 @@ int llog_receptor_accept(struct llog_ctxt *ctxt, struct obd_import *imp) { ENTRY; LASSERT(ctxt); - ctxt->loc_imp = imp; + mutex_down(&ctxt->loc_sem); + if (ctxt->loc_imp != imp) { + CWARN("changing the import %p - %p\n", ctxt->loc_imp, imp); + if (ctxt->loc_imp) + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = class_import_get(imp); + } + mutex_up(&ctxt->loc_sem); RETURN(0); } EXPORT_SYMBOL(llog_receptor_accept); -int llog_initiator_connect(struct llog_ctxt *ctxt) -{ - ENTRY; - LASSERT(ctxt); - ctxt->loc_imp = ctxt->loc_obd->u.cli.cl_import; - RETURN(0); -} -EXPORT_SYMBOL(llog_initiator_connect); - #else /* !__KERNEL__ */ int llog_origin_connect(struct llog_ctxt *ctxt, int count, @@ -169,9 +167,21 @@ int llog_origin_connect(struct llog_ctxt *ctxt, int count, { return 0; } +#endif int llog_initiator_connect(struct llog_ctxt *ctxt) { - return 0; + struct obd_import *new_imp; + ENTRY; + LASSERT(ctxt); + new_imp = ctxt->loc_obd->u.cli.cl_import; + mutex_down(&ctxt->loc_sem); + if (ctxt->loc_imp != new_imp) { + if (ctxt->loc_imp) + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = class_import_get(new_imp); + } + mutex_up(&ctxt->loc_sem); + RETURN(0); } -#endif +EXPORT_SYMBOL(llog_initiator_connect); diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 
8a2c3756a2242fbe55ca89406708616ee8e1c969..a8811551342ae02fbf9e4d1d62ca89dea2132ba8 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -104,6 +104,8 @@ struct ll_rpc_opcode { { LLOG_CATINFO, "llog_catinfo" }, { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "llog_origin_handle_prev_block" }, { LLOG_ORIGIN_HANDLE_DESTROY, "llog_origin_handle_destroy" }, + { FLD_QUERY, "fld_query" }, + { SEQ_QUERY, "seq_query" }, }; struct ll_eopcode { diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index d8960d22c65a26cc7c0ca40f5b6abd1c914548ae..39282b4770370af0a4847b1bfcdfb2cb65c1455d 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -360,11 +360,17 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int flags) } /* Report actual service time for client latency calc */ lustre_msg_set_service_time(req->rq_repmsg, service_time); - /* Report service time estimate for future client reqs */ - lustre_msg_set_timeout(req->rq_repmsg, at_get(&svc->srv_at_estimate)); + /* Report service time estimate for future client reqs, but report 0 + * (to be ignored by client) if it's a error reply during recovery. 
+ * (bz15815) */ + if (req->rq_type == PTL_RPC_MSG_ERR && + (req->rq_export == NULL || req->rq_export->exp_obd->obd_recovering)) + lustre_msg_set_timeout(req->rq_repmsg, 0); + else + lustre_msg_set_timeout(req->rq_repmsg, + at_get(&svc->srv_at_estimate)); - if (req->rq_export && req->rq_export->exp_obd) - target_pack_pool_reply(req); + target_pack_pool_reply(req); if (lustre_msghdr_get_flags(req->rq_reqmsg) & MSGHDR_AT_SUPPORT) { /* early replies go to offset 0, regular replies go after that*/ @@ -417,7 +423,7 @@ int ptlrpc_reply (struct ptlrpc_request *req) return (ptlrpc_send_reply (req, 0)); } -int ptlrpc_error(struct ptlrpc_request *req) +int ptlrpc_send_error(struct ptlrpc_request *req, int may_be_difficult) { int rc; ENTRY; @@ -430,10 +436,15 @@ int ptlrpc_error(struct ptlrpc_request *req) req->rq_type = PTL_RPC_MSG_ERR; - rc = ptlrpc_send_reply(req, 0); + rc = ptlrpc_send_reply(req, may_be_difficult); RETURN(rc); } +int ptlrpc_error(struct ptlrpc_request *req) +{ + return ptlrpc_send_error(req, 0); +} + int ptl_send_rpc(struct ptlrpc_request *request, int noreply) { int rc; @@ -474,6 +485,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) lustre_msg_set_type(request->rq_reqmsg, PTL_RPC_MSG_REQUEST); lustre_msg_set_conn_cnt(request->rq_reqmsg, request->rq_import->imp_conn_cnt); + lustre_msghdr_set_flags(request->rq_reqmsg, + request->rq_import->imp_msghdr_flags); if (!noreply) { LASSERT (request->rq_replen != 0); @@ -529,7 +542,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) /* ...but the MD attach didn't succeed... 
*/ request->rq_receiving_reply = 0; spin_unlock(&request->rq_lock); - GOTO(cleanup_me, rc -ENOMEM); + GOTO(cleanup_me, rc = -ENOMEM); } CDEBUG(D_NET, "Setup reply buffer: %u bytes, xid "LPU64 diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 3f3d746538a00b44b7efd81ff92a99d08fb6724f..6cf2c9a581b3091f2c2401c0875948a614185ad5 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -41,8 +41,6 @@ #if LUSTRE_VERSION_CODE > OBD_OCD_VERSION(1,8,0,0) #error "lustre_msg_v1 has been deprecated since 1.6.0, please remove it" -#elif LUSTRE_VERSION_CODE > OBD_OCD_VERSION(1,6,50,0) -#warning "lustre_msg_v1 has been deprecated since 1.6.0, consider removing it" #endif static inline int lustre_msg_hdr_size_v1(int count) @@ -55,14 +53,13 @@ static inline int lustre_msg_hdr_size_v2(int count) return size_round(offsetof(struct lustre_msg_v2, lm_buflens[count])); } -int lustre_msg_swabbed(struct lustre_msg *msg) +static int lustre_msg_need_swab(struct lustre_msg *msg) { return (msg->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED) || (msg->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED); } -static inline int -lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, __u32 version) +int lustre_msg_check_version_v2(struct lustre_msg_v2 *msg, __u32 version) { __u32 ver = lustre_msg_get_version(msg); return (ver & LUSTRE_VERSION_MASK) != version; @@ -72,21 +69,40 @@ int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 0; case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: return lustre_msg_check_version_v2(msg, version); default: - LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + CERROR("incorrect message magic: %08x\n", msg->lm_magic); return -EINVAL; } } +static int ptlrpc_reqbuf_need_swab(struct ptlrpc_request *req, int index) +{ + int swabb; + + swabb = (!lustre_req_need_swab(req)) || + lustre_req_swabbed(req, 
index); + + return !swabb; +} + +static int ptlrpc_repbuf_need_swab(struct ptlrpc_request *req, int index) +{ + int swabb; + + swabb = (!lustre_rep_need_swab(req)) || + lustre_rep_swabbed(req, index); + + return !swabb; +} + + /* early reply size */ int lustre_msg_early_size() { static int size = 0; - if (!size) + if (!size) size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL); return size; } @@ -346,7 +362,7 @@ static struct ptlrpc_reply_state *lustre_get_emerg_rs(struct ptlrpc_service *svc goto out; spin_lock(&svc->srv_lock); } - + rs = list_entry(svc->srv_free_rs_list.next, struct ptlrpc_reply_state, rs_list); list_del(&rs->rs_list); @@ -460,14 +476,12 @@ int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, int *lens, switch (req->rq_reqmsg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return lustre_pack_reply_v1(req, count - 1, lens + 1, bufs ? bufs + 1 : NULL, flags); case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: return lustre_pack_reply_v2(req, count, lens, bufs, flags); default: - LASSERTF(0, "incorrect message magic: %08x\n", + CERROR("incorrect message magic: %08x\n", req->rq_reqmsg->lm_magic); return -EINVAL; } @@ -546,13 +560,11 @@ void *lustre_msg_buf(struct lustre_msg *m, int n, int min_size) { switch (m->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return lustre_msg_buf_v1(m, n - 1, min_size); case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: return lustre_msg_buf_v2(m, n, min_size); default: - LASSERTF(0, "incorrect message magic: %08x\n", m->lm_magic); + CERROR("incorrect message magic: %08x\n", m->lm_magic); return NULL; } } @@ -706,9 +718,9 @@ int lustre_unpack_msg_v1(void *msg, int len) RETURN(-EINVAL); } - flipped = lustre_msg_swabbed((struct lustre_msg *)m); - + flipped = m->lm_magic == LUSTRE_MSG_MAGIC_V1_SWABBED; if (flipped) { + __swab32s(&m->lm_magic); __swab32s(&m->lm_type); __swab32s(&m->lm_version); __swab32s(&m->lm_opc); @@ -763,9 +775,9 
@@ static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) RETURN(-EINVAL); } - flipped = lustre_msg_swabbed(m); - + flipped = m->lm_magic == LUSTRE_MSG_MAGIC_V2_SWABBED; if (flipped) { + __swab32s(&m->lm_magic); __swab32s(&m->lm_bufcount); __swab32s(&m->lm_secflvr); __swab32s(&m->lm_repsize); @@ -800,9 +812,15 @@ static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) return 0; } +/* + * return 1 if some endianness conversions are needed for the req buffers, + * 0 if not neeed, or + * -EINVAL if message has wrong magic + */ int lustre_unpack_msg(struct lustre_msg *m, int len) { int required_len, rc; + int swab_needed; ENTRY; /* We can provide a slightly better error log, if we check the @@ -820,6 +838,8 @@ int lustre_unpack_msg(struct lustre_msg *m, int len) RETURN(-EINVAL); } + swab_needed = lustre_msg_need_swab(m); + switch (m->lm_magic) { case LUSTRE_MSG_MAGIC_V1: case LUSTRE_MSG_MAGIC_V1_SWABBED: @@ -830,15 +850,18 @@ int lustre_unpack_msg(struct lustre_msg *m, int len) rc = lustre_unpack_msg_v2(m, len); break; default: - CERROR("bad lustre msg magic: %#08X\n", m->lm_magic); + CERROR("incorrect message magic: %08x\n", m->lm_magic); return -EINVAL; } + if (!rc) + rc = swab_needed; + RETURN(rc); } static inline int lustre_unpack_ptlrpc_body_v2(struct lustre_msg_v2 *m, - int offset) + int offset, int swab_needed) { struct ptlrpc_body *pb; @@ -847,7 +870,7 @@ static inline int lustre_unpack_ptlrpc_body_v2(struct lustre_msg_v2 *m, CERROR("error unpacking ptlrpc body\n"); return -EFAULT; } - if (lustre_msg_swabbed(m)) + if (swab_needed) lustre_swab_ptlrpc_body(pb); if ((pb->pb_version & ~LUSTRE_VERSION_MASK) != PTLRPC_MSG_VERSION) { @@ -862,14 +885,15 @@ int lustre_unpack_req_ptlrpc_body(struct ptlrpc_request *req, int offset) { switch (req->rq_reqmsg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 0; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: + case LUSTRE_MSG_MAGIC_V2: { + int swab_needed = 
ptlrpc_reqbuf_need_swab(req, offset); lustre_set_req_swabbed(req, offset); - return lustre_unpack_ptlrpc_body_v2(req->rq_reqmsg, offset); + return lustre_unpack_ptlrpc_body_v2(req->rq_reqmsg, offset, + swab_needed); + } default: - CERROR("bad lustre msg magic: %#08X\n", + CERROR("incorrect message magic: %08x\n", req->rq_reqmsg->lm_magic); return -EINVAL; } @@ -879,14 +903,15 @@ int lustre_unpack_rep_ptlrpc_body(struct ptlrpc_request *req, int offset) { switch (req->rq_repmsg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 0; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: + case LUSTRE_MSG_MAGIC_V2:{ + int swab_needed = ptlrpc_repbuf_need_swab(req, offset); lustre_set_rep_swabbed(req, offset); - return lustre_unpack_ptlrpc_body_v2(req->rq_repmsg, offset); + return lustre_unpack_ptlrpc_body_v2(req->rq_repmsg, offset, + swab_needed); + } default: - CERROR("bad lustre msg magic: %#08X\n", + CERROR("incorrect message magic: %08x\n", req->rq_repmsg->lm_magic); return -EINVAL; } @@ -922,10 +947,8 @@ int lustre_msg_buflen(struct lustre_msg *m, int n) { switch (m->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return lustre_msg_buflen_v1(m, n - 1); case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: return lustre_msg_buflen_v2(m, n); default: CERROR("incorrect message magic: %08x\n", m->lm_magic); @@ -976,10 +999,8 @@ int lustre_msg_bufcount(struct lustre_msg *m) { switch (m->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)m)->lm_bufcount + 1; case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: return m->lm_bufcount; default: CERROR("incorrect message magic: %08x\n", m->lm_magic); @@ -996,12 +1017,10 @@ char *lustre_msg_string(struct lustre_msg *m, int index, int max_len) switch (m->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: str = lustre_msg_buf_v1(m, index - 1, 0); blen = 
lustre_msg_buflen_v1(m, index - 1); break; case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: str = lustre_msg_buf_v2(m, index, 0); blen = lustre_msg_buflen_v2(m, index); break; @@ -1047,11 +1066,9 @@ void *lustre_swab_buf(struct lustre_msg *msg, int index, int min_size, switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: ptr = lustre_msg_buf_v1(msg, index - 1, min_size); break; case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: ptr = lustre_msg_buf_v2(msg, index, min_size); break; default: @@ -1060,8 +1077,10 @@ void *lustre_swab_buf(struct lustre_msg *msg, int index, int min_size, if (ptr == NULL) return NULL; - if (swabber != NULL && lustre_msg_swabbed(msg)) + if (swabber != NULL) { + CDEBUG(D_NET, "Swab buffer %d\n", index); ((void (*)(void *))swabber)(ptr); + } return ptr; } @@ -1069,6 +1088,9 @@ void *lustre_swab_buf(struct lustre_msg *msg, int index, int min_size, void *lustre_swab_reqbuf(struct ptlrpc_request *req, int index, int min_size, void *swabber) { + if (!ptlrpc_reqbuf_need_swab(req, index)) + swabber = NULL; + lustre_set_req_swabbed(req, index); return lustre_swab_buf(req->rq_reqmsg, index, min_size, swabber); } @@ -1076,6 +1098,9 @@ void *lustre_swab_reqbuf(struct ptlrpc_request *req, int index, int min_size, void *lustre_swab_repbuf(struct ptlrpc_request *req, int index, int min_size, void *swabber) { + if (!ptlrpc_repbuf_need_swab(req, index)) + swabber = NULL; + lustre_set_rep_swabbed(req, index); return lustre_swab_buf(req->rq_repmsg, index, min_size, swabber); } @@ -1084,10 +1109,8 @@ __u32 lustre_msghdr_get_flags(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 0; case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: /* already in host endian */ return msg->lm_flags; default: @@ -1113,11 +1136,9 @@ __u32 lustre_msg_get_flags(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case 
LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)msg)->lm_flags & MSG_GEN_FLAG_MASK; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1128,6 +1149,7 @@ __u32 lustre_msg_get_flags(struct lustre_msg *msg) return pb->pb_flags; } default: + CERROR("Wrong magic %x\n", msg->lm_magic); /* flags might be printed in debug code while message * uninitialized */ return 0; @@ -1179,12 +1201,10 @@ void lustre_msg_clear_flags(struct lustre_msg *msg, int flags) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: ((struct lustre_msg_v1 *)msg)->lm_flags &= ~(MSG_GEN_FLAG_MASK & flags); return; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1201,11 +1221,9 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)msg)->lm_flags >> MSG_OP_FLAG_SHIFT; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1216,6 +1234,7 @@ __u32 lustre_msg_get_op_flags(struct lustre_msg *msg) return pb->pb_op_flags; } default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); return 0; } } @@ -1265,10 +1284,8 @@ struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return &((struct lustre_msg_v1 *)msg)->lm_handle; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1288,10 +1305,8 @@ __u32 
lustre_msg_get_type(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)msg)->lm_type; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1311,10 +1326,8 @@ __u32 lustre_msg_get_version(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)msg)->lm_version; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1334,10 +1347,8 @@ void lustre_msg_add_version(struct lustre_msg *msg, int version) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1354,10 +1365,8 @@ __u32 lustre_msg_get_opc(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)msg)->lm_opc; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1377,10 +1386,8 @@ __u64 lustre_msg_get_last_xid(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)msg)->lm_last_xid; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1400,10 +1407,8 @@ __u64 lustre_msg_get_last_committed(struct lustre_msg 
*msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)msg)->lm_last_committed; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1423,10 +1428,8 @@ __u64 lustre_msg_get_transno(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)msg)->lm_transno; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1460,6 +1463,7 @@ int lustre_msg_get_status(struct lustre_msg *msg) return pb->pb_status; } default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); /* status might be printed in debug code while message * uninitialized */ return -EINVAL; @@ -1470,10 +1474,8 @@ __u64 lustre_msg_get_slv(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 1; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1484,7 +1486,7 @@ __u64 lustre_msg_get_slv(struct lustre_msg *msg) return pb->pb_slv; } default: - CERROR("invalid msg magic %x\n", msg->lm_magic); + CERROR("incorrect message magic: %08x\n", msg->lm_magic); return -EINVAL; } } @@ -1494,10 +1496,8 @@ void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1509,7 +1509,7 @@ void lustre_msg_set_slv(struct lustre_msg 
*msg, __u64 slv) return; } default: - CERROR("invalid msg magic %x\n", msg->lm_magic); + CERROR("incorrect message magic: %08x\n", msg->lm_magic); return; } } @@ -1518,10 +1518,8 @@ __u32 lustre_msg_get_limit(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 1; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1532,7 +1530,7 @@ __u32 lustre_msg_get_limit(struct lustre_msg *msg) return pb->pb_limit; } default: - CERROR("invalid msg magic %x\n", msg->lm_magic); + CERROR("incorrect message magic: %08x\n", msg->lm_magic); return -EINVAL; } } @@ -1542,10 +1540,8 @@ void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1557,7 +1553,7 @@ void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit) return; } default: - CERROR("invalid msg magic %x\n", msg->lm_magic); + CERROR("incorrect message magic: %08x\n", msg->lm_magic); return; } } @@ -1566,10 +1562,8 @@ __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return ((struct lustre_msg_v1 *)msg)->lm_conn_cnt; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); @@ -1614,17 +1608,14 @@ __u32 lustre_msg_get_timeout(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 0; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case 
LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; - + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); if (!pb) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; - } return pb->pb_timeout; } @@ -1638,17 +1629,14 @@ __u32 lustre_msg_get_service_time(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 0; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; - + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); if (!pb) { CERROR("invalid msg %p: no ptlrpc body!\n", msg); return 0; - } return pb->pb_service_time; } @@ -1662,10 +1650,8 @@ __u32 lustre_msg_get_cksum(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 0; case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: return msg->lm_cksum; default: CERROR("incorrect message magic: %08x\n", msg->lm_magic); @@ -1677,10 +1663,8 @@ __u32 lustre_msg_calc_cksum(struct lustre_msg *msg) { switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 0; - case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: { + case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); @@ -1851,7 +1835,7 @@ void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) return; case LUSTRE_MSG_MAGIC_V2: { struct ptlrpc_body *pb; - + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); pb->pb_timeout = timeout; @@ -2209,17 +2193,6 @@ void lustre_swab_lov_desc (struct lov_desc *ld) /* uuid endian insensitive */ } -static void print_lum (struct lov_user_md *lum) -{ - CDEBUG(D_OTHER, "lov_user_md %p:\n", lum); - CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lum->lmm_magic); - CDEBUG(D_OTHER, 
"\tlmm_pattern: %#x\n", lum->lmm_pattern); - CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lum->lmm_object_id); - CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lum->lmm_object_gr); - CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lum->lmm_stripe_size); - CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lum->lmm_stripe_count); - CDEBUG(D_OTHER, "\tlmm_stripe_offset: %#x\n", lum->lmm_stripe_offset); -} void lustre_swab_lov_user_md(struct lov_user_md *lum) { @@ -2232,22 +2205,9 @@ void lustre_swab_lov_user_md(struct lov_user_md *lum) __swab32s(&lum->lmm_stripe_size); __swab16s(&lum->lmm_stripe_count); __swab16s(&lum->lmm_stripe_offset); - print_lum(lum); EXIT; } -static void print_lumj (struct lov_user_md_join *lumj) -{ - CDEBUG(D_OTHER, "lov_user_md %p:\n", lumj); - CDEBUG(D_OTHER, "\tlmm_magic: %#x\n", lumj->lmm_magic); - CDEBUG(D_OTHER, "\tlmm_pattern: %#x\n", lumj->lmm_pattern); - CDEBUG(D_OTHER, "\tlmm_object_id: "LPU64"\n", lumj->lmm_object_id); - CDEBUG(D_OTHER, "\tlmm_object_gr: "LPU64"\n", lumj->lmm_object_gr); - CDEBUG(D_OTHER, "\tlmm_stripe_size: %#x\n", lumj->lmm_stripe_size); - CDEBUG(D_OTHER, "\tlmm_stripe_count: %#x\n", lumj->lmm_stripe_count); - CDEBUG(D_OTHER, "\tlmm_extent_count: %#x\n", lumj->lmm_extent_count); -} - void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj) { ENTRY; @@ -2259,25 +2219,6 @@ void lustre_swab_lov_user_md_join(struct lov_user_md_join *lumj) __swab32s(&lumj->lmm_stripe_size); __swab32s(&lumj->lmm_stripe_count); __swab32s(&lumj->lmm_extent_count); - print_lumj(lumj); - EXIT; -} - -static void print_lum_objs(struct lov_user_md *lum) -{ - struct lov_user_ost_data *lod; - int i; - ENTRY; - if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ - return; - CDEBUG(D_OTHER, "lov_user_md_objects: %p\n", lum); - for (i = 0; i < lum->lmm_stripe_count; i++) { - lod = &lum->lmm_objects[i]; - CDEBUG(D_OTHER, "(%i) lod->l_object_id: "LPX64"\n", i, lod->l_object_id); - CDEBUG(D_OTHER, "(%i) lod->l_object_gr: "LPX64"\n", i, 
lod->l_object_gr); - CDEBUG(D_OTHER, "(%i) lod->l_ost_gen: %#x\n", i, lod->l_ost_gen); - CDEBUG(D_OTHER, "(%i) lod->l_ost_idx: %#x\n", i, lod->l_ost_idx); - } EXIT; } @@ -2293,7 +2234,6 @@ void lustre_swab_lov_user_md_objects(struct lov_user_md *lum) __swab32s(&lod->l_ost_gen); __swab32s(&lod->l_ost_idx); } - print_lum_objs(lum); EXIT; } @@ -2372,18 +2312,6 @@ void lustre_swab_qdata(struct qunit_data *d) __swab64s (&d->qd_qunit); } -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) -void lustre_swab_qdata_old(struct qunit_data_old *d) -{ - __swab32s (&d->qd_id); - __swab32s (&d->qd_type); - __swab32s (&d->qd_count); - __swab32s (&d->qd_isblk); -} -#else -#warning "remove quota code above for format absolete in new release" -#endif - #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 9, 0, 0) void lustre_swab_qdata_old2(struct qunit_data_old2 *d) { @@ -2397,42 +2325,6 @@ void lustre_swab_qdata_old2(struct qunit_data_old2 *d) #ifdef __KERNEL__ -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) -void qdata_v1_v3(struct qunit_data_old *d, - struct qunit_data *qdata) -{ - LASSERT(d); - LASSERT(qdata); - - qdata->qd_id = d->qd_id; - if (d->qd_type) - QDATA_SET_GRP(qdata); - if (d->qd_isblk) - QDATA_SET_BLK(qdata); - qdata->qd_count = d->qd_count; -} - -struct qunit_data_old *qdata_v3_to_v1(struct qunit_data *d) -{ - struct qunit_data tmp; - struct qunit_data_old *ret; - ENTRY; - - if (!d) - return NULL; - - tmp = *d; - ret = (struct qunit_data_old *)d; - ret->qd_id = tmp.qd_id; - ret->qd_type = (QDATA_IS_GRP(&tmp) ? GRPQUOTA : USRQUOTA); - ret->qd_count = (__u32)tmp.qd_count; - ret->qd_isblk = (QDATA_IS_BLK(&tmp) ? 
1 : 0); - RETURN(ret); -} -#else -#warning "remove quota code above for format absolete in new release" -#endif - #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 9, 0, 0) void qdata_v2_to_v3(struct qunit_data_old2 *d, struct qunit_data *qdata) @@ -2471,9 +2363,7 @@ int quota_get_qdata(void *request, struct qunit_data *qdata, { struct ptlrpc_request *req = (struct ptlrpc_request *)request; struct qunit_data *new; - struct qunit_data_old *old; struct qunit_data_old2 *old2; - int size = sizeof(struct qunit_data_old); int size2 = sizeof(struct qunit_data_old2); __u64 flags = is_exp ? req->rq_export->exp_connect_flags : req->rq_import->imp_connect_data.ocd_connect_flags; @@ -2481,13 +2371,6 @@ int quota_get_qdata(void *request, struct qunit_data *qdata, LASSERT(req); LASSERT(qdata); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) - if (OBD_FAIL_CHECK(OBD_FAIL_QUOTA_QD_COUNT_32BIT)) - goto quota32; -#else -#warning "remove quota code above for format absolete in new release" -#endif - #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 9, 0, 0) if (OBD_FAIL_CHECK(OBD_FAIL_QUOTA_WITHOUT_CHANGE_QS)) goto without_change_qs; @@ -2533,20 +2416,6 @@ without_change_qs: } #else #warning "remove quota code above for format absolete in new release" -#endif - -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) -quota32: - /* not support for quota64 and change_qs */ - if (is_req == QUOTA_REQUEST) - old = lustre_swab_reqbuf(req, REQ_REC_OFF, size, - lustre_swab_qdata_old); - else - old = lustre_swab_repbuf(req, REPLY_REC_OFF, size, - lustre_swab_qdata_old); - qdata_v1_v3(old, qdata); -#else -#warning "remove quota code above for format absolete in new release" #endif return 0; @@ -2559,7 +2428,6 @@ int quota_copy_qdata(void *request, struct qunit_data *qdata, { struct ptlrpc_request *req = (struct ptlrpc_request *)request; void *target; - struct qunit_data_old *old; struct qunit_data_old2 *old2; __u64 flags = is_exp ? 
req->rq_export->exp_connect_flags : req->rq_import->imp_connect_data.ocd_connect_flags; @@ -2567,13 +2435,6 @@ int quota_copy_qdata(void *request, struct qunit_data *qdata, LASSERT(req); LASSERT(qdata); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) - if (OBD_FAIL_CHECK(OBD_FAIL_QUOTA_QD_COUNT_32BIT)) - goto quota32; -#else -#warning "remove quota code above for format absolete in new release" -#endif - #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 9, 0, 0) if (OBD_FAIL_CHECK(OBD_FAIL_QUOTA_WITHOUT_CHANGE_QS)) goto without_change_qs; @@ -2617,23 +2478,6 @@ without_change_qs: } #else #warning "remove quota code above for format absolete in new release" -#endif - -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) -quota32: - /* not support for quota64 and change_qs */ - if (is_req == QUOTA_REQUEST) - target = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, - sizeof(struct qunit_data_old)); - else - target = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, - sizeof(struct qunit_data_old)); - if (!target) - return -EINVAL; - old = qdata_v3_to_v1(qdata); - memcpy(target, old, sizeof(*old)); -#else -#warning "remove quota code above for format absolete in new release" #endif return 0; @@ -2644,15 +2488,10 @@ int quota_get_qunit_data_size(__u64 flag) { int size; - if (flag & OBD_CONNECT_CHANGE_QS) { + if (flag & OBD_CONNECT_CHANGE_QS) size = sizeof(struct qunit_data); - } else { - /* write in this way because sizes of qunit_data_old and - * qunit_data_old2 are same */ - LASSERT(sizeof(struct qunit_data_old) == - sizeof(struct qunit_data_old2)); - size = sizeof(struct qunit_data_old); - } + else + size = sizeof(struct qunit_data_old2); return(size); } @@ -2665,13 +2504,11 @@ static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) switch (req->rq_reqmsg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 1; case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: return lustre_req_swabbed(req, MSG_PTLRPC_BODY_OFF); 
default: - CERROR("bad lustre msg magic: %#08X\n", + CERROR("incorrect message magic: %08x\n", req->rq_reqmsg->lm_magic); } return 0; @@ -2683,13 +2520,12 @@ static inline int rep_ptlrpc_body_swabbed(struct ptlrpc_request *req) switch (req->rq_repmsg->lm_magic) { case LUSTRE_MSG_MAGIC_V1: - case LUSTRE_MSG_MAGIC_V1_SWABBED: return 1; case LUSTRE_MSG_MAGIC_V2: - case LUSTRE_MSG_MAGIC_V2_SWABBED: return lustre_rep_swabbed(req, MSG_PTLRPC_BODY_OFF); default: - /* uninitialized yet */ + CERROR("incorrect message magic: %08x\n", + req->rq_repmsg->lm_magic); return 0; } } @@ -2699,6 +2535,24 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, { va_list args; + int opc = -1; + int req_fl = 0; + int rep_fl = 0; + int rep_status = 0; + + if (req->rq_reqmsg && + (!lustre_msg_need_swab(req->rq_reqmsg) || + lustre_req_need_swab(req))) { + opc = lustre_msg_get_opc(req->rq_reqmsg); + req_fl = lustre_msg_get_flags(req->rq_reqmsg); + } + + if (req->rq_repmsg && + (!lustre_msg_need_swab(req->rq_repmsg) || + lustre_rep_need_swab(req))) { + rep_fl = lustre_msg_get_flags(req->rq_repmsg); + rep_status = lustre_msg_get_status(req->rq_repmsg); + } va_start(args, fmt); libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file, @@ -2706,8 +2560,7 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d/%d " "lens %d/%d e %d to %d dl %ld ref %d " "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n", - req, req->rq_xid, req->rq_transno, - req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1, + req, req->rq_xid, req->rq_transno, opc, req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) : req->rq_export ? (char*)req->rq_export->exp_client_uuid.uuid : "<?>", @@ -2719,11 +2572,9 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, req->rq_reqlen, req->rq_replen, req->rq_early_count, req->rq_timeout, req->rq_deadline, atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req), - req->rq_reqmsg ? 
lustre_msg_get_flags(req->rq_reqmsg) : 0, - req->rq_repmsg ? lustre_msg_get_flags(req->rq_repmsg) : 0, - req->rq_status, - req->rq_repmsg ? lustre_msg_get_status(req->rq_repmsg) : 0); + req_fl, rep_fl, req->rq_status, rep_status); va_end(args); } + EXPORT_SYMBOL(_debug_req); diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 4f0e2a2a3ad4f8a0682a680d72358bf6e33e4a90..f74eef1f36eba9d0f92e19ac39c87926e9c307a5 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -130,6 +130,7 @@ EXPORT_SYMBOL(ptlrpc_register_bulk); EXPORT_SYMBOL(ptlrpc_unregister_bulk); EXPORT_SYMBOL(ptlrpc_send_reply); EXPORT_SYMBOL(ptlrpc_reply); +EXPORT_SYMBOL(ptlrpc_send_error); EXPORT_SYMBOL(ptlrpc_error); EXPORT_SYMBOL(ptlrpc_resend_req); EXPORT_SYMBOL(ptl_send_rpc); @@ -185,7 +186,6 @@ EXPORT_SYMBOL(ptlrpc_daemonize); EXPORT_SYMBOL(ptlrpc_service_health_check); /* pack_generic.c */ -EXPORT_SYMBOL(lustre_msg_swabbed); EXPORT_SYMBOL(lustre_msg_check_version); EXPORT_SYMBOL(lustre_pack_request); EXPORT_SYMBOL(lustre_pack_reply); @@ -197,7 +197,6 @@ EXPORT_SYMBOL(lustre_packed_msg_size); EXPORT_SYMBOL(lustre_unpack_msg); EXPORT_SYMBOL(lustre_msg_buf); EXPORT_SYMBOL(lustre_msg_string); -EXPORT_SYMBOL(lustre_swab_buf); EXPORT_SYMBOL(lustre_swab_reqbuf); EXPORT_SYMBOL(lustre_swab_repbuf); EXPORT_SYMBOL(lustre_swab_obdo); @@ -228,11 +227,6 @@ EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); EXPORT_SYMBOL(lustre_swab_ldlm_request); EXPORT_SYMBOL(lustre_swab_ldlm_reply); EXPORT_SYMBOL(lustre_swab_qdata); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) -EXPORT_SYMBOL(lustre_swab_qdata_old); -#else -#warning "remove quota code above for format absolete in new release" -#endif #if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 9, 0, 0) EXPORT_SYMBOL(lustre_swab_qdata_old2); #else diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c index d0abb2eb85e0fc5f94fb5ace7546ad09c5591e82..5b82f570f03ae206127b07deaeae56d45a4e598d 100644 --- 
a/lustre/ptlrpc/ptlrpcd.c +++ b/lustre/ptlrpc/ptlrpcd.c @@ -97,8 +97,6 @@ static int ptlrpcd_check(struct ptlrpcd_ctl *pc) if (test_bit(LIOD_STOP, &pc->pc_flags)) RETURN(1); - obd_zombie_impexp_cull(); - spin_lock(&pc->pc_set->set_new_req_lock); list_for_each_safe(pos, tmp, &pc->pc_set->set_new_requests) { req = list_entry(pos, struct ptlrpc_request, rq_set_chain); @@ -176,13 +174,6 @@ static int ptlrpcd(void *arg) return 0; } -static void ptlrpcd_zombie_impexp_notify(void) -{ - LASSERT(ptlrpcd_pc.pc_set != NULL); // call before ptlrpcd inited ? - - cfs_waitq_signal(&ptlrpcd_pc.pc_set->set_waitq); -} - #else int ptlrpcd_check_async_rpcs(void *arg) @@ -233,9 +224,6 @@ static int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc) RETURN(-ENOMEM); #ifdef __KERNEL__ - /* wake ptlrpcd when zombie imports or exports exist */ - obd_zombie_impexp_notify = ptlrpcd_zombie_impexp_notify; - rc = cfs_kernel_thread(ptlrpcd, pc, 0); if (rc < 0) { ptlrpc_set_destroy(pc->pc_set); @@ -260,7 +248,6 @@ static void ptlrpcd_stop(struct ptlrpcd_ctl *pc) set_bit(LIOD_STOP, &pc->pc_flags); cfs_waitq_signal(&pc->pc_set->set_waitq); #ifdef __KERNEL__ - obd_zombie_impexp_notify = NULL; wait_for_completion(&pc->pc_finishing); #else liblustre_deregister_wait_callback(pc->pc_wait_callback); diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index 9ec70741140c4be69f4e482dcbb9fc8c9b0d77da..b930aaa3946fcd433a0b630a34fadaa88203c492 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -137,6 +137,44 @@ static void llcd_send(struct llog_canceld_ctxt *llcd) cfs_waitq_signal_nr(&llcd->llcd_lcm->lcm_waitq, 1); } +/** + * Grab llcd and assign it to passed @ctxt. Also set up backward link + * and get ref on @ctxt. 
+ */ +static struct llog_canceld_ctxt *ctxt_llcd_grab(struct llog_ctxt *ctxt) +{ + struct llog_canceld_ctxt *llcd; + + LASSERT_SEM_LOCKED(&ctxt->loc_sem); + llcd = llcd_grab(ctxt->loc_lcm); + if (llcd == NULL) + return NULL; + + llcd->llcd_ctxt = llog_ctxt_get(ctxt); + ctxt->loc_llcd = llcd; + + CDEBUG(D_RPCTRACE,"grab llcd %p:%p\n", ctxt->loc_llcd, ctxt); + return llcd; +} + +/** + * Put llcd in passed @ctxt. Set ->loc_llcd to NULL. + */ +static void ctxt_llcd_put(struct llog_ctxt *ctxt) +{ + mutex_down(&ctxt->loc_sem); + if (ctxt->loc_llcd != NULL) { + CDEBUG(D_RPCTRACE,"put llcd %p:%p\n", ctxt->loc_llcd, ctxt); + llcd_put(ctxt->loc_llcd); + ctxt->loc_llcd = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + mutex_up(&ctxt->loc_sem); +} + /* deleted objects have a commit callback that cancels the MDS * log record for the deletion. The commit callback calls this * function @@ -152,16 +190,16 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt, LASSERT(ctxt); mutex_down(&ctxt->loc_sem); + llcd = ctxt->loc_llcd; + if (ctxt->loc_imp == NULL) { CDEBUG(D_RPCTRACE, "no import for ctxt %p\n", ctxt); GOTO(out, rc = 0); } - llcd = ctxt->loc_llcd; - if (count > 0 && cookies != NULL) { if (llcd == NULL) { - llcd = llcd_grab(ctxt->loc_lcm); + llcd = ctxt_llcd_grab(ctxt); if (llcd == NULL) { CERROR("couldn't get an llcd - dropped "LPX64 ":%x+%u\n", @@ -170,8 +208,6 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt, cookies->lgc_index); GOTO(out, rc = -ENOMEM); } - llcd->llcd_ctxt = llog_ctxt_get(ctxt); - ctxt->loc_llcd = llcd; } memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes, @@ -200,16 +236,18 @@ int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp) ENTRY; if (exp && (ctxt->loc_imp == exp->exp_imp_reverse)) { - CDEBUG(D_RPCTRACE,"reverse import disconnect, put llcd %p:%p\n", - ctxt->loc_llcd, ctxt); - mutex_down(&ctxt->loc_sem); - if (ctxt->loc_llcd != NULL) { - llcd_put(ctxt->loc_llcd); - 
ctxt->loc_llcd = NULL; - } - ctxt->loc_imp = NULL; - mutex_up(&ctxt->loc_sem); + CDEBUG(D_RPCTRACE,"reverse import disconnect\n"); + /* + * We put llcd because it is not going to sending list and + * thus, its refc will not be handled. We will handle it here. + */ + ctxt_llcd_put(ctxt); } else { + /* + * Sending cancel. This means that ctxt->loc_llcd wil be + * put on sending list in llog_obd_repl_cancel() and in + * this case recovery thread will take care of it refc. + */ rc = llog_cancel(ctxt, NULL, 0, NULL, OBD_LLOG_FL_SENDNOW); } @@ -363,9 +401,8 @@ static int log_commit_thread(void *arg) CERROR("error preparing commit: rc %d\n", rc); spin_lock(&lcm->lcm_llcd_lock); - list_splice(&lcd->lcd_llcd_list, - &lcm->lcm_llcd_resend); - CFS_INIT_LIST_HEAD(&lcd->lcd_llcd_list); + list_splice_init(&lcd->lcd_llcd_list, + &lcm->lcm_llcd_resend); spin_unlock(&lcm->lcm_llcd_lock); break; } @@ -416,9 +453,9 @@ static int log_commit_thread(void *arg) /* If we are force exiting, just drop all of the cookies. 
*/ if (lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE) { spin_lock(&lcm->lcm_llcd_lock); - list_splice(&lcm->lcm_llcd_pending, &lcd->lcd_llcd_list); - list_splice(&lcm->lcm_llcd_resend, &lcd->lcd_llcd_list); - list_splice(&lcm->lcm_llcd_free, &lcd->lcd_llcd_list); + list_splice_init(&lcm->lcm_llcd_pending, &lcd->lcd_llcd_list); + list_splice_init(&lcm->lcm_llcd_resend, &lcd->lcd_llcd_list); + list_splice_init(&lcm->lcm_llcd_free, &lcd->lcd_llcd_list); spin_unlock(&lcm->lcm_llcd_lock); list_for_each_entry_safe(llcd, n, &lcd->lcd_llcd_list,llcd_list) @@ -589,9 +626,10 @@ static int llog_recovery_generic(struct llog_ctxt *ctxt, void *handle,void *arg) RETURN(-ENODEV); } rc = cfs_kernel_thread(log_process_thread, &llpa, CLONE_VM | CLONE_FILES); - if (rc < 0) + if (rc < 0) { + llog_ctxt_put(ctxt); CERROR("error starting log_process_thread: %d\n", rc); - else { + } else { CDEBUG(D_HA, "log_process_thread: %d\n", rc); rc = 0; } @@ -615,19 +653,19 @@ int llog_repl_connect(struct llog_ctxt *ctxt, int count, mutex_down(&ctxt->loc_sem); ctxt->loc_gen = *gen; - llcd = llcd_grab(ctxt->loc_lcm); + llcd = ctxt_llcd_grab(ctxt); if (llcd == NULL) { CERROR("couldn't get an llcd\n"); mutex_up(&ctxt->loc_sem); RETURN(-ENOMEM); } - llcd->llcd_ctxt = llog_ctxt_get(ctxt); - ctxt->loc_llcd = llcd; mutex_up(&ctxt->loc_sem); rc = llog_recovery_generic(ctxt, ctxt->llog_proc_cb, logid); - if (rc != 0) + if (rc != 0) { + ctxt_llcd_put(ctxt); CERROR("error recovery process: %d\n", rc); + } RETURN(rc); } diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 9d8c81181d2c78db9c800c602d81b69a0ac3ec7d..f8236bbc4b70d70aa8a95dd65e9f435eb5fb0f9f 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -283,6 +283,7 @@ static void ptlrpc_at_timer(unsigned long castmeharder) svc->srv_name, cfs_time_current_sec(), list_empty(&svc->srv_at_list) ? 
", empty" : ""); svc->srv_at_check = 1; + svc->srv_at_checktime = cfs_time_current(); cfs_waitq_signal(&svc->srv_waitq); } @@ -583,7 +584,7 @@ static int ptlrpc_check_req(struct ptlrpc_request *req) static void ptlrpc_at_set_timer(struct ptlrpc_service *svc) { struct ptlrpc_request *rq; - time_t next; + __s32 next; spin_lock(&svc->srv_at_lock); if (list_empty(&svc->srv_at_list)) { @@ -595,13 +596,14 @@ static void ptlrpc_at_set_timer(struct ptlrpc_service *svc) /* Set timer for closest deadline */ rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, rq_timed_list); - next = rq->rq_deadline - cfs_time_current_sec() - at_early_margin; + next = (__s32)(rq->rq_deadline - cfs_time_current_sec() - + at_early_margin); if (next <= 0) ptlrpc_at_timer((unsigned long)svc); else cfs_timer_arm(&svc->srv_at_timer, cfs_time_shift(next)); spin_unlock(&svc->srv_at_lock); - CDEBUG(D_INFO, "armed %s at %+lds\n", svc->srv_name, next); + CDEBUG(D_INFO, "armed %s at %+ds\n", svc->srv_name, next); } /* Add rpc to early reply check list */ @@ -774,6 +776,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) struct ptlrpc_request *rq, *n; struct list_head work_list; time_t now = cfs_time_current_sec(); + cfs_duration_t delay; int first, counter = 0; ENTRY; @@ -782,6 +785,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) spin_unlock(&svc->srv_at_lock); RETURN(0); } + delay = cfs_time_sub(cfs_time_current(), svc->srv_at_checktime); svc->srv_at_check = 0; if (list_empty(&svc->srv_at_list)) { @@ -819,11 +823,17 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early " "replies\n", first, at_extra, counter); - if (first < 0) + + if (first < 0) { /* We're already past request deadlines before we even get a chance to send early replies */ LCONSOLE_WARN("%s: This server is not able to keep up with " - "request traffic (cpu-bound).\n", svc->srv_name); + "request traffic (cpu-bound).\n", 
svc->srv_name); + CWARN("earlyQ=%d reqQ=%d recA=%d, svcEst=%d, " + "delay="CFS_DURATION_T"(jiff)\n", + counter, svc->srv_n_queued_reqs, svc->srv_n_active_reqs, + at_get(&svc->srv_at_estimate), delay); + } /* ptlrpc_server_free_request may delete an entry out of the work list */ @@ -845,7 +855,7 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) } spin_unlock(&svc->srv_at_lock); - RETURN(0); + RETURN(0); } /* Handle freshly incoming reqs, add to timed early reply list, @@ -872,18 +882,21 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc) /* Consider this still a "queued" request as far as stats are concerned */ spin_unlock(&svc->srv_lock); - + /* Clear request swab mask; this is a new request */ req->rq_req_swab_mask = 0; rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); - if (rc != 0) { + if (rc < 0) { CERROR ("error unpacking request: ptl %d from %s" " xid "LPU64"\n", svc->srv_req_portal, libcfs_id2str(req->rq_peer), req->rq_xid); goto err_req; } + if (rc > 0) + lustre_set_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); if (rc) { CERROR ("error unpacking ptlrpc body: ptl %d from %s" @@ -901,7 +914,7 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc) } CDEBUG(D_NET, "got req "LPD64"\n", req->rq_xid); - + req->rq_export = class_conn2export( lustre_msg_get_handle(req->rq_reqmsg)); if (req->rq_export) { @@ -922,8 +935,11 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc) MSGHDR_AT_SUPPORT) ? 
/* The max time the client expects us to take */ lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; - LASSERT(deadline > 0); req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + if (unlikely(deadline == 0)) { + DEBUG_REQ(D_ERROR, req, "Dropping request with 0 timeout"); + goto err_req; + } ptlrpc_at_add_timed(req); @@ -1490,11 +1506,14 @@ int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc) int i, rc = 0; ENTRY; - /* We require 2 threads min - see note in - ptlrpc_server_handle_request */ + /* We require 2 threads min - see note in + * ptlrpc_server_handle_request() */ LASSERT(svc->srv_threads_min >= 2); for (i = 0; i < svc->srv_threads_min; i++) { rc = ptlrpc_start_thread(dev, svc); + /* We have enough threads, don't start more. b=15759 */ + if (rc == -EMFILE) + break; if (rc) { CERROR("cannot start %s thread #%d: rc %d\n", svc->srv_thread_name, i, rc); @@ -1516,7 +1535,9 @@ int ptlrpc_start_thread(struct obd_device *dev, struct ptlrpc_service *svc) CDEBUG(D_RPCTRACE, "%s started %d min %d max %d running %d\n", svc->srv_name, svc->srv_threads_started, svc->srv_threads_min, svc->srv_threads_max, svc->srv_threads_running); - if (svc->srv_threads_started >= svc->srv_threads_max) + if (unlikely(svc->srv_threads_started >= svc->srv_threads_max) || + (OBD_FAIL_CHECK(OBD_FAIL_TGT_TOOMANY_THREADS) && + svc->srv_threads_started == svc->srv_threads_min - 1)) RETURN(-EMFILE); OBD_ALLOC(thread, sizeof(*thread)); diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index 6db7a10b72dea672244ffacccea00f22d4104c2b..75b07f001844b8b51879bd10b45ec5842331586e 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -147,7 +147,9 @@ void lustre_assert_wire_constants(void) (long long)REINT_RENAME); LASSERTF(REINT_OPEN == 6, " found %lld\n", (long long)REINT_OPEN); - LASSERTF(REINT_MAX == 7, " found %lld\n", + LASSERTF(REINT_SETXATTR == 7, " found %lld\n", + (long long)REINT_SETXATTR); + LASSERTF(REINT_MAX == 8, " found 
%lld\n", (long long)REINT_MAX); LASSERTF(MGS_CONNECT == 250, " found %lld\n", (long long)MGS_CONNECT); @@ -499,6 +501,7 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL); CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL); CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL); + CLASSERT(OBD_CONNECT_FID == 0x40000000ULL); /* Checks for struct obdo */ LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n", @@ -2078,26 +2081,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct qunit_data_old2 *)0)->qd_count) == 8, " found %lld\n", (long long)(int)sizeof(((struct qunit_data_old2 *)0)->qd_count)); - /* Checks for struct qunit_data_old */ - LASSERTF((int)sizeof(struct qunit_data_old) == 16, " found %lld\n", - (long long)(int)sizeof(struct qunit_data_old)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_id) == 0, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_id)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_id) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_id)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_type) == 4, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_type)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_type) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_type)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_count) == 8, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_count)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_count) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_count)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_isblk) == 12, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_isblk)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_isblk) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_isblk)); - /* Checks for 
struct quota_adjust_qunit */ LASSERTF((int)sizeof(struct quota_adjust_qunit) == 32, " found %lld\n", (long long)(int)sizeof(struct quota_adjust_qunit)); diff --git a/lustre/quota/quota_check.c b/lustre/quota/quota_check.c index bde6b6ed20b7e00e1b63bb5803a60205a19b485b..95bde12969db4a3acaa53d666348bf7369c69b35 100644 --- a/lustre/quota/quota_check.c +++ b/lustre/quota/quota_check.c @@ -118,6 +118,7 @@ int target_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl) qta->qta_exp = exp; qta->qta_oqctl = *oqctl; + qta->qta_oqctl.qc_id = obt->obt_qfmt; /* override qfmt version */ qta->qta_sb = obt->obt_sb; qta->qta_sem = &obt->obt_quotachecking; diff --git a/lustre/quota/quota_context.c b/lustre/quota/quota_context.c index 11b7a91562e2884f73c3f07bd6231a4b88162313..6f6efb89c0b576d9cda90924772855f07e6a7a1c 100644 --- a/lustre/quota/quota_context.c +++ b/lustre/quota/quota_context.c @@ -103,12 +103,7 @@ int should_translate_quota (struct obd_import *imp) ENTRY; LASSERT(imp); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(1, 7, 0, 0) - if (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_QUOTA64 && - !OBD_FAIL_CHECK(OBD_FAIL_QUOTA_QD_COUNT_32BIT)) -#else if (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_QUOTA64) -#endif RETURN(0); else RETURN(1); @@ -473,40 +468,6 @@ static int schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata, int opc, int wait); -static int split_before_schedule_dqacq(struct obd_device *obd, - struct lustre_quota_ctxt *qctxt, - struct qunit_data *qdata, int opc, int wait) -{ - int rc = 0; - unsigned long factor; - struct qunit_data tmp_qdata; - ENTRY; - - LASSERT(qdata && qdata->qd_count); - QDATA_DEBUG(qdata, "%s quota split.\n", - QDATA_IS_BLK(qdata) ? 
"block" : "inode"); - if (QDATA_IS_BLK(qdata)) - factor = MAX_QUOTA_COUNT32 / qctxt->lqc_bunit_sz * - qctxt->lqc_bunit_sz; - else - factor = MAX_QUOTA_COUNT32 / qctxt->lqc_iunit_sz * - qctxt->lqc_iunit_sz; - - if (qctxt->lqc_import && should_translate_quota(qctxt->lqc_import) && - qdata->qd_count > factor) { - tmp_qdata = *qdata; - tmp_qdata.qd_count = factor; - qdata->qd_count -= tmp_qdata.qd_count; - QDATA_DEBUG((&tmp_qdata), "be split.\n"); - rc = schedule_dqacq(obd, qctxt, &tmp_qdata, opc, wait); - } else{ - QDATA_DEBUG(qdata, "don't be split.\n"); - rc = schedule_dqacq(obd, qctxt, qdata, opc, wait); - } - - RETURN(rc); -} - static int dqacq_completion(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata, int rc, int opc) @@ -524,7 +485,7 @@ dqacq_completion(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, /* update local operational quota file */ if (rc == 0) { - __u32 count = QUSG(qdata->qd_count, QDATA_IS_BLK(qdata)); + __u64 count = QUSG(qdata->qd_count, QDATA_IS_BLK(qdata)); struct obd_quotactl *qctl; __u64 *hardlimit; @@ -553,14 +514,23 @@ dqacq_completion(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, } CDEBUG(D_QUOTA, "hardlimt: "LPU64"\n", *hardlimit); + + if (*hardlimit == 0) + goto out_mem; + switch (opc) { case QUOTA_DQACQ: INC_QLIMIT(*hardlimit, count); break; case QUOTA_DQREL: LASSERTF(count < *hardlimit, - "count: %d, hardlimit: "LPU64".\n", - count, *hardlimit); + "id(%u) flag(%u) type(%c) isblk(%c) " + "count("LPU64") qd_qunit("LPU64") " + "hardlimit("LPU64").\n", + qdata->qd_id, qdata->qd_flags, + QDATA_IS_GRP(qdata) ? 'g' : 'u', + QDATA_IS_BLK(qdata) ? 'b': 'i', + qdata->qd_count, qdata->qd_qunit, *hardlimit); *hardlimit -= count; break; default: @@ -644,7 +614,7 @@ out: if (rc1 > 0) { int opc; opc = rc1 == 1 ? 
QUOTA_DQACQ : QUOTA_DQREL; - rc1 = split_before_schedule_dqacq(obd, qctxt, qdata, opc, 0); + rc1 = schedule_dqacq(obd, qctxt, qdata, opc, 0); QDATA_DEBUG(qdata, "reschedudle opc(%d) rc(%d)\n", opc, rc1); } RETURN(err); @@ -756,7 +726,6 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, struct dqacq_async_args *aa; int size[2] = { sizeof(struct ptlrpc_body), 0 }; struct obd_import *imp = NULL; - unsigned long factor; struct lustre_qunit_size *lqs = NULL; int rc = 0; ENTRY; @@ -800,6 +769,7 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, int rc2; QDATA_DEBUG(qdata, "local %s.\n", opc == QUOTA_DQACQ ? "DQACQ" : "DQREL"); + QDATA_SET_CHANGE_QS(qdata); rc = qctxt->lqc_handler(obd, qdata, opc); rc2 = dqacq_completion(obd, qctxt, qdata, rc, opc); RETURN(rc ? rc : rc2); @@ -838,16 +808,6 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, RETURN(-ENOMEM); } - if (QDATA_IS_BLK(qdata)) - factor = MAX_QUOTA_COUNT32 / qctxt->lqc_bunit_sz * - qctxt->lqc_bunit_sz; - else - factor = MAX_QUOTA_COUNT32 / qctxt->lqc_iunit_sz * - qctxt->lqc_iunit_sz; - - LASSERTF(!should_translate_quota(imp) || qdata->qd_count <= factor, - "qd_count: "LPU64"; should_translate_quota: %d.\n", - qdata->qd_count, should_translate_quota(imp)); rc = quota_copy_qdata(req, qdata, QUOTA_REQUEST, QUOTA_IMPORT); if (rc < 0) { CDEBUG(D_ERROR, "Can't pack qunit_data\n"); @@ -856,7 +816,7 @@ schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, ptlrpc_req_set_repsize(req, 2, size); class_import_put(imp); - if (wait && qunit) + if (wait && qunit) qunit_get(qunit); CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); @@ -918,8 +878,7 @@ qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, int opc; /* need acquire or release */ opc = ret == 1 ? 
QUOTA_DQACQ : QUOTA_DQREL; - ret = split_before_schedule_dqacq(obd, qctxt, &qdata[i], - opc, wait); + ret = schedule_dqacq(obd, qctxt, &qdata[i], opc, wait); if (!rc) rc = ret; } else if (wait == 1) { @@ -990,10 +949,9 @@ qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb, qctxt->lqc_switch_qs = 1; /* Change qunit size in default setting */ qctxt->lqc_cqs_boundary_factor = 4; qctxt->lqc_cqs_least_bunit = PTLRPC_MAX_BRW_SIZE; - qctxt->lqc_cqs_least_iunit = 1; + qctxt->lqc_cqs_least_iunit = 2; qctxt->lqc_cqs_qs_factor = 2; - qctxt->lqc_atype = 0; - qctxt->lqc_status= 0; + qctxt->lqc_flags = 0; qctxt->lqc_bunit_sz = default_bunit_sz; qctxt->lqc_btune_sz = default_bunit_sz / 100 * default_btune_ratio; qctxt->lqc_iunit_sz = default_iunit_sz; @@ -1112,9 +1070,7 @@ static int qslave_recovery_main(void *arg) if (ret > 0) { int opc; opc = ret == 1 ? QUOTA_DQACQ : QUOTA_DQREL; - rc = split_before_schedule_dqacq(obd, qctxt, - &qdata, opc, - 0); + rc = schedule_dqacq(obd, qctxt, &qdata, opc, 0); if (rc == -EDQUOT) rc = 0; } else { diff --git a/lustre/quota/quota_ctl.c b/lustre/quota/quota_ctl.c index 8ab7c0eeba7ff6efb7340d96a212b157d1cdf122..e11473f395c0d9ff05e1879d37e667ace3b186e9 100644 --- a/lustre/quota/quota_ctl.c +++ b/lustre/quota/quota_ctl.c @@ -77,6 +77,9 @@ int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) case LUSTRE_Q_INVALIDATE: rc = mds_quota_invalidate(obd, oqctl); break; + case LUSTRE_Q_FINVALIDATE: + rc = mds_quota_finvalidate(obd, oqctl); + break; default: CERROR("%s: unsupported mds_quotactl command: %d\n", obd->obd_name, oqctl->qc_cmd); @@ -100,6 +103,7 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) ENTRY; switch (oqctl->qc_cmd) { + case Q_FINVALIDATE: case Q_QUOTAON: case Q_QUOTAOFF: if (!atomic_dec_and_test(&obt->obt_quotachecking)) { @@ -108,6 +112,12 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) rc = -EBUSY; break; } + if (oqctl->qc_cmd == Q_FINVALIDATE && + 
(obt->obt_qctxt.lqc_flags & UGQUOTA2LQC(oqctl->qc_type))) { + rc = -EBUSY; + break; + } + oqctl->qc_id = obt->obt_qfmt; /* override qfmt version */ case Q_GETOINFO: case Q_GETOQUOTA: case Q_GETQUOTA: @@ -121,13 +131,15 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) 1); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); + rc = fsfilt_quotactl(obd, obt->obt_sb, oqctl); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - if (oqctl->qc_cmd == Q_QUOTAON || oqctl->qc_cmd == Q_QUOTAOFF) { - if (!rc) - obt->obt_qctxt.lqc_status = - (oqctl->qc_cmd == Q_QUOTAON) ? 1 : 0; + if (oqctl->qc_cmd == Q_QUOTAON || oqctl->qc_cmd == Q_QUOTAOFF || + oqctl->qc_cmd == Q_FINVALIDATE) { + if (!rc && oqctl->qc_cmd == Q_QUOTAON) + obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(oqctl->qc_type); + if (!rc && oqctl->qc_cmd == Q_QUOTAOFF) + obt->obt_qctxt.lqc_flags &= ~UGQUOTA2LQC(oqctl->qc_type); atomic_inc(&obt->obt_quotachecking); } break; @@ -257,7 +269,7 @@ int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) if (oqctl->qc_cmd != Q_QUOTAON && oqctl->qc_cmd != Q_QUOTAOFF && oqctl->qc_cmd != Q_GETOQUOTA && oqctl->qc_cmd != Q_INITQUOTA && - oqctl->qc_cmd != Q_SETQUOTA) { + oqctl->qc_cmd != Q_SETQUOTA && oqctl->qc_cmd != Q_FINVALIDATE) { CERROR("bad quota opc %x for lov obd", oqctl->qc_cmd); RETURN(-EFAULT); } diff --git a/lustre/quota/quota_interface.c b/lustre/quota/quota_interface.c index 7e388ab340e715523f512a159f5ab6d88d889ff5..6f7e695732113e5594e0d709b5660f9648cf4b33 100644 --- a/lustre/quota/quota_interface.c +++ b/lustre/quota/quota_interface.c @@ -53,44 +53,64 @@ #define GROUP_QUOTA 2 #define MAX_STYPE_SIZE 5 + +/* The following information about CURRENT quotas is expected on the output: + * MDS: u for user quotas (administrative+operational) turned on, + * g for group quotas (administrative+operational) turned on, + * 1 for 32-bit operational quotas and 32-bit administrative quotas, + * 2 for 32-bit 
operational quotas and 64-bit administrative quotas, + * 3 for 64-bit operational quotas and 64-bit administrative quotas + * OST: u for user quotas (operational) turned on, + * g for group quotas (operational) turned on, + * 1 for 32-bit local operational quotas, + * 3 for 64-bit local operational quotas, + * Permanent parameters can be read with lctl (?) + */ int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count, int *eof, void *data) { struct obd_device *obd = (struct obd_device *)data; char stype[MAX_STYPE_SIZE + 1] = ""; - int type = obd->u.obt.obt_qctxt.lqc_atype; + int oq_type, rc, is_mds; + lustre_quota_version_t aq_version, oq_version; + struct obd_device_target *obt; + LASSERT(obd != NULL); - if (type == 0) { - strcpy(stype, "off"); - } else { - if (type & USER_QUOTA) - strcat(stype, "u"); - if (type & GROUP_QUOTA) - strcat(stype, "g"); - } + obt = &obd->u.obt; + is_mds = !strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME); - /* append with quota version on MDS */ - if (!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME)) { - int rc; - lustre_quota_version_t version; - - rc = mds_quota_get_version(obd, &version); + /* Collect the needed information */ + oq_type = obd->u.obt.obt_qctxt.lqc_flags; + oq_version = obt->obt_qfmt; + if (is_mds) { + rc = mds_quota_get_version(obd, &aq_version); if (rc) - return rc; - - switch (version) { - case LUSTRE_QUOTA_V1: - strcat(stype, "1"); - break; - case LUSTRE_QUOTA_V2: - strcat(stype, "2"); - break; - default: - return -ENOSYS; - } + return -EPROTO; + /* Here we can also assert that aq_type == oq_type + * except for quota startup/shutdown states */ } + /* Transform the collected data into a user-readable string */ + if (oq_type & LQC_USRQUOTA_FLAG) + strcat(stype, "u"); + if (oq_type & LQC_GRPQUOTA_FLAG) + strcat(stype, "g"); + + if ((!is_mds || aq_version == LUSTRE_QUOTA_V1) && + oq_version == LUSTRE_QUOTA_V1) + strcat(stype, "1"); +#ifdef HAVE_QUOTA64 + else if ((!is_mds || aq_version == 
LUSTRE_QUOTA_V2) && + oq_version == LUSTRE_QUOTA_V2) + strcat(stype, "3"); +#endif + else if (is_mds && aq_version == LUSTRE_QUOTA_V2 && + oq_version == LUSTRE_QUOTA_V1) + strcat(stype, "2"); + else + return -EPROTO; + return snprintf(page, count, "%s\n", stype); } EXPORT_SYMBOL(lprocfs_quota_rd_type); @@ -100,63 +120,130 @@ static int auto_quota_on(struct obd_device *obd, int type, { struct obd_quotactl *oqctl; struct lvfs_run_ctxt saved; - int rc; + int rc = 0, id; + struct obd_device_target *obt; ENTRY; LASSERT(type == USRQUOTA || type == GRPQUOTA || type == UGQUOTA); - /* quota already turned on */ - if (obd->u.obt.obt_qctxt.lqc_status) - RETURN(0); + obt = &obd->u.obt; OBD_ALLOC_PTR(oqctl); if (!oqctl) RETURN(-ENOMEM); + if (!atomic_dec_and_test(&obt->obt_quotachecking)) { + CDEBUG(D_INFO, "other people are doing quotacheck\n"); + atomic_inc(&obt->obt_quotachecking); + RETURN(-EBUSY); + } + + id = UGQUOTA2LQC(type); + /* quota already turned on */ + if ((obt->obt_qctxt.lqc_flags & id) == id) { + rc = 0; + goto out; + } + oqctl->qc_type = type; oqctl->qc_cmd = Q_QUOTAON; - oqctl->qc_id = QFMT_LDISKFS; + oqctl->qc_id = obt->obt_qfmt; push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + if (is_master) { + struct mds_obd *mds = &obd->u.mds; - if (!is_master) - goto local_quota; + down(&mds->mds_qonoff_sem); + /* turn on cluster wide quota */ + rc = mds_admin_quota_on(obd, oqctl); + if (rc) + CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR, + "auto-enable admin quota failed. rc=%d\n", rc); + up(&mds->mds_qonoff_sem); - /* turn on cluster wide quota */ - rc = mds_admin_quota_on(obd, oqctl); - if (rc) { - CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR, - "auto-enable admin quota failed. rc=%d\n", rc); - GOTO(out_pop, rc); } -local_quota: - /* turn on local quota */ - rc = fsfilt_quotactl(obd, sb, oqctl); - if (rc) { - CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR, - "auto-enable local quota failed. 
rc=%d\n", rc); - if (is_master) - mds_quota_off(obd, oqctl); - } else { - obd->u.obt.obt_qctxt.lqc_status = 1; + if (!rc) { + /* turn on local quota */ + rc = fsfilt_quotactl(obd, sb, oqctl); + if (rc) + CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR, + "auto-enable local quota failed. rc=%d\n", rc); + else + obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(type); } -out_pop: + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); +out: + atomic_inc(&obt->obt_quotachecking); + OBD_FREE_PTR(oqctl); RETURN(rc); } +static int filter_quota_set_version(struct obd_device *obd, + lustre_quota_version_t version) +{ + struct obd_device_target *obt = &obd->u.obt; + + if (version != LUSTRE_QUOTA_V1) { +#ifdef HAVE_QUOTA64 + if (version != LUSTRE_QUOTA_V2) +#endif + return -EINVAL; + } + + if (!atomic_dec_and_test(&obt->obt_quotachecking)) { + CDEBUG(D_INFO, "other people are doing quotacheck\n"); + atomic_inc(&obt->obt_quotachecking); + return -EBUSY; + } + + if (obt->obt_qctxt.lqc_flags & (LQC_USRQUOTA_FLAG | LQC_GRPQUOTA_FLAG)) { + atomic_inc(&obt->obt_quotachecking); + return -EBUSY; + } + + obt->obt_qfmt = version; + + atomic_inc(&obt->obt_quotachecking); + + return 0; +} + +/* The following settings of CURRENT quotas is expected on the input: + * MDS: u for user quotas (administrative+operational) turned on, + * g for group quotas (administrative+operational) turned on, + * 1 for 32-bit operational quotas and 32-bit administrative quotas, + * 2 for 32-bit operational quotas and 64-bit administrative quotas, + * 3 for 64-bit operational quotas and 64-bit administrative quotas + * OST: u for user quotas (operational) turned on, + * g for group quotas (operational) turned on, + * 1 for 32-bit local operational quotas, + * 2 for 32-bit local operational quotas, + * 3 for 64-bit local operational quotas, + * Permanent parameters can be set with lctl/tunefs + */ int lprocfs_quota_wr_type(struct file *file, const char *buffer, unsigned long count, void *data) { struct obd_device *obd = (struct 
obd_device *)data; - struct obd_device_target *obt = &obd->u.obt; - int type = 0; + struct obd_device_target *obt; + int type = 0, is_mds, idx; unsigned long i; char stype[MAX_STYPE_SIZE + 1] = ""; + static const lustre_quota_version_t s2av[3] = {LUSTRE_QUOTA_V1, + LUSTRE_QUOTA_V2, + LUSTRE_QUOTA_V2}, + s2ov[3] = {LUSTRE_QUOTA_V1, + LUSTRE_QUOTA_V1, + LUSTRE_QUOTA_V2}; LASSERT(obd != NULL); + obt = &obd->u.obt; + + is_mds = !strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME); + if (count > MAX_STYPE_SIZE) return -EINVAL; @@ -175,22 +262,26 @@ int lprocfs_quota_wr_type(struct file *file, const char *buffer, break; /* quota version specifiers */ case '1' : - if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME)) - break; - - rc = mds_quota_set_version(obd, LUSTRE_QUOTA_V1); - if (rc) { - CDEBUG(D_QUOTA, "failed to set quota v1! %d\n", rc); - return rc; - } - break; case '2' : - if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME)) - break; - - rc = mds_quota_set_version(obd, LUSTRE_QUOTA_V2); + case '3' : + idx = stype[i] - '1'; +#ifndef HAVE_QUOTA64 + if (s2ov[idx] == LUSTRE_QUOTA_V2) + return -EINVAL; +#endif + if (is_mds) { + rc = mds_quota_set_version(obd, s2av[idx]); + if (rc) { + CDEBUG(D_QUOTA, "failed to set admin " + "quota to spec %c! %d\n", + stype[i], rc); + return rc; + } + } + rc = filter_quota_set_version(obd, s2ov[idx]); if (rc) { - CDEBUG(D_QUOTA, "could not set quota v2! %d\n", rc); + CDEBUG(D_QUOTA, "failed to set operational quota" + " to spec %c! 
%d\n", stype[i], rc); return rc; } break; @@ -199,17 +290,8 @@ int lprocfs_quota_wr_type(struct file *file, const char *buffer, } } - obt->obt_qctxt.lqc_atype = type; - - if (type == 0) - return count; - - if (!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME)) - auto_quota_on(obd, type - 1, obt->obt_sb, 1); - else if (!strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) - auto_quota_on(obd, type - 1, obt->obt_sb, 0); - else - return -EFAULT; + if (type != 0) + auto_quota_on(obd, type - 1, obt->obt_sb, is_mds); return count; } @@ -223,12 +305,16 @@ static int filter_quota_setup(struct obd_device *obd) struct obd_device_target *obt = &obd->u.obt; ENTRY; +#ifdef HAVE_QUOTA64 + obt->obt_qfmt = LUSTRE_QUOTA_V2; +#else + obt->obt_qfmt = LUSTRE_QUOTA_V1; +#endif atomic_set(&obt->obt_quotachecking, 1); rc = qctxt_init(&obt->obt_qctxt, obt->obt_sb, NULL); - if (rc) { + if (rc) CERROR("initialize quota context failed! (rc:%d)\n", rc); - RETURN(rc); - } + RETURN(rc); } @@ -330,7 +416,7 @@ static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa) oa->o_valid |= (cnt == USRQUOTA) ? OBD_MD_FLUSRQUOTA : OBD_MD_FLGRPQUOTA; if (oqctl->qc_dqblk.dqb_bhardlimit && - (toqb(oqctl->qc_dqblk.dqb_curspace) > + (toqb(oqctl->qc_dqblk.dqb_curspace) >= oqctl->qc_dqblk.dqb_bhardlimit)) oa->o_flags |= (cnt == USRQUOTA) ? 
OBD_FL_NO_USRQUOTA : OBD_FL_NO_GRPQUOTA; @@ -582,6 +668,11 @@ static int mds_quota_setup(struct obd_device *obd) int rc; ENTRY; +#ifdef HAVE_QUOTA64 + obt->obt_qfmt = LUSTRE_QUOTA_V2; +#else + obt->obt_qfmt = LUSTRE_QUOTA_V1; +#endif mds->mds_quota_info.qi_version = LUSTRE_QUOTA_V2; atomic_set(&obt->obt_quotachecking, 1); /* initialize quota master and quota context */ @@ -603,17 +694,14 @@ static int mds_quota_cleanup(struct obd_device *obd) static int mds_quota_fs_cleanup(struct obd_device *obd) { struct mds_obd *mds = &obd->u.mds; - int i; + struct obd_quotactl oqctl; ENTRY; - /* close admin quota files */ + memset(&oqctl, 0, sizeof(oqctl)); + oqctl.qc_type = UGQUOTA; + down(&mds->mds_qonoff_sem); - for (i = 0; i < MAXQUOTAS; i++) { - if (mds->mds_quota_info.qi_files[i]) { - filp_close(mds->mds_quota_info.qi_files[i], 0); - mds->mds_quota_info.qi_files[i] = NULL; - } - } + mds_admin_quota_off(obd, &oqctl); up(&mds->mds_qonoff_sem); RETURN(0); } diff --git a/lustre/quota/quota_internal.h b/lustre/quota/quota_internal.h index d269c0b1265d50e61e33f14151c2ac610d86e06c..fb5fd93c1c58748bc269b49ebc4b76acc32a922d 100644 --- a/lustre/quota/quota_internal.h +++ b/lustre/quota/quota_internal.h @@ -98,10 +98,12 @@ int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_quota_get_version(struct obd_device *obd, lustre_quota_version_t *ver); int mds_quota_set_version(struct obd_device *obd, lustre_quota_version_t ver); int mds_quota_invalidate(struct obd_device *obd, struct obd_quotactl *oqctl); +int mds_quota_finvalidate(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl); +int mds_admin_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_set_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl); int 
mds_get_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl); diff --git a/lustre/quota/quota_master.c b/lustre/quota/quota_master.c index 31dc70fdc39ef99cc638a9f64178540532e32203..5187733fba82d4b74ee84f0e121659002bde4f31 100644 --- a/lustre/quota/quota_master.c +++ b/lustre/quota/quota_master.c @@ -368,7 +368,8 @@ int dqacq_handler(struct obd_device *obd, struct qunit_data *qdata, int opc) QUSG(*usage, QDATA_IS_BLK(qdata)) < hlimit) qdata->qd_count = (hlimit - QUSG(*usage, QDATA_IS_BLK(qdata))) - * QUOTABLOCK_SIZE; + * (QDATA_IS_BLK(qdata) ? + QUOTABLOCK_SIZE : 1); else GOTO(out, rc = -EDQUOT); } @@ -536,8 +537,7 @@ int mds_quota_set_version(struct obd_device *obd, lustre_quota_version_t version struct lustre_quota_info *qinfo = &mds->mds_quota_info; int rc = 0, i; - if (version != LUSTRE_QUOTA_V1 && - version != LUSTRE_QUOTA_V2) + if (version != LUSTRE_QUOTA_V1 && version != LUSTRE_QUOTA_V2) return -EINVAL; down(&mds->mds_qonoff_sem); @@ -596,7 +596,7 @@ int mds_quota_invalidate(struct obd_device *obd, struct obd_quotactl *oqctl) LASSERT(strlen(quotafile) + sizeof(prefix) <= sizeof(name)); sprintf(name, "%s%s", prefix, quotafile); - fp = filp_open(name, O_CREAT | O_TRUNC, 0644); + fp = filp_open(name, O_CREAT | O_TRUNC | O_RDWR, 0644); if (IS_ERR(fp)) { rc = PTR_ERR(fp); CERROR("error invalidating admin quotafile %s (rc:%d)\n", @@ -614,6 +614,27 @@ out: return rc; } +int mds_quota_finvalidate(struct obd_device *obd, struct obd_quotactl *oqctl) +{ + struct mds_obd *mds = &obd->u.mds; + int rc; + struct lvfs_run_ctxt saved; + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + down(&mds->mds_qonoff_sem); + + oqctl->qc_cmd = Q_FINVALIDATE; + oqctl->qc_id = obd->u.obt.obt_qfmt; + rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); + if (!rc) + rc = obd_quotactl(mds->mds_osc_exp, oqctl); + + up(&mds->mds_qonoff_sem); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + return rc; +} + int 
init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl) { struct mds_obd *mds = &obd->u.mds; @@ -680,7 +701,7 @@ int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl) rc == -ENOENT ? "creating" : "overwriting"); /* create quota file overwriting old if needed */ - fp = filp_open(name, O_CREAT | O_TRUNC, 0644); + fp = filp_open(name, O_CREAT | O_TRUNC | O_RDWR, 0644); if (IS_ERR(fp)) { rc = PTR_ERR(fp); CERROR("error creating admin quotafile %s (rc:%d)\n", @@ -769,7 +790,7 @@ int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) qinfo->qi_version == LUSTRE_QUOTA_V2) { CDEBUG(D_INFO, "attempting to convert V1 quota file to" " V2 format\n"); - fp = filp_open(name, O_CREAT | O_TRUNC, 0644); + fp = filp_open(name, O_CREAT | O_TRUNC | O_RDWR, 0644); if (!IS_ERR(fp)) { qinfo->qi_files[i] = fp; rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_CONVERT); @@ -810,8 +831,8 @@ int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) RETURN(rc); } -static int mds_admin_quota_off(struct obd_device *obd, - struct obd_quotactl *oqctl) +int mds_admin_quota_off(struct obd_device *obd, + struct obd_quotactl *oqctl) { struct mds_obd *mds = &obd->u.mds; struct lustre_quota_info *qinfo = &mds->mds_quota_info; @@ -849,7 +870,7 @@ int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); if (!rc) - obt->obt_qctxt.lqc_status = 1; + obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(oqctl->qc_type); out: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); up(&mds->mds_qonoff_sem); @@ -879,7 +900,7 @@ int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl) rc = obd_quotactl(mds->mds_osc_exp, oqctl); rc2 = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); if (!rc2) - obt->obt_qctxt.lqc_status = 0; + obt->obt_qctxt.lqc_flags &= ~UGQUOTA2LQC(oqctl->qc_type); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); up(&mds->mds_qonoff_sem); @@ -1062,7 +1083,7 @@ static int 
mds_init_slave_ilimits(struct obd_device *obd, /* if we are going to set zero limit, needn't init slaves */ if (!oqctl->qc_dqblk.dqb_ihardlimit && !oqctl->qc_dqblk.dqb_isoftlimit && - set) + !set) RETURN(0); OBD_ALLOC_PTR(ioqc); @@ -1070,7 +1091,7 @@ static int mds_init_slave_ilimits(struct obd_device *obd, RETURN(-ENOMEM); flag = oqctl->qc_dqblk.dqb_ihardlimit || - oqctl->qc_dqblk.dqb_isoftlimit || set; + oqctl->qc_dqblk.dqb_isoftlimit || !set; ioqc->qc_cmd = flag ? Q_INITQUOTA : Q_SETQUOTA; ioqc->qc_id = oqctl->qc_id; ioqc->qc_type = oqctl->qc_type; @@ -1129,7 +1150,7 @@ static int mds_init_slave_blimits(struct obd_device *obd, /* if we are going to set zero limit, needn't init slaves */ if (!oqctl->qc_dqblk.dqb_bhardlimit && !oqctl->qc_dqblk.dqb_bsoftlimit && - set) + !set) RETURN(0); OBD_ALLOC_PTR(ioqc); @@ -1137,7 +1158,7 @@ static int mds_init_slave_blimits(struct obd_device *obd, RETURN(-ENOMEM); flag = oqctl->qc_dqblk.dqb_bhardlimit || - oqctl->qc_dqblk.dqb_bsoftlimit || set; + oqctl->qc_dqblk.dqb_bsoftlimit || !set; ioqc->qc_cmd = flag ? 
Q_INITQUOTA : Q_SETQUOTA; ioqc->qc_id = oqctl->qc_id; ioqc->qc_type = oqctl->qc_type; @@ -1198,7 +1219,10 @@ int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl) time_t btime, itime; struct lustre_dquot *dquot; struct obd_dqblk *dqblk = &oqctl->qc_dqblk; - int set, rc, rc2 = 0, flag = 0; + /* orig_set means if quota was set before; now_set means we are + * setting/cancelling quota */ + int orig_set, now_set; + int rc, rc2 = 0, flag = 0; ENTRY; OBD_ALLOC_PTR(oqaq); @@ -1292,24 +1316,26 @@ int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl) } up(&mds->mds_qonoff_sem); - if (dqblk->dqb_valid & QIF_ILIMITS) { - set = !(ihardlimit || isoftlimit); + orig_set = ihardlimit || isoftlimit; + now_set = dqblk->dqb_ihardlimit || dqblk->dqb_isoftlimit; + if (dqblk->dqb_valid & QIF_ILIMITS && orig_set != now_set) { down(&dquot->dq_sem); dquot->dq_dqb.dqb_curinodes = 0; up(&dquot->dq_sem); - rc = mds_init_slave_ilimits(obd, oqctl, set, oqaq); + rc = mds_init_slave_ilimits(obd, oqctl, orig_set, oqaq); if (rc) { CERROR("init slave ilimits failed! (rc:%d)\n", rc); goto revoke_out; } } - if (dqblk->dqb_valid & QIF_BLIMITS) { - set = !(bhardlimit || bsoftlimit); + orig_set = bhardlimit || bsoftlimit; + now_set = dqblk->dqb_bhardlimit || dqblk->dqb_bsoftlimit; + if (dqblk->dqb_valid & QIF_BLIMITS && orig_set != now_set) { down(&dquot->dq_sem); dquot->dq_dqb.dqb_curspace = 0; up(&dquot->dq_sem); - rc = mds_init_slave_blimits(obd, oqctl, set, oqaq); + rc = mds_init_slave_blimits(obd, oqctl, orig_set, oqaq); if (rc) { CERROR("init slave blimits failed! 
(rc:%d)\n", rc); goto revoke_out; diff --git a/lustre/quota/quotactl_test.c b/lustre/quota/quotactl_test.c index 90cb5b98c14db25a82f0b3112380d3be6c1942b7..bf5b145f958278ab1675782f7dcda8c6810a1c42 100644 --- a/lustre/quota/quotactl_test.c +++ b/lustre/quota/quotactl_test.c @@ -33,7 +33,7 @@ static int quotactl_test_1(struct obd_device *obd, struct super_block *sb) ENTRY; oqctl.qc_cmd = Q_QUOTAON; - oqctl.qc_id = QFMT_LDISKFS; + oqctl.qc_id = obd->u.obt.obt_qfmt; oqctl.qc_type = UGQUOTA; rc = fsfilt_quotactl(obd, sb, &oqctl); if (rc) diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore index 93d391b085d374d6485b82466f098e3f6e7c317b..14cc38d611d6c671435a7afd5276fcdc874424ef 100644 --- a/lustre/tests/.cvsignore +++ b/lustre/tests/.cvsignore @@ -71,8 +71,9 @@ rmdirmany flock_test flocks_test writemany -random-reads +reads chownmany llverdev llverfs ll_getstripe_info +it_test diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 305ff0e500d5ad15807319840a913c25ec324229..f82d16eeebde6b0cada4961e075c5ba6126b164a 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -12,8 +12,8 @@ noinst_SCRIPTS += conf-sanity.sh insanity.sh lfscktest.sh oos.sh oos2.sh noinst_SCRIPTS += llog-test.sh recovery-small.sh replay-dual.sh sanity-quota.sh noinst_SCRIPTS += replay-ost-single.sh replay-single.sh run-llog.sh sanityN.sh noinst_SCRIPTS += lockorder.sh socketclient socketserver runmultiop_bg_pause -nobase_noinst_SCRIPTS = cfg/insanity-local.sh -nobase_noinst_SCRIPTS += cfg/local.sh acl/make-tree acl/run +nobase_noinst_SCRIPTS = cfg/local.sh +nobase_noinst_SCRIPTS += acl/make-tree acl/run cfg/ncli.sh nobase_noinst_DATA = acl/cp.test acl/getfacl-noacl.test acl/inheritance.test nobase_noinst_DATA += acl/misc.test acl/permissions.test acl/setfacl.test @@ -29,7 +29,7 @@ noinst_PROGRAMS += wantedi statone runas openfile getdents o_directory rmdirmany noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify cmknod noinst_PROGRAMS += 
ll_sparseness_write mrename ll_dirstripe_verify mkdirmany noinst_PROGRAMS += openfilleddirunlink rename_many memhog iopentest1 iopentest2 -noinst_PROGRAMS += mmap_sanity flock_test writemany random-reads flocks_test +noinst_PROGRAMS += mmap_sanity flock_test writemany reads flocks_test noinst_PROGRAMS += ll_getstripe_info if MPITESTS noinst_PROGRAMS += parallel_grouplock write_append_truncate createmany_mpi @@ -50,6 +50,9 @@ LIBLUSTREAPI := $(top_builddir)/lustre/utils/liblustreapi.a ll_getstripe_info_LDADD=$(LIBLUSTREAPI) multiop_LDADD=$(LIBLUSTREAPI) +ll_dirstripe_verify_SOURCES= ll_dirstripe_verify.c +ll_dirstripe_verify_LDADD= -L$(top_builddir)/lustre/utils -llustreapi + if MPITESTS LAM_LD_FLAGS=-L/opt/lam/lib -lmpi -llam -lpthread write_append_truncate_SOURCES=write_append_truncate.c diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index 4a7142fb568133461c3d8c1aa420247c1c379ee9..ea59da971146378906cca5df3c71e4dff5d68de7 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -4,8 +4,6 @@ #set -vx set -e -PATH=`dirname $0`/../utils:$PATH - [ -z "$CONFIG" -a "$NAME" ] && CONFIGS=$NAME [ "$CONFIGS" ] || CONFIGS="local" #"local lov" [ "$MAX_THREADS" ] || MAX_THREADS=20 @@ -22,8 +20,8 @@ fi [ "$TMP" ] || TMP=/tmp [ "$COUNT" ] || COUNT=1000 [ "$DEBUG_LVL" ] || DEBUG_LVL=0 -[ "$DEBUG_OFF" ] || DEBUG_OFF="eval sysctl -w lnet.debug=\"$DEBUG_LVL\"" -[ "$DEBUG_ON" ] || DEBUG_ON="eval sysctl -w lnet.debug=0x33f0484" +[ "$DEBUG_OFF" ] || DEBUG_OFF="eval lctl set_param debug=\"$DEBUG_LVL\"" +[ "$DEBUG_ON" ] || DEBUG_ON="eval lctl set_param debug=0x33f0484" export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA" @@ -42,7 +40,7 @@ LIBLUSTRETESTS=${LIBLUSTRETESTS:-../liblustre/tests} STARTTIME=`date +%s` RANTEST="" -LUSTRE=${LUSTRE:-`dirname $0`/..} +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; 
echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ @@ -196,7 +194,7 @@ for NAME in $CONFIGS; do [ $THREADS -lt $IOZ_THREADS ] && IOZ_THREADS=$THREADS IOZVER=`iozone -v | awk '/Revision:/ {print $3}' | tr -d .` if [ "$IOZ_THREADS" -gt 1 -a "$IOZVER" -ge 3145 ]; then - $LFS setstripe -c 1 $IOZDIR + $LFS setstripe -c -1 $IOZDIR $DEBUG_OFF THREAD=1 IOZFILE="-F " @@ -244,24 +242,24 @@ for NAME in $CONFIGS; do SANITYN="done" fi + remote_mds && log "Remote MDS, skipping LFSCK test" && LFSCK=no + remote_ost && log "Remote OST, skipping LFSCK test" && LFSCK=no + if [ "$LFSCK" != "no" -a -x /usr/sbin/lfsck ]; then title lfsck E2VER=`e2fsck -V 2>&1 | head -n 1 | cut -d' ' -f 2` - if grep -q obdfilter /proc/fs/lustre/devices; then - if [ `echo $E2VER | cut -d. -f2` -ge 39 ] && \ - [ "`echo $E2VER | grep cfs`" -o \ - "`echo $E2VER | grep sun`" ]; then - bash lfscktest.sh - else - e2fsck -V - echo "e2fsck does not support lfsck, skipping" - fi + if [ `echo $E2VER | cut -d. -f2` -ge 39 ] && \ + [ "`echo $E2VER | grep cfs`" -o \ + "`echo $E2VER | grep sun`" ]; then + bash lfscktest.sh else - echo "remote OST, skipping test" + e2fsck -V + echo "e2fsck does not support lfsck, skipping" fi LFSCK="done" fi + [ "$NETTYPE" = "tcp" -o "$NETTYPE" = "ptl" ] || LIBLUSTRE=no # bug 15660 if [ "$LIBLUSTRE" != "no" ]; then title liblustre assert_env MGSNID MOUNT2 @@ -272,7 +270,7 @@ for NAME in $CONFIGS; do [ -f /etc/modprobe.d/Lustre ] && MODPROBECONF=/etc/modprobe.d/Lustre LNETOPTS="$(awk '/^options lnet/ { print $0}' $MODPROBECONF | \ - sed 's/^options lnet //g') accept=all" \ + sed 's/^options lnet //g; s/"//g') accept=all" \ MDS_MOUNT_OPTS=$(echo $MDS_MOUNT_OPTS | sed 's/^[ \t]*//;s/[ \t]*$//') \ MDS_MOUNT_OPTS="${MDS_MOUNT_OPTS},noacl" \ MDS_MOUNT_OPTS=${MDS_MOUNT_OPTS/#,/-o } \ diff --git a/lustre/tests/cfg/insanity-local.sh b/lustre/tests/cfg/insanity-local.sh deleted file mode 100644 index 0708126455d8bd9213a2d43f5f7367889de57a61..0000000000000000000000000000000000000000 
--- a/lustre/tests/cfg/insanity-local.sh +++ /dev/null @@ -1,70 +0,0 @@ -FSNAME=lustre - -# facet hosts -mds_HOST=${mds_HOST:-`hostname`} -mdsfailover_HOST=${mdsfailover_HOST:-""} -mgs_HOST=${mgs_HOST:-$mds_HOST} -ost_HOST=${ost_HOST:-`hostname`} -LIVE_CLIENT=${LIVE_CLIENT:-`hostname`} -# This should always be a list, not a regexp -FAIL_CLIENTS=${FAIL_CLIENTS:-""} -PDSH=${PDSH:-no_dsh} - -TMP=${TMP:-/tmp} -MDSDEV=${MDSDEV:-$TMP/${FSNAME}-mdt} -MDSSIZE=${MDSSIZE:-100000} -MDSOPT=${MDSOPT:-"--mountfsoptions=acl"} - -OSTCOUNT=${OSTCOUNT:-3} -OSTDEVBASE=${OSTDEVBASE:-$TMP/${FSNAME}-ost} -OSTSIZE=${OSTSIZE:-200000} - -NETTYPE=${NETTYPE:-tcp} -MGSNID=${MGSNID:-`h2$NETTYPE $mgs_HOST`} -FSTYPE=${FSTYPE:-ldiskfs} -STRIPE_BYTES=${STRIPE_BYTES:-1048576} -STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} -TIMEOUT=${TIMEOUT:-30} -PTLDEBUG=${PTLDEBUG:-0x33f0404} -DEBUG_SIZE=${DEBUG_SIZE:-10} -SUBSYSTEM=${SUBSYSTEM:- 0xffb7e3ff} - -L_GETGROUPS=${L_GETGROUPS:-`do_facet mds which l_getgroups || echo`} - -MKFSOPT="" -MOUNTOPT="" -[ "x$MDSJOURNALSIZE" != "x" ] && - MKFSOPT=$MKFSOPT" -J size=$MDSJOURNALSIZE" -[ "x$MDSISIZE" != "x" ] && - MKFSOPT=$MKFSOPT" -i $MDSISIZE" -[ "x$MKFSOPT" != "x" ] && - MKFSOPT="--mkfsoptions=\\\"$MKFSOPT\\\"" -[ "x$mdsfailover_HOST" != "x" ] && - MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`" -[ "x$STRIPE_BYTES" != "x" ] && - MOUNTOPT=$MOUNTOPT" --param lov.stripesize=$STRIPE_BYTES" -[ "x$STRIPES_PER_OBJ" != "x" ] && - MOUNTOPT=$MOUNTOPT" --param lov.stripecount=$STRIPES_PER_OBJ" -[ "x$L_GETGROUPS" != "x" ] && - MOUNTOPT=$MOUNTOPT" --param mdt.group_upcall=$L_GETGROUPS" -MDS_MKFS_OPTS="--mgs --mdt --fsname=$FSNAME --device-size=$MDSSIZE --param sys.timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $MDSOPT" - -MKFSOPT="" -MOUNTOPT="" -[ "x$OSTJOURNALSIZE" != "x" ] && - MKFSOPT=$MKFSOPT" -J size=$OSTJOURNALSIZE" -[ "x$MKFSOPT" != "x" ] && - MKFSOPT="--mkfsoptions=\\\"$MKFSOPT\\\"" -[ "x$ostfailover_HOST" != "x" ] && - MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE 
$ostfailover_HOST`" -OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID --param sys.timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $OSTOPT" - -MDS_MOUNT_OPTS=${MDS_MOUNT_OPTS:-"-o loop"} -OST_MOUNT_OPTS=${OST_MOUNT_OPTS:-"-o loop"} -MOUNT=${MOUNT:-"/mnt/lustre"} - -FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD -POWER_DOWN=${POWER_DOWN:-"powerman --off"} -POWER_UP=${POWER_UP:-"powerman --on"} -SLOW=${SLOW:-no} -FAIL_ON_ERROR=${FAIL_ON_ERROR:-true} diff --git a/lustre/tests/cfg/ncli.sh b/lustre/tests/cfg/ncli.sh new file mode 100644 index 0000000000000000000000000000000000000000..6dfae9b18b063e978cb52a184f8dba186ba026a0 --- /dev/null +++ b/lustre/tests/cfg/ncli.sh @@ -0,0 +1,18 @@ +. $LUSTRE/tests/cfg/local.sh + +CLIENT1=${CLIENT1:-`hostname`} +SINGLECLIENT=$CLIENT1 +RCLIENTS=${RCLIENTS:-""} +CLIENTS=`comma_list $SINGLECLIENT $RCLIENTS` +REMOTECLIENTS=($RCLIENTS) +for ((i=0; $i<${#REMOTECLIENTS[@]}; i++)); do + varname=CLIENT$((i + 2)) + eval $varname=${REMOTECLIENTS[i]} +done + +CLIENTCOUNT=$((${#REMOTECLIENTS[@]} + 1)) + +[ -n "$RCLIENTS" -a "$PDSH" = "no_dsh" ] && \ + error "tests for remote clients $RCLIENTS needs pdsh != do_dsh " || true + +[ -n "$FUNCTIONS" ] && . $FUNCTIONS || true diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 6dbc0586b121191740415df75ab100a3c3154b36..cd6732557e4456900b80b4050444b35e94287bdb 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -37,6 +37,8 @@ OSTSIZE=40000 # [ "$SLOW" = "no" ] && EXCEPT_SLOW="0 1 2 3 6 7 15 18 24b 25 30 31 32 33 34a " +assert_DIR + reformat() { formatall } @@ -329,7 +331,7 @@ test_5e() { start_mds #define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 - do_facet client "sysctl -w lustre.fail_loc=0x80000506" + do_facet client "lctl set_param fail_loc=0x80000506" grep " $MOUNT " /etc/mtab && echo "test 5e: mtab before mount" && return 10 mount_client $MOUNT || echo "mount failed (not fatal)" cleanup || return $? 
@@ -366,17 +368,17 @@ run_test 8 "double mount setup" test_9() { start_ost - do_facet ost1 sysctl lnet.debug=\'inode trace\' || return 1 - do_facet ost1 sysctl lnet.subsystem_debug=\'mds ost\' || return 1 + do_facet ost1 lctl set_param debug=\'inode trace\' || return 1 + do_facet ost1 lctl set_param subsystem_debug=\'mds ost\' || return 1 - CHECK_PTLDEBUG="`do_facet ost1 sysctl -n lnet.debug`" + CHECK_PTLDEBUG="`do_facet ost1 lctl get_param -n debug`" if [ "$CHECK_PTLDEBUG" ] && [ "$CHECK_PTLDEBUG" = "trace inode" ];then echo "lnet.debug success" else echo "lnet.debug: want 'trace inode', have '$CHECK_PTLDEBUG'" return 1 fi - CHECK_SUBSYS="`do_facet ost1 sysctl -n lnet.subsystem_debug`" + CHECK_SUBSYS="`do_facet ost1 lctl get_param -n subsystem_debug`" if [ "$CHECK_SUBSYS" ] && [ "$CHECK_SUBSYS" = "mds ost" ]; then echo "lnet.subsystem_debug success" else @@ -513,7 +515,7 @@ test_12() { } run_test 12 "lmc --batch, with single/double quote, backslash in batchfile" -test_13() { +test_13a() { # was test_13 OLDXMLCONFIG=$XMLCONFIG XMLCONFIG="conf13-1.xml" @@ -546,7 +548,7 @@ test_13() { rm -f $XMLCONFIG XMLCONFIG=$OLDXMLCONFIG } -run_test 13 "check new_uuid of lmc operating correctly" +run_test 13a "check new_uuid of lmc operating correctly" test_13b() { OLDXMLCONFIG=$XMLCONFIG @@ -800,6 +802,7 @@ test_22() { echo Client mount before any osts are in the logs mount_client $MOUNT check_mount && return 41 + umount_client $MOUNT pass echo Client mount with ost in logs, but none running @@ -876,7 +879,7 @@ test_23b() { # was test_23 start_ost start_mds # Simulate -EINTR during mount OBD_FAIL_LDLM_CLOSE_THREAD - sysctl -w lustre.fail_loc=0x80000313 + lctl set_param fail_loc=0x80000313 mount_client $MOUNT cleanup } @@ -970,7 +973,7 @@ test_26() { # we need modules before mount for sysctl, so make sure... 
do_facet mds "lsmod | grep -q lustre || modprobe lustre" #define OBD_FAIL_MDS_FS_SETUP 0x135 - do_facet mds "sysctl -w lustre.fail_loc=0x80000135" + do_facet mds "lctl set_param fail_loc=0x80000135" start_mds && echo MDS started && return 1 lctl get_param -n devices DEVS=$(lctl get_param -n devices | wc -l) @@ -1148,27 +1151,30 @@ test_32a() { # there appears to be a lot of assumption here about loopback # devices # or maybe this test is just totally useless on a client-only system + [ "$NETTYPE" = "tcp" ] || { skip "NETTYPE != tcp" && return 0; } [ "$mds_HOST" = "`hostname`" ] || { skip "remote MDS" && return 0; } [ "$ost_HOST" = "`hostname`" -o "$ost1_HOST" = "`hostname`" ] || \ { skip "remote OST" && return 0; } [ -z "$TUNEFS" ] && skip "No tunefs" && return local DISK1_4=$LUSTRE/tests/disk1_4.zip - [ ! -r $DISK1_4 ] && skip "Cant find $DISK1_4, skipping" && return - unzip -o -j -d $TMP/$tdir $DISK1_4 || { skip "Cant unzip $DISK1_4, skipping" && return ; } + [ ! -r $DISK1_4 ] && skip "Cant find $DISK1_4, skipping" && return + + local tmpdir=$TMP/conf32a + unzip -o -j -d $tmpdir $DISK1_4 || { skip "Cant unzip $DISK1_4, skipping" && return ; } load_modules - sysctl lnet.debug=$PTLDEBUG + lctl set_param debug=$PTLDEBUG - $TUNEFS $TMP/$tdir/mds || error "tunefs failed" + $TUNEFS $tmpdir/mds || error "tunefs failed" # nids are wrong, so client wont work, but server should start - start mds $TMP/$tdir/mds "-o loop,exclude=lustre-OST0000" || return 3 + start mds $tmpdir/mds "-o loop,exclude=lustre-OST0000" || return 3 local UUID=$(lctl get_param -n mds.lustre-MDT0000.uuid) echo MDS uuid $UUID [ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID" - $TUNEFS --mgsnode=`hostname` $TMP/$tdir/ost1 || error "tunefs failed" - start ost1 $TMP/$tdir/ost1 "-o loop" || return 5 - UUID=$(cat lctl get_param -n obdfilter.lustre-OST0000.uuid) + $TUNEFS --mgsnode=`hostname` $tmpdir/ost1 || error "tunefs failed" + start ost1 $tmpdir/ost1 "-o loop" || return 5 + UUID=$(lctl 
get_param -n obdfilter.lustre-OST0000.uuid) echo OST uuid $UUID [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID" @@ -1198,12 +1204,12 @@ test_32a() { # mount a second time to make sure we didnt leave upgrade flag on load_modules - $TUNEFS --dryrun $TMP/$tdir/mds || error "tunefs failed" + $TUNEFS --dryrun $tmpdir/mds || error "tunefs failed" load_modules - start mds $TMP/$tdir/mds "-o loop,exclude=lustre-OST0000" || return 12 - cleanup_nocli + start mds $tmpdir/mds "-o loop,exclude=lustre-OST0000" || return 12 + cleanup_nocli - [ -d $TMP/$tdir ] && rm -rf $TMP/$tdir + rm -rf $tmpdir || true # true is only for TMP on NFS } run_test 32a "Upgrade from 1.4 (not live)" @@ -1213,28 +1219,31 @@ test_32b() { # there appears to be a lot of assumption here about loopback # devices # or maybe this test is just totally useless on a client-only system + [ "$NETTYPE" = "tcp" ] || { skip "NETTYPE != tcp" && return 0; } [ "$mds_HOST" = "`hostname`" ] || { skip "remote MDS" && return 0; } [ "$ost_HOST" = "`hostname`" -o "$ost1_HOST" = "`hostname`" ] || \ { skip "remote OST" && return 0; } [ -z "$TUNEFS" ] && skip "No tunefs" && return local DISK1_4=$LUSTRE/tests/disk1_4.zip - [ ! -r $DISK1_4 ] && skip "Cant find $DISK1_4, skipping" && return - unzip -o -j -d $TMP/$tdir $DISK1_4 || { skip "Cant unzip $DISK1_4, skipping" && return ; } + [ ! 
-r $DISK1_4 ] && skip "Cant find $DISK1_4, skipping" && return + + local tmpdir=$TMP/conf32b + unzip -o -j -d $tmpdir $DISK1_4 || { skip "Cant unzip $DISK1_4, skipping" && return ; } load_modules - sysctl lnet.debug=$PTLDEBUG + lctl set_param debug=$PTLDEBUG NEWNAME=sofia # writeconf will cause servers to register with their current nids - $TUNEFS --writeconf --fsname=$NEWNAME $TMP/$tdir/mds || error "tunefs failed" - start mds $TMP/$tdir/mds "-o loop" || return 3 - local UUID=$(lctl get_param -n mds.${NEWNAME}-MDT0000.uuid) + $TUNEFS --writeconf --fsname=$NEWNAME $tmpdir/mds || error "tunefs failed" + start mds $tmpdir/mds "-o loop" || return 3 + local UUID=$(lctl get_param -n mds.${NEWNAME}-MDT0000.uuid) echo MDS uuid $UUID [ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID" - $TUNEFS --mgsnode=`hostname` --fsname=$NEWNAME --writeconf $TMP/$tdir/ost1 || error "tunefs failed" - start ost1 $TMP/$tdir/ost1 "-o loop" || return 5 - UUID=$(lctl get_param -n obdfilter.${NEWNAME}-OST0000.uuid) + $TUNEFS --mgsnode=`hostname` --fsname=$NEWNAME --writeconf $tmpdir/ost1 || error "tunefs failed" + start ost1 $tmpdir/ost1 "-o loop" || return 5 + UUID=$(lctl get_param -n obdfilter.${NEWNAME}-OST0000.uuid) echo OST uuid $UUID [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID" @@ -1258,7 +1267,7 @@ test_32b() { echo "ok." 
cleanup - [ -d $TMP/$tdir ] && rm -rf $TMP/$tdir + rm -rf $tmpdir || true # true is only for TMP on NFS } run_test 32b "Upgrade from 1.4 with writeconf" @@ -1299,8 +1308,8 @@ test_33b() { # was test_33a do_facet client dd if=/dev/zero of=$MOUNT/24 bs=1024k count=1 # Drop lock cancelation reply during umount #define OBD_FAIL_LDLM_CANCEL 0x304 - do_facet client sysctl -w lustre.fail_loc=0x80000304 - #sysctl -w lnet.debug=-1 + do_facet client lctl set_param fail_loc=0x80000304 + #lctl set_param debug=-1 umount_client $MOUNT cleanup } @@ -1357,7 +1366,7 @@ test_35() { # bug 12459 setup debugsave - sysctl -w lnet.debug="ha" + lctl set_param debug="ha" log "Set up a fake failnode for the MDS" FAKENID="127.0.0.2" @@ -1564,5 +1573,73 @@ test_39() { #bug 14413 } run_test 39 "leak_finder recognizes both LUSTRE and LNET malloc messages" +test_40() { # bug 15759 + start_ost + #define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 + do_facet mds "lctl set_param fail_loc=0x80000706" + start_mds + cleanup +} +run_test 40 "race during service thread startup" + +test_41() { #bug 14134 + local rc + start mds $MDSDEV $MDS_MOUNT_OPTS -o nosvc -n + start ost `ostdevname 1` $OST_MOUNT_OPTS + start mds $MDSDEV $MDS_MOUNT_OPTS -o nomgs + mkdir -p $MOUNT + mount_client $MOUNT || return 1 + sleep 5 + + echo "blah blah" > $MOUNT/$tfile + cat $MOUNT/$tfile + + umount_client $MOUNT + stop ost -f || return 201 + stop mds -f || return 202 + stop mds -f || return 203 + unload_modules || return 204 + return $rc +} +run_test 41 "mount mds with --nosvc and --nomgs" + +test_42() { #bug 14693 + setup + check_mount || return 2 + do_facet client lctl conf_param lustre.llite.some_wrong_param=10 + umount_client $MOUNT + mount_client $MOUNT || return 1 + cleanup + return 0 +} +run_test 42 "invalid config param should not prevent client from mounting" + +test_43() { #bug 15993 + setup + check_mount || return 2 + testfile=$DIR/$tfile + lma="this-should-be-removed-after-remount-and-accessed" + touch $testfile + echo 
"set/get trusted.lma" + setfattr -n trusted.lma -v $lma $testfile || error "create common EA" + ATTR=$(getfattr -n trusted.lma $testfile 2> /dev/null | grep trusted.lma) + [ "$ATTR" = "trusted.lma=\"$lma\"" ] || error "check common EA" + umount_client $MOUNT + stop_mds + sleep 5 + start_mds + mount_client $MOUNT + check_mount || return 3 +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e + do_facet mds "lctl set_param fail_loc=0x13e" + stat $testfile + do_facet mds "lctl set_param fail_loc=0" + getfattr -d -m trusted $testfile 2> /dev/null | \ + grep "trusted.lma" && error "common EA not removed" || true + cleanup + return 0 +} +run_test 43 "remove common EA if it exists" + equals_msg `basename $0`: test complete [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true diff --git a/lustre/tests/createmany-mpi.c b/lustre/tests/createmany-mpi.c index 289e07174edb33c1b3e869854243ee0f10672c58..524c2d94915228afffb194158ce38cb698d2ca5b 100644 --- a/lustre/tests/createmany-mpi.c +++ b/lustre/tests/createmany-mpi.c @@ -56,6 +56,7 @@ int main(int argc, char ** argv) if (strcmp(argv[1], "-o") == 0) { do_open = 1; + tgt = NULL; } else if (strncmp(argv[1], "-l", 2) == 0 && argv[1][2]) { tgt = argv[1] + 2; do_link = 1; diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 011e66c4921a8b546356c0b6e4f8017f37fb7732..cd4ffaf7e9ae2dcf7ed56087fd4b9f7f51551277 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -8,10 +8,17 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} init_test_env $@ -. ${CONFIG:=$LUSTRE/tests/cfg/insanity-local.sh} +. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} ALWAYS_EXCEPT="10 $INSANITY_EXCEPT" +if [ "$FAILURE_MODE" = "HARD" ]; then + mixed_ost_devs && CONFIG_EXCEPTIONS="0 2 4 5 6 8" && \ + echo -n "Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. 
" && \ + echo "Except the tests: $CONFIG_EXCEPTIONS" && \ + ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS" +fi + # [ "$SLOW" = "no" ] && EXCEPT_SLOW="" @@ -20,13 +27,17 @@ CLEANUP=${CLEANUP:-""} build_test_filter +SINGLECLIENT=${SINGLECLIENT:-$HOSTNAME} +LIVE_CLIENT=${LIVE_CLIENT:-$SINGLECLIENT} +FAIL_CLIENTS=${FAIL_CLIENTS:-$RCLIENTS} + assert_env mds_HOST MDS_MKFS_OPTS MDSDEV assert_env ost_HOST OST_MKFS_OPTS OSTCOUNT assert_env LIVE_CLIENT FSNAME # This can be a regexp, to allow more clients -CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS $EXTRA_CLIENTS`"} +CLIENTS=${CLIENTS:-"`comma_list $LIVE_CLIENT $FAIL_CLIENTS`"} DIR=${DIR:-$MOUNT} @@ -69,11 +80,14 @@ reboot_node() { fail_clients() { num=$1 + + log "Request clients to fail: ${num}. Num of clients to fail: ${FAIL_NUM}, already failed: $DOWN_NUM" if [ -z "$num" ] || [ "$num" -gt $((FAIL_NUM - DOWN_NUM)) ]; then num=$((FAIL_NUM - DOWN_NUM)) fi if [ -z "$num" ] || [ "$num" -le 0 ]; then + log "No clients failed!" return fi @@ -156,15 +170,11 @@ test_0() { echo "Waiting for df pid: $DFPID" wait $DFPID || { echo "df returned $?" && return 1; } - facet_failover ost1 || return 4 - echo "Waiting for df pid: $DFPID" - wait $DFPID || { echo "df returned $?" && return 2; } - - if [ $OSTCOUNT -gt 1 ]; then - facet_failover ost2 || return 5 - echo "Waiting for df pid: $DFPID" - wait $DFPID || { echo "df returned $?" && return 3; } - fi + for i in $(seq $OSTCOUNT) ; do + facet_failover ost$i || return 4 + echo "Waiting for df pid: $DFPID" + wait $DFPID || { echo "df returned $?" 
&& return 3; } + done return 0 } run_test 0 "Fail all nodes, independently" @@ -293,7 +303,7 @@ run_test 4 "Fourth Failure Mode: OST/MDS `date`" ############### Fifth Failure Mode ############### test_5() { - [ $OSTCOUNT -lt 1 ] && skip "$OSTCOUNT < 1, not enough OSTs" && return 0 + [ $OSTCOUNT -lt 2 ] && skip "$OSTCOUNT < 2, not enough OSTs" && return 0 echo "Fifth Failure Mode: OST/OST `date`" diff --git a/lustre/tests/it_test.c b/lustre/tests/it_test.c index da9256da3c96d603509e8af43909f21ed2dbd901..44c7f6f2451df806d44e6407798988020a7aee42 100644 --- a/lustre/tests/it_test.c +++ b/lustre/tests/it_test.c @@ -7,6 +7,7 @@ #include <time.h> #include <sys/time.h> +#include <libcfs/kp30.h> #include <../ldlm/interval_tree.c> #define dprintf(fmt, args...) //printf(fmt, ##args) @@ -17,7 +18,7 @@ } while(0) #define __F(ext) (ext)->start, (ext)->end -#define __S "[%llx:%llx]" +#define __S "["LPX64":"LPX64"]" #define ALIGN_SIZE 4096 #define ALIGN_MASK (~(ALIGN_SIZE - 1)) diff --git a/lustre/tests/ll_dirstripe_verify.c b/lustre/tests/ll_dirstripe_verify.c index 8edaaaef127026a3f0cd8301a6808a8e73e8c228..99981427a54f4c6049a720513bf4630111dd3324 100644 --- a/lustre/tests/ll_dirstripe_verify.c +++ b/lustre/tests/ll_dirstripe_verify.c @@ -20,13 +20,19 @@ #include <liblustre.h> #include <obd.h> #include <lustre_lib.h> -#include <lustre/lustre_user.h> +#include <lustre/liblustreapi.h> #include <obd_lov.h> #include <lnet/lnetctl.h> #define MAX_LOV_UUID_COUNT 1000 +union { + struct obd_uuid uuid; + char name[0]; +} lov; +#define lov_uuid lov.uuid +#define lov_name lov.name /* Returns bytes read on success and a negative value on failure. 
* If zero bytes are read it will be treated as failure as such @@ -40,18 +46,18 @@ int read_proc_entry(char *proc_path, char *buf, int len) fd = open(proc_path, O_RDONLY); if (fd == -1) { - fprintf(stderr, "open('%s') failed: %s\n", - proc_path, strerror(errno)); + llapi_err(LLAPI_MSG_ERROR, "open('%s') failed\n", + proc_path); /* errno text is reported by llapi_err() */ return -2; } rc = read(fd, buf, len - 1); if (rc < 0) { - fprintf(stderr, "read('%s') failed: %s\n", - proc_path, strerror(errno)); + llapi_err(LLAPI_MSG_ERROR, "read('%s') failed\n", proc_path); rc = -3; } else if (rc == 0) { - fprintf(stderr, "read('%s') zero bytes\n", proc_path); + llapi_err(LLAPI_MSG_ERROR | LLAPI_MSG_NO_ERRNO, + "read('%s') zero bytes\n", proc_path); rc = -4; } else if (/* rc > 0 && */ buf[rc - 1] == '\n') { buf[rc - 1] = '\0'; /* Remove trailing newline */ @@ -64,7 +70,7 @@ int read_proc_entry(char *proc_path, char *buf, int len) int compare(struct lov_user_md *lum_dir, struct lov_user_md *lum_file1, struct lov_user_md *lum_file2) { - int stripe_count = 0; + int stripe_count = 0, min_stripe_count = 0, def_stripe_count = 1; int stripe_size = 0; int stripe_offset = -1; int ost_count; @@ -76,44 +82,60 @@ int compare(struct lov_user_md *lum_dir, struct lov_user_md *lum_file1, fp = popen("\\ls -d /proc/fs/lustre/lov/*lov* | head -1", "r"); if (!fp) { - fprintf(stderr, "open(lustre/lov/*lov*) failed: %s\n", - strerror(errno)); + llapi_err(LLAPI_MSG_ERROR, + "open(lustre/lov/*lov*) failed\n"); return 2; } - if (fscanf(fp, "%s", lov_path) < 1) { - fprintf(stderr, "read(lustre/lov/*lov*) failed: %s\n", - strerror(errno)); + if (fscanf(fp, "%s", lov_path) < 1) { + llapi_err(LLAPI_MSG_ERROR,"read lustre/lov/*lov* failed\n"); pclose(fp); return 3; } pclose(fp); - if (lum_dir == NULL) { - snprintf(tmp_path, sizeof(tmp_path) - 1, "%s/stripecount", - lov_path); - if (read_proc_entry(tmp_path, buf, sizeof(buf)) < 0) - return 5; - - stripe_count = atoi(buf); - } else { - stripe_count = (int)lum_dir->lmm_stripe_count; - }
- if (stripe_count == 0) - stripe_count = 1; + snprintf(tmp_path, sizeof(tmp_path) - 1, "%s/stripecount", lov_path); + if (read_proc_entry(tmp_path, buf, sizeof(buf)) < 0) + return 5; + def_stripe_count = (short)atoi(buf); snprintf(tmp_path, sizeof(tmp_path) - 1, "%s/numobd", lov_path); if (read_proc_entry(tmp_path, buf, sizeof(buf)) < 0) return 6; - ost_count = atoi(buf); - stripe_count = stripe_count > 0 ? stripe_count : ost_count; - if (lum_file1->lmm_stripe_count != stripe_count) { - fprintf(stderr, "file1 stripe count %d != dir %d\n", - lum_file1->lmm_stripe_count, stripe_count); + if (lum_dir == NULL) { + stripe_count = def_stripe_count; + min_stripe_count = -1; + } else { + stripe_count = (short)lum_dir->lmm_stripe_count; + printf("dir stripe %d, ", stripe_count); + min_stripe_count = 1; + } + + printf("default stripe %d, ost count %d\n", + def_stripe_count, ost_count); + if (stripe_count == 0) { + min_stripe_count = -1; + stripe_count = 1; + } + + stripe_count = (stripe_count > 0 && stripe_count <= ost_count) ? + stripe_count : ost_count; + min_stripe_count = min_stripe_count > 0 ? 
stripe_count : + ((stripe_count + 1) / 2); + + if (lum_file1->lmm_stripe_count != stripe_count || + lum_file1->lmm_stripe_count < min_stripe_count) { + llapi_err(LLAPI_MSG_ERROR, "file1 stripe count %d != dir %d\n", + lum_file1->lmm_stripe_count, stripe_count); return 7; } + if (lum_file1->lmm_stripe_count < stripe_count) + llapi_err(LLAPI_MSG_WARN, "warning: file1 used fewer stripes " + "%d < dir %d (likely due to bug 4900)\n", + lum_file1->lmm_stripe_count, stripe_count); + if (lum_dir != NULL) stripe_size = (int)lum_dir->lmm_stripe_size; if (stripe_size == 0) { @@ -126,8 +148,8 @@ int compare(struct lov_user_md *lum_dir, struct lov_user_md *lum_file1, } if (lum_file1->lmm_stripe_size != stripe_size) { - fprintf(stderr, "file1 stripe size %d != dir %d\n", - lum_file1->lmm_stripe_size, stripe_size); + llapi_err(LLAPI_MSG_ERROR, "file1 stripe size %d != dir %d\n", + lum_file1->lmm_stripe_size, stripe_size); return 8; } @@ -137,10 +159,11 @@ int compare(struct lov_user_md *lum_dir, struct lov_user_md *lum_file1, for (i = 0; i < stripe_count; i++) if (lum_file1->lmm_objects[i].l_ost_idx != (stripe_offset + i) % ost_count) { - fprintf(stderr, "warning: file1 non-sequential " - "stripe[%d] %d != %d\n", i, - lum_file1->lmm_objects[i].l_ost_idx, - (stripe_offset + i) % ost_count); + llapi_err(LLAPI_MSG_WARN, + "warning: file1 non-sequential " + "stripe[%d] %d != %d\n", i, + lum_file1->lmm_objects[i].l_ost_idx, + (stripe_offset + i) % ost_count); } } else if (lum_file2 != NULL) { int next, idx, stripe = stripe_count - 1; @@ -148,10 +171,11 @@ int compare(struct lov_user_md *lum_dir, struct lov_user_md *lum_file1, ost_count; idx = lum_file2->lmm_objects[0].l_ost_idx; if (idx != next) { - fprintf(stderr, "warning: non-sequential " - "file1 stripe[%d] %d != file2 stripe[0] %d\n", - stripe, - lum_file1->lmm_objects[stripe].l_ost_idx, idx); + llapi_err(LLAPI_MSG_WARN, "warning: non-sequential " + "file1 stripe[%d] %d != file2 stripe[0] %d\n", + stripe, + 
lum_file1->lmm_objects[stripe].l_ost_idx, + idx); } } @@ -164,69 +188,81 @@ int main(int argc, char **argv) struct lov_user_md *lum_dir, *lum_file1 = NULL, *lum_file2 = NULL; int rc; int lum_size; - char *fname; if (argc < 3) { - fprintf(stderr, "Usage: %s <dirname> <filename1> [filename2]\n", - argv[0]); + llapi_err(LLAPI_MSG_ERROR, + "Usage: %s <dirname> <filename1> [filename2]\n", + argv[0]); return 1; } dir = opendir(argv[1]); if (dir == NULL) { - fprintf(stderr, "%s opendir failed: %s\n", argv[1], - strerror(errno)); - return errno; + rc = errno; + llapi_err(LLAPI_MSG_ERROR, + "error: %s opendir failed\n", argv[1]); + return rc; } lum_size = lov_mds_md_size(MAX_LOV_UUID_COUNT); if ((lum_dir = (struct lov_user_md *)malloc(lum_size)) == NULL) { - fprintf(stderr, "unable to allocate memory for ioctl's"); - return errno; + rc = ENOMEM; + llapi_err(LLAPI_MSG_ERROR, "error: can't allocate %d bytes " + "for dir EA", lum_size); + goto cleanup; } - rc = ioctl(dirfd(dir), LL_IOC_LOV_GETSTRIPE, lum_dir); + rc = llapi_file_get_stripe(argv[1], lum_dir); if (rc) { if (errno == ENODATA) { free(lum_dir); lum_dir = NULL; } else { rc = errno; + llapi_err(LLAPI_MSG_ERROR, + "error: can't get EA for %s\n", argv[1]); goto cleanup; } } - if ((lum_file1 = (struct lov_user_md *)malloc(lum_size)) == NULL) { - fprintf(stderr, "unable to allocate memory for ioctl's"); + /* XXX should be llapi_lov_getname() */ + rc = llapi_file_get_lov_uuid(argv[1], &lov_uuid); + if (rc) { rc = errno; - goto cleanup; + llapi_err(LLAPI_MSG_ERROR, "error: can't get lov name for %s\n", + argv[1]); + goto cleanup; /* leave via the common exit path so dir and lum_dir are released */ } - fname = strrchr(argv[2], '/'); - fname = (fname == NULL ?
argv[2] : fname + 1); + if ((lum_file1 = (struct lov_user_md *)malloc(lum_size)) == NULL) { + rc = ENOMEM; + llapi_err(LLAPI_MSG_ERROR, + "error: can't allocate %d bytes for EA\n", lum_size); + goto cleanup; + } - strncpy((char *)lum_file1, fname, lum_size); - rc = ioctl(dirfd(dir), IOC_MDC_GETFILESTRIPE, lum_file1); + rc = llapi_file_get_stripe(argv[2], lum_file1); if (rc) { rc = errno; + llapi_err(LLAPI_MSG_ERROR, + "error: unable to get EA for %s\n", argv[2]); goto cleanup; } if (argc == 4) { lum_file2 = (struct lov_user_md *)malloc(lum_size); if (lum_file2 == NULL) { - fprintf(stderr, - "unable to allocate memory for ioctl's"); - rc = errno; + rc = ENOMEM; + llapi_err(LLAPI_MSG_ERROR, "error: can't allocate %d " + "bytes for file2 EA\n", lum_size); goto cleanup; } - fname = strrchr(argv[3], '/'); - fname = (fname == NULL ? argv[3] : fname + 1); - strncpy((char *)lum_file2, fname, lum_size); - rc = ioctl(dirfd(dir), IOC_MDC_GETFILESTRIPE, lum_file2); + rc = llapi_file_get_stripe(argv[3], lum_file2); if (rc) { rc = errno; + llapi_err(LLAPI_MSG_ERROR, + "error: can't get EA for %s\n", argv[3]); goto cleanup; } } @@ -234,6 +270,7 @@ int main(int argc, char **argv) rc = compare(lum_dir, lum_file1, lum_file2); cleanup: + closedir(dir); if (lum_dir != NULL) free(lum_dir); if (lum_file1 != NULL) diff --git a/lustre/tests/llog-test.sh b/lustre/tests/llog-test.sh index 56c7f0a36e8a1132db6d546dc4754f361a4db34f..1de2b4f9c7a1e8058c6819687f5011f24b72c55c 100644 --- a/lustre/tests/llog-test.sh +++ b/lustre/tests/llog-test.sh @@ -59,7 +59,7 @@ setup() { log "== test 00: target handle mismatch (bug 5317) === `date +%H:%M:%S`" #define OBD_FAIL_OST_ALL_REPLY_NET 0x211 - do_facet ost "sysctl -w lustre.fail_loc=0x80000211" + do_facet ost "lctl set_param fail_loc=0x80000211" zconf_mount `hostname` $MOUNT && df $MOUNT && pass || error "mount fail" fi diff --git a/lustre/tests/openfile.c b/lustre/tests/openfile.c index 
66aba746891e0c341ec012366eca1cb3250ae498..6638ac1a874afac32e73946383088aaddef2917f 100644 --- a/lustre/tests/openfile.c +++ b/lustre/tests/openfile.c @@ -58,14 +58,14 @@ void Usage_and_abort(void) int main(int argc, char** argv) { int fd; - int flags=0; - mode_t mode=0644; - char* fname=NULL; - int mode_set=0; - int flag_set=0; - int file_set=0; + int flags = 0; + mode_t mode = 0644; + char* fname = NULL; + int mode_set = 0; + int flag_set = 0; int c; - int save_errno; + int save_errno = 0; + int print_usage = 0; char* cloned_flags = NULL; if (argc == 1) @@ -79,7 +79,8 @@ int main(int argc, char** argv) cloned_flags = (char *)malloc(strlen(optarg)+1); if (cloned_flags == NULL) { fprintf(stderr, "Insufficient memory.\n"); - exit(-1); + save_errno = -1; + goto out; } strncpy(cloned_flags, optarg, strlen(optarg)+1); @@ -110,10 +111,10 @@ int main(int argc, char** argv) if (flag_table[i].flag == -1) { fprintf(stderr, "No such flag: %s\n", tmp); - exit(-1); + save_errno = -1; + goto out; } } - free(cloned_flags); #ifdef DEBUG printf("flags = %x\n", flags); #endif @@ -131,21 +132,23 @@ int main(int argc, char** argv) break; default: fprintf(stderr, "Bad parameters.\n"); - Usage_and_abort(); + print_usage = 1; + goto out; } } if (optind == argc) { fprintf(stderr, "Bad parameters.\n"); - Usage_and_abort(); + print_usage = 1; + goto out; } fname = argv[optind]; - file_set = 1; - if (!flag_set || !file_set) { + if (!flag_set) { fprintf(stderr, "Missing flag or file-name\n"); - exit(-1); + save_errno = -1; + goto out; } @@ -164,14 +167,21 @@ int main(int argc, char** argv) printf(", mode=%o", mode); printf(")\n"); close(fd); - return 0; + } else { + fprintf(stderr, "Error in opening file \"%s\"(flags=%s", + fname, cloned_flags); } - fprintf(stderr, "Error in opening file \"%s\"(flags=%s", - fname, cloned_flags); if (mode_set) fprintf(stderr, ", mode=%o", mode); fprintf(stderr, ") %d: %s\n", save_errno, strerror(save_errno)); +out: + if (cloned_flags) + free(cloned_flags); + 
if (print_usage) + Usage_and_abort(); + return save_errno; } + diff --git a/lustre/tests/parallel_grouplock.c b/lustre/tests/parallel_grouplock.c index 2369209b95553d0e33fa58c5ba149efbb8a23b3d..6230495ce699a79bfa66ea0a2ded747c3b00d600 100644 --- a/lustre/tests/parallel_grouplock.c +++ b/lustre/tests/parallel_grouplock.c @@ -106,7 +106,7 @@ void grouplock_test1(char *filename, int fd, char *errmsg) if (rank == 0) { int iter = MAX_WAITING_TIME; int flag1, flag2; - + /* reading task will tell us when it completes */ MPI_Irecv(&temp1, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &req1); /* 2nd locking task will tell us when it completes */ @@ -140,7 +140,7 @@ void grouplock_test1(char *filename, int fd, char *errmsg) if (rank == 0) { int iter = MAX_WAITING_TIME; int flag1; - + do { iter--; if (!iter) { @@ -217,7 +217,7 @@ void grouplock_test2(char *filename, int fd, char *errmsg) if (rank == 0) { int iter = MAX_WAITING_TIME; int flag1, flag2, flag3; - + /* 2nd locking task will tell us when it completes */ MPI_Irecv(&temp1, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &req1); /* 3nd locking task will tell us when it completes */ @@ -284,7 +284,7 @@ void grouplock_test2(char *filename, int fd, char *errmsg) if (rank == 0) { int iter = MAX_WAITING_TIME; int flag3; - + do { iter--; if (!iter) { @@ -366,7 +366,7 @@ void grouplock_test3(char *filename, int fd, char *errmsg) if (rank == 0) { int iter = MAX_WAITING_TIME; int flag1, flag2; - + /* reading task will tell us when it completes */ MPI_Irecv(&temp1, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &req1); /* 2nd locking task will tell us when it completes */ @@ -400,7 +400,7 @@ void grouplock_test3(char *filename, int fd, char *errmsg) if (rank == 0) { int iter = MAX_WAITING_TIME; int flag1; - + do { iter--; usleep(100); @@ -418,7 +418,7 @@ void grouplock_test3(char *filename, int fd, char *errmsg) filename, rc); FAIL(errmsg); } - + do { iter--; if (!iter) { @@ -435,7 +435,7 @@ void grouplock_test3(char *filename, int fd, char *errmsg) } -/* +/* * 
process1 attempts CW(gid=1) -- granted * process2 attempts PR on non-blocking fd -> should return -EWOULDBLOCK * process3 attempts CW(gid=2) on non-blocking fd -> should return -EWOULDBLOCK @@ -501,7 +501,7 @@ void grouplock_test4(char *filename, int fd, char *errmsg) int flag1, flag2; MPI_Request req1, req2; int temp1, temp2; - + /* reading task will tell us when it completes */ MPI_Irecv(&temp1, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &req1); /* 2nd locking task will tell us when it completes */ @@ -536,9 +536,7 @@ void grouplock_test4(char *filename, int fd, char *errmsg) */ void grouplock_test5(char *filename, int fd, char *errmsg) { - int rc, count, gid = 1; - char buf[LPGL_FILEN]; - char zeros[LPGL_FILEN]; + int rc, gid = 1; MPI_Request req1, req2; int temp1, temp2; @@ -565,7 +563,7 @@ void grouplock_test5(char *filename, int fd, char *errmsg) if (rank == 0) { int iter = MAX_WAITING_TIME; int flag1, flag2; - + /* 3rd locking task will tell us when it completes */ MPI_Irecv(&temp1, 1, MPI_INT, 1, 1, MPI_COMM_WORLD, &req1); /* 2nd locking task will tell us when it completes */ @@ -592,7 +590,7 @@ void grouplock_test5(char *filename, int fd, char *errmsg) filename, rc); FAIL(errmsg); } - + do { iter--; if (!iter) { @@ -615,8 +613,6 @@ void grouplock_test5(char *filename, int fd, char *errmsg) } MPI_Barrier(MPI_COMM_WORLD); - - } /* @@ -653,7 +649,7 @@ void grouplock_errorstest(char *filename, int fd, char *errmsg) /* To not do lots of separate tests with lots of fd opening/closing, different parts of this test are performed in different processes */ - + if (rank == 0 || rank == 1 ) { if ((rc = ioctl(fd, LL_IOC_GROUP_LOCK, gid)) == -1) { sprintf(errmsg, "ioctl GROUP_LOCK of file %s return %d", @@ -668,7 +664,7 @@ void grouplock_errorstest(char *filename, int fd, char *errmsg) if (errno != EINVAL) { sprintf(errmsg, "Double GROUP lock failed with errno %d instead of EINVAL\n", errno); FAIL(errmsg); - } + } } else { FAIL("Taking second GROUP lock on same fd succeed\n"); } 
@@ -680,7 +676,7 @@ void grouplock_errorstest(char *filename, int fd, char *errmsg) if (errno != EINVAL) { sprintf(errmsg, "Double GROUP lock different gid failed with errno %d instead of EINVAL\n", errno); FAIL(errmsg); - } + } } else { FAIL("Taking second GROUP lock on same fd, different gid, succeed\n"); } @@ -693,7 +689,7 @@ void grouplock_errorstest(char *filename, int fd, char *errmsg) sprintf(errmsg, "GROUP unlock with wrong gid failed with errno %d instead of EINVAL\n", errno); FAIL(errmsg); - } + } } else { FAIL("GROUP unlock with wrong gid succeed\n"); } @@ -714,7 +710,7 @@ void grouplock_errorstest(char *filename, int fd, char *errmsg) sprintf(errmsg, "GROUP unlock on never locked fd failed with errno %d instead of EINVAL\n", errno); FAIL(errmsg); - } + } } else { FAIL("GROUP unlock on never locked fd succeed\n"); } @@ -723,7 +719,7 @@ void grouplock_errorstest(char *filename, int fd, char *errmsg) void grouplock_file(char *name, int items) { - int i, fd; + int fd; char filename[MAX_FILENAME_LEN]; char errmsg[MAX_FILENAME_LEN+20]; @@ -821,7 +817,6 @@ int main(int argc, char *argv[]) { char c; int i, iterations = 1; - int tr = 1; /* Check for -h parameter before MPI_Init so the binary can be called directly, without, for instance, mpirun */ @@ -875,8 +870,8 @@ int main(int argc, char *argv[]) } if (testdir == NULL && rank == 0) { - fprintf(stderr, "Please specify a test directory! (\"%s -h\" for help)\n", - argv[0]); + fprintf(stderr, "Please specify a test directory! 
" + "(\"%s -h\" for help)\n", argv[0]); MPI_Abort(MPI_COMM_WORLD, 2); } diff --git a/lustre/tests/racer/dir_create.sh b/lustre/tests/racer/dir_create.sh new file mode 100755 index 0000000000000000000000000000000000000000..80fbbe128574f25890c271f27a83d30ca95ddb4b --- /dev/null +++ b/lustre/tests/racer/dir_create.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +DIR=$1 +MAX=$2 + +create(){ + echo "asdf" > $DIR/$file/$file/$file +} + +while /bin/true ; do + file=$(($RANDOM%$MAX)) + mkdir -p $DIR/$file/$file/ 2> /dev/null + create 2> /dev/null +done diff --git a/lustre/tests/racer/file_concat.sh b/lustre/tests/racer/file_concat.sh new file mode 100755 index 0000000000000000000000000000000000000000..38181ad271208216491aab395e102eae96815849 --- /dev/null +++ b/lustre/tests/racer/file_concat.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +DIR=$1 +MAX=$2 + +concat(){ + cat $DIR/$file >> $DIR/$new_file + cat $DIR/$file/$file/$file >> $DIR/$new_file + +} + +while /bin/true ; do + file=$(($RANDOM%$MAX)) + new_file=$(($RANDOM%$MAX)) + concat 2> /dev/null +done diff --git a/lustre/tests/racer/file_create.sh b/lustre/tests/racer/file_create.sh new file mode 100755 index 0000000000000000000000000000000000000000..d94502c117b6d02b38fdbdf877f54b61c5641e4a --- /dev/null +++ b/lustre/tests/racer/file_create.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +DIR=$1 +MAX=$2 +MAX_MB=256 + +create() { + SIZE=$(($RANDOM*MAX_MB/32)) + echo "file_create: SIZE=$SIZE" + dd if=/dev/zero of=$DIR/$file bs=1k count=$SIZE +} + +while /bin/true ; do + file=$(($RANDOM%$MAX)) + create 2> /dev/null +done + diff --git a/lustre/tests/racer/file_link.sh b/lustre/tests/racer/file_link.sh new file mode 100755 index 0000000000000000000000000000000000000000..5c1cac726fabacc44eba1a5db733c56cc68c7444 --- /dev/null +++ b/lustre/tests/racer/file_link.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +DIR=$1 +MAX=$2 + +while /bin/true ; do + file=$(($RANDOM%$MAX)) + new_file=$((($file + 1)%$MAX)) + ln $DIR/$file $DIR/$new_file 2> /dev/null # hard-link source must be named 
inside $DIR, not the current directory +done diff --git
a/lustre/tests/racer/file_list.sh b/lustre/tests/racer/file_list.sh new file mode 100755 index 0000000000000000000000000000000000000000..44a3e5fc0d0e47b4ece55479b5aaf846f58f64a6 --- /dev/null +++ b/lustre/tests/racer/file_list.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +DIR=$1 +MAX=$2 + +while /bin/true ; do + ls -R $DIR/ > /dev/null 2> /dev/null & + ls -R $DIR/ > /dev/null 2> /dev/null & + ls -R $DIR/ > /dev/null 2> /dev/null & + ls -R $DIR/ > /dev/null 2> /dev/null & + ls -R $DIR/ > /dev/null 2> /dev/null & + + ls -R $DIR/ > /dev/null 2> /dev/null & + ls -R $DIR/ > /dev/null 2> /dev/null & + ls -R $DIR/ > /dev/null 2> /dev/null & + ls -R $DIR/ > /dev/null 2> /dev/null & + ls -R $DIR/ > /dev/null 2> /dev/null & + + wait + sleep 1 +done diff --git a/lustre/tests/racer/file_rename.sh b/lustre/tests/racer/file_rename.sh new file mode 100755 index 0000000000000000000000000000000000000000..955210293e6d143079397a98337efad7dde7910c --- /dev/null +++ b/lustre/tests/racer/file_rename.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +DIR=$1 +MAX=$2 + +while /bin/true ; do + file=$(($RANDOM%$MAX)) + new_file=$((($file + 1)%$MAX)) + mv $DIR/$file $DIR/$new_file 2> /dev/null +done diff --git a/lustre/tests/racer/file_rm.sh b/lustre/tests/racer/file_rm.sh new file mode 100755 index 0000000000000000000000000000000000000000..41d3d62db4aa87bf7821d7fc607406554d9cff42 --- /dev/null +++ b/lustre/tests/racer/file_rm.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +DIR=$1 +MAX=$2 + +while /bin/true ; do + file=$(($RANDOM%$MAX)) + rm -rf $DIR/$file 2> /dev/null + sleep 1 +done + + diff --git a/lustre/tests/racer/file_symlink.sh b/lustre/tests/racer/file_symlink.sh new file mode 100755 index 0000000000000000000000000000000000000000..44771a52d02cb515822b882ab333521eef3c32d2 --- /dev/null +++ b/lustre/tests/racer/file_symlink.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +DIR=$1 +MAX=$2 + +while /bin/true ; do + file=$(($RANDOM%$MAX)) + new_file=$((($file + 1)%$MAX)) + ln -s $file $DIR/$new_file 2> /dev/null + ln -s $file/$file/$file 
$DIR/$new_file 2> /dev/null +done diff --git a/lustre/tests/racer/racer.sh b/lustre/tests/racer/racer.sh new file mode 100755 index 0000000000000000000000000000000000000000..c1f8b9970fc683964adc1be2547d446bb0c3f205 --- /dev/null +++ b/lustre/tests/racer/racer.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +MAX_FILES=${MAX_FILES:-20} +DIR=${DIR:-$1} +DIR=${DIR:-"/mnt/lustre/racer"} +if ! [ -d "$DIR" -o -d "`dirname $DIR`" ]; then + echo "$0: '$DIR' and '`dirname $DIR`' are not directories" + exit 1 +fi +DURATION=${DURATION:-$((60*5))} + +NUM_THREADS=${NUM_THREADS:-$2} +NUM_THREADS=${NUM_THREADS:-3} + +[ -e $DIR ] || mkdir $DIR + +racer_cleanup() +{ + killall file_create.sh + killall dir_create.sh + killall file_rm.sh + killall file_rename.sh + killall file_link.sh + killall file_symlink.sh + killall file_list.sh + killall file_concat.sh + trap 0 +} + +echo "Running $0 for $DURATION seconds. CTRL-C to exit" +trap " + echo \"Cleaning up\" + racer_cleanup + exit 0 +" 2 + +cd `dirname $0` +for N in `seq 1 $NUM_THREADS`; do + ./file_create.sh $DIR $MAX_FILES & + ./dir_create.sh $DIR $MAX_FILES & + ./file_rename.sh $DIR $MAX_FILES & + ./file_link.sh $DIR $MAX_FILES & + ./file_symlink.sh $DIR $MAX_FILES & + ./file_concat.sh $DIR $MAX_FILES & + ./file_list.sh $DIR & + ./file_rm.sh $DIR $MAX_FILES & +done + +sleep $DURATION; +racer_cleanup +# Check to see whether our test DIR is still available. +df $DIR +RC=$? +if [ $RC -eq 0 ]; then + echo "We survived $0 for $DURATION seconds." +fi +exit $RC diff --git a/lustre/tests/random-reads.c b/lustre/tests/reads.c similarity index 63% rename from lustre/tests/random-reads.c rename to lustre/tests/reads.c index 1722afb34f419dcaa23943e42f86565c3316b7a4..77ebeaaa568e72398b812fe6aebbd16362431dda 100644 --- a/lustre/tests/random-reads.c +++ b/lustre/tests/reads.c @@ -1,7 +1,8 @@ /* - * Lustre Random Reads test + * Lustre Reads test * * Copyright (c) 2005 Cluster File Systems, Inc. + * Copyright (c) 2008 SUN Microsystems.
* * Author: Nikita Danilov <nikita@clusterfs.com> * @@ -34,13 +35,14 @@ #include <sys/types.h> #include <sys/time.h> -long long atoll(const char *nptr); - static void usage(void) { - printf("random-reads: read random chunks of a file.\n"); + printf("reads: read random or stride chunks of a file.\n"); printf("Usage:\n\n"); - printf("random-reads -f <filename> -s <filesize> -b <buffersize> -a <adjacent reads> [-v] [-h] [-C] [-S <seed>] [-n <iterations>] [-w <width>] [-t <timelimit>]\n"); + printf("reads -f <filename> -s <filesize> -b <buffersize> " + "-a <adjacent reads> [-v] [-h] [-C] [-l <stride_length> ] " + "[ -o <stride_offset> ] [-S <seed>] [-n <iterations>] " + "[-w <width>] [-t <timelimit>]\n"); } enum { @@ -82,6 +84,8 @@ int main(int argc, char **argv) unsigned int seed = 0; unsigned long iterations = 0; unsigned long timelimit = 24 * 3600; + unsigned long stride_length = 0; + unsigned long stride_offset = 0; int opt; int fd; @@ -95,9 +99,10 @@ int main(int argc, char **argv) double usecs; char *buf; + char *term; do { - opt = getopt(argc, argv, "f:s:b:va:hCS:n:t:w:"); + opt = getopt(argc, argv, "f:s:b:va:hCS:n:t:l:o:w:"); switch (opt) { case -1: break; @@ -113,28 +118,81 @@ int main(int argc, char **argv) fname = strdup(optarg); break; case 's': - size = atoll(optarg); + size = strtol(optarg, &term, 0); + if (term == optarg) { + fprintf (stderr, "Can't parse size %s\n", optarg); + usage(); + return RR_SET; + } break; case 'b': - bsize = atol(optarg); + bsize = strtol(optarg, &term, 0); + if (term == optarg) { + fprintf (stderr, "Can't parse bsize %s\n", optarg); + usage(); + return RR_SET; + } break; case 'a': - ad = atoi(optarg); + ad = (int)strtol(optarg, &term, 0); + if (term == optarg) { + fprintf (stderr, "Can't parse ad %s\n", optarg); + usage(); + return RR_SET; + } break; case 'C': preclean = 1; break; case 'S': - seed = atol(optarg); + seed = strtol(optarg, &term, 0); + if (term == optarg) { + fprintf (stderr, "Can't parse seed %s\n", optarg); +
usage(); + return RR_SET; + } break; case 'n': - iterations = atoll(optarg); + iterations = strtol(optarg, &term, 0); + if (term == optarg) { + fprintf (stderr, "Can't parse iterations %s\n", optarg); + usage(); + return RR_SET; + } + break; + break; case 't': - timelimit = atoll(optarg); + timelimit = strtol(optarg, &term, 0); + if (term == optarg) { + fprintf (stderr, "Can't parse timelimit %s\n", optarg); + usage(); + return RR_SET; + } + break; + case 'l': + stride_length = strtol(optarg, &term, 0); + if (term == optarg) { + fprintf (stderr, "Can't parse stride_length %s\n", optarg); + usage(); + return RR_SET; + } + break; + case 'o': + stride_offset = strtol(optarg, &term, 0); + if (term == optarg) { + fprintf (stderr, "Can't parse stride_offset %s\n", optarg); + usage(); + return RR_SET; + } break; case 'w': - width = atoi(optarg); + width = (int)strtol(optarg, &term, 0); + if (term == optarg) { + fprintf (stderr, "Can't parse width %s\n", optarg); + usage(); + return RR_SET; + } break; } } while (opt != -1); @@ -171,6 +229,7 @@ int main(int argc, char **argv) if (ret < 0) { LOG(LOG_CRIT, "write() failure: %s\n", strerror(errno)); + close(fd); return RR_PRECLEAN; } } @@ -183,7 +242,12 @@ int main(int argc, char **argv) unsigned long block_nr; int j; - block_nr = (int) ((double)nblocks*rand()/(RAND_MAX+1.0)); + if (stride_length) + block_nr = (unsigned long)(i*stride_length + + stride_offset) % nblocks; + else + block_nr = (unsigned long)((double)nblocks*rand()/ + (RAND_MAX+1.0)); if (i % width == 0) LOG(LOG_INFO, "\n%9lu: ", i); LOG(LOG_INFO, "%7lu ", block_nr); @@ -193,6 +257,7 @@ int main(int argc, char **argv) LOG(LOG_CRIT, "pread(...%zi, %li) got: %zi, %s\n", bsize, block_nr * bsize, ret, strerror(errno)); + close(fd); return RR_READ; } } @@ -200,6 +265,7 @@ int main(int argc, char **argv) if (stop.tv_sec > timelimit) break; } + close(fd); usecs = (stop.tv_sec - start.tv_sec) * 1000000.
+ stop.tv_usec - start.tv_usec; printf("\n%fs, %gMB/s\n", usecs / 1000000., diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 13b61eab464337ad1a207aacdbb35e4fe4b21dbf..a7eaa84574c1ecea68e45d14998a926b1aed3e3a 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -23,6 +23,7 @@ SETUP=${SETUP:-""} CLEANUP=${CLEANUP:-""} cleanup_and_setup_lustre +assert_DIR rm -rf $DIR/[df][0-9]* test_1() { @@ -126,7 +127,7 @@ run_test 11 "wake up a thread waiting for completion after eviction (b=2460)" #b=2494 test_12(){ $LCTL mark multiop $MOUNT/$tfile OS_c - do_facet mds "sysctl -w lustre.fail_loc=0x115" + do_facet mds "lctl set_param fail_loc=0x115" clear_failloc mds $((TIMEOUT * 2)) & multiop_bg_pause $MOUNT/$tfile OS_c || return 1 PID=$! @@ -143,9 +144,9 @@ test_13() { mkdir $MOUNT/readdir || return 1 touch $MOUNT/readdir/newentry || return # OBD_FAIL_MDS_READPAGE_NET|OBD_FAIL_ONCE - do_facet mds "sysctl -w lustre.fail_loc=0x80000104" + do_facet mds "lctl set_param fail_loc=0x80000104" ls $MOUNT/readdir || return 3 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" rm -rf $MOUNT/readdir || return 4 } run_test 13 "mdc_readpage restart test (bug 1138)" @@ -155,14 +156,14 @@ test_14() { mkdir $MOUNT/readdir touch $MOUNT/readdir/newentry # OBD_FAIL_MDS_SENDPAGE|OBD_FAIL_ONCE - do_facet mds "sysctl -w lustre.fail_loc=0x80000106" + do_facet mds "lctl set_param fail_loc=0x80000106" ls $MOUNT/readdir || return 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" } run_test 14 "mdc_readpage resend test (bug 1138)" test_15() { - do_facet mds "sysctl -w lustre.fail_loc=0x80000128" + do_facet mds "lctl set_param fail_loc=0x80000128" touch $DIR/$tfile && return 1 return 0 } @@ -183,11 +184,11 @@ test_16() { stop_read_ahead #define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 | OBD_FAIL_ONCE - do_facet ost1 sysctl -w lustre.fail_loc=0x80000504 + do_facet ost1 lctl 
set_param fail_loc=0x80000504 cancel_lru_locks osc # OST bulk will time out here, client resends do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || return 1 - do_facet ost1 sysctl -w lustre.fail_loc=0 + do_facet ost1 lctl set_param fail_loc=0 # give recovery a chance to finish (shouldn't take long) sleep $TIMEOUT do_facet client "cmp $SAMPLE_FILE $MOUNT/${SAMPLE_FILE##*/}" || return 2 @@ -206,7 +207,7 @@ test_17() { # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE # OST bulk will time out here, client retries - do_facet ost1 sysctl -w lustre.fail_loc=0x80000503 + do_facet ost1 lctl set_param fail_loc=0x80000503 # need to ensure we send an RPC do_facet client cp $SAMPLE_FILE $DIR/$tfile sync @@ -215,7 +216,7 @@ test_17() { # expiring the req, hopefully timeout*2 is enough sleep $(($TIMEOUT*2)) - do_facet ost1 sysctl -w lustre.fail_loc=0 + do_facet ost1 lctl set_param fail_loc=0 do_facet client "df $DIR" # expect cmp to succeed, client resent bulk do_facet client "cmp $SAMPLE_FILE $DIR/$tfile" || return 3 @@ -297,7 +298,7 @@ test_18c() { # OBD_FAIL_OST_CONNECT_NET2 # lost reply to connect request - do_facet ost1 sysctl -w lustre.fail_loc=0x80000225 + do_facet ost1 lctl set_param fail_loc=0x80000225 # force reconnect df $MOUNT > /dev/null 2>&1 sleep 2 @@ -343,7 +344,7 @@ test_20a() { # bug 2983 - ldlm_handle_enqueue cleanup MULTI_PID=$! cancel_lru_locks osc #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 - do_facet ost1 sysctl -w lustre.fail_loc=0x80000308 + do_facet ost1 lctl set_param fail_loc=0x80000308 kill -USR1 $MULTI_PID wait $MULTI_PID rc=$? 
@@ -356,7 +357,7 @@ test_20b() { # bug 2986 - ldlm_handle_enqueue error during open lfs setstripe $DIR/$tdir/${tfile} -i 0 -c 1 cancel_lru_locks osc #define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 - do_facet ost1 sysctl -w lustre.fail_loc=0x80000308 + do_facet ost1 lctl set_param fail_loc=0x80000308 dd if=/etc/hosts of=$DIR/$tdir/$tfile && \ error "didn't fail open enqueue" || true } @@ -368,18 +369,18 @@ test_21a() { multiop_bg_pause $DIR/$tdir-1/f O_c || return 1 close_pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000129" + do_facet mds "lctl set_param fail_loc=0x80000129" multiop $DIR/$tdir-2/f Oc & open_pid=$! sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" - do_facet mds "sysctl -w lustre.fail_loc=0x80000115" + do_facet mds "lctl set_param fail_loc=0x80000115" kill -USR1 $close_pid cancel_lru_locks mdc wait $close_pid || return 1 wait $open_pid || return 2 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" $CHECKSTAT -t file $DIR/$tdir-1/f || return 3 $CHECKSTAT -t file $DIR/$tdir-2/f || return 4 @@ -394,11 +395,11 @@ test_21b() { multiop_bg_pause $DIR/$tdir-1/f O_c || return 1 close_pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000107" + do_facet mds "lctl set_param fail_loc=0x80000107" mcreate $DIR/$tdir-2/f & open_pid=$! sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" kill -USR1 $close_pid cancel_lru_locks mdc @@ -417,19 +418,19 @@ test_21c() { multiop_bg_pause $DIR/$tdir-1/f O_c || return 1 close_pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000107" + do_facet mds "lctl set_param fail_loc=0x80000107" mcreate $DIR/$tdir-2/f & open_pid=$! 
sleep 3 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" - do_facet mds "sysctl -w lustre.fail_loc=0x80000115" + do_facet mds "lctl set_param fail_loc=0x80000115" kill -USR1 $close_pid cancel_lru_locks mdc wait $close_pid || return 1 wait $open_pid || return 2 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" $CHECKSTAT -t file $DIR/$tdir-1/f || return 2 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3 @@ -443,16 +444,16 @@ test_21d() { multiop_bg_pause $DIR/$tdir-1/f O_c || return 1 pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000129" + do_facet mds "lctl set_param fail_loc=0x80000129" multiop $DIR/$tdir-2/f Oc & sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" - do_facet mds "sysctl -w lustre.fail_loc=0x80000122" + do_facet mds "lctl set_param fail_loc=0x80000122" kill -USR1 $pid cancel_lru_locks mdc wait $pid || return 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" $CHECKSTAT -t file $DIR/$tdir-1/f || return 2 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3 @@ -467,10 +468,10 @@ test_21e() { multiop_bg_pause $DIR/$tdir-1/f O_c || return 1 pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000119" + do_facet mds "lctl set_param fail_loc=0x80000119" touch $DIR/$tdir-2/f & sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" kill -USR1 $pid cancel_lru_locks mdc @@ -489,16 +490,16 @@ test_21f() { multiop_bg_pause $DIR/$tdir-1/f O_c || return 1 pid=$! 
- do_facet mds "sysctl -w lustre.fail_loc=0x80000119" + do_facet mds "lctl set_param fail_loc=0x80000119" touch $DIR/$tdir-2/f & sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" - do_facet mds "sysctl -w lustre.fail_loc=0x80000122" + do_facet mds "lctl set_param fail_loc=0x80000122" kill -USR1 $pid cancel_lru_locks mdc wait $pid || return 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" $CHECKSTAT -t file $DIR/$tdir-1/f || return 2 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3 @@ -512,16 +513,16 @@ test_21g() { multiop_bg_pause $DIR/$tdir-1/f O_c || return 1 pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000119" + do_facet mds "lctl set_param fail_loc=0x80000119" touch $DIR/$tdir-2/f & sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" - do_facet mds "sysctl -w lustre.fail_loc=0x80000115" + do_facet mds "lctl set_param fail_loc=0x80000115" kill -USR1 $pid cancel_lru_locks mdc wait $pid || return 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" $CHECKSTAT -t file $DIR/$tdir-1/f || return 2 $CHECKSTAT -t file $DIR/$tdir-2/f || return 3 @@ -535,17 +536,17 @@ test_21h() { multiop_bg_pause $DIR/$tdir-1/f O_c || return 1 pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000107" + do_facet mds "lctl set_param fail_loc=0x80000107" touch $DIR/$tdir-2/f & touch_pid=$! 
sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" - do_facet mds "sysctl -w lustre.fail_loc=0x80000122" + do_facet mds "lctl set_param fail_loc=0x80000122" cancel_lru_locks mdc kill -USR1 $pid wait $pid || return 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" wait $touch_pid || return 2 @@ -560,7 +561,7 @@ test_22() { f1=$DIR/${tfile}-1 f2=$DIR/${tfile}-2 - do_facet mds "sysctl -w lustre.fail_loc=0x80000115" + do_facet mds "lctl set_param fail_loc=0x80000115" multiop $f2 Oc & close_pid=$! @@ -568,7 +569,7 @@ test_22() { multiop $f1 msu || return 1 cancel_lru_locks mdc - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" wait $close_pid || return 2 rm -rf $f2 || return 4 @@ -599,7 +600,7 @@ test_24() { # bug 2248 - eviction fails writeback but app doesn't see it kill -USR1 $MULTI_PID wait $MULTI_PID rc=$? - sysctl -w lustre.fail_loc=0x0 + lctl set_param fail_loc=0x0 client_reconnect [ $rc -eq 0 ] && error_ignore 5494 "multiop didn't fail fsync: rc $rc" || true } @@ -614,7 +615,7 @@ test_26a() { # was test_26 bug 5921 - evict dead exports by pinger OST_NEXP1=`echo $OST_EXP | cut -d' ' -f2` echo starting with $OST_NEXP1 OST exports # OBD_FAIL_PTLRPC_DROP_RPC 0x505 - do_facet client sysctl -w lustre.fail_loc=0x505 + do_facet client lctl set_param fail_loc=0x505 # evictor takes up to 2.25x to evict. But if there's a # race to start the evictor from various obds, the loser # might have to wait for the next ping. 
@@ -623,7 +624,7 @@ test_26a() { # was test_26 bug 5921 - evict dead exports by pinger OST_EXP="`do_facet ost1 lctl get_param -n $OST_FILE`" OST_NEXP2=`echo $OST_EXP | cut -d' ' -f2` echo ending with $OST_NEXP2 OST exports - do_facet client sysctl -w lustre.fail_loc=0x0 + do_facet client lctl set_param fail_loc=0x0 [ $OST_NEXP1 -le $OST_NEXP2 ] && error "client not evicted" return 0 } @@ -662,10 +663,10 @@ test_27() { FAILURE_MODE="SOFT" facet_failover mds #define OBD_FAIL_OSC_SHUTDOWN 0x407 - sysctl -w lustre.fail_loc=0x80000407 + lctl set_param fail_loc=0x80000407 # need to wait for reconnect echo -n waiting for fail_loc - while [ `sysctl -n lustre.fail_loc` -eq -2147482617 ]; do + while [ `lctl get_param -n fail_loc` -eq -2147482617 ]; do sleep 1 echo -n . done @@ -681,7 +682,7 @@ test_28() { # bug 6086 - error adding new clients do_facet client mcreate $MOUNT/$tfile || return 1 drop_bl_callback "chmod 0777 $MOUNT/$tfile" ||echo "evicted as expected" #define OBD_FAIL_MDS_ADD_CLIENT 0x12f - do_facet mds sysctl -w lustre.fail_loc=0x8000012f + do_facet mds lctl set_param fail_loc=0x8000012f # fail once (evicted), reconnect fail (fail_loc), ok df || (sleep 1; df) || (sleep 1; df) || error "reconnect failed" rm -f $MOUNT/$tfile @@ -692,7 +693,7 @@ run_test 28 "handle error adding new clients (bug 6086)" test_50() { mkdir -p $DIR/$tdir debugsave - sysctl -w lnet.debug="-dlmtrace -ha" + lctl set_param debug="-dlmtrace -ha" # put a load of file creates/writes/deletes writemany -q $DIR/$tdir/$tfile 0 5 & CLIENT_PID=$! @@ -750,6 +751,7 @@ test_51() { run_test 51 "failover MDS during recovery" test_52_guts() { + do_facet client "mkdir -p $DIR/$tdir" do_facet client "writemany -q -a $DIR/$tdir/$tfile 300 5" & CLIENT_PID=$! 
echo writemany pid $CLIENT_PID @@ -828,7 +830,7 @@ test_55() { echo "(dd_pid=$DDPID, time=$count)successful" #define OBD_FAIL_OST_DROP_REQ 0x21d - do_facet ost sysctl -w lustre.fail_loc=0x0000021d + do_facet ost lctl set_param fail_loc=0x0000021d # second dd will be never finished dd if=/dev/zero of=$DIR/$tdir/$tfile-2 bs=32M count=4 & DDPID=$! @@ -847,7 +849,7 @@ test_55() { echo "(dd_pid=$DDPID, time=$count)successful" #Recover fail_loc and dd will finish soon - do_facet ost sysctl -w lustre.fail_loc=0 + do_facet ost lctl set_param fail_loc=0 count=0 echo "step3: testing ......" while [ true ]; do @@ -867,9 +869,9 @@ run_test 55 "ost_brw_read/write drops timed-out read/write request" test_56() { # b=11277 #define OBD_FAIL_MDS_RESEND 0x136 touch $DIR/$tfile - do_facet mds sysctl -w lustre.fail_loc=0x80000136 + do_facet mds lctl set_param fail_loc=0x80000136 stat $DIR/$tfile - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 rm -f $DIR/$tfile } run_test 56 "do not allow reconnect to busy exports" @@ -886,12 +888,12 @@ test_57() { # bug 10866 pid=$! sleep 1 #define OBD_FAIL_LPROC_REMOVE 0xB00 - sysctl -w lustre.fail_loc=0x80000B00 + lctl set_param fail_loc=0x80000B00 zconf_umount `hostname` $DIR - sysctl -w lustre.fail_loc=0x80000B00 + lctl set_param fail_loc=0x80000B00 fail_abort mds kill -9 $pid - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 mount_client $DIR do_facet client "df $DIR" } @@ -901,11 +903,11 @@ test_58() { # bug 11546 #define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 touch $MOUNT/$tfile ls -la $MOUNT/$tfile - sysctl -w lustre.fail_loc=0x80000801 + lctl set_param fail_loc=0x80000801 cp $MOUNT/$tfile /dev/null & pid=$! 
sleep 1 - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 drop_bl_callback rm -f $MOUNT/$tfile wait $pid do_facet client "df $DIR" @@ -915,11 +917,11 @@ run_test 58 "Eviction in the middle of open RPC reply processing" test_59() { # bug 10589 zconf_mount `hostname` $MOUNT2 || error "Failed to mount $MOUNT2" echo $DIR2 | grep -q $MOUNT2 || error "DIR2 is not set properly: $DIR2" - sysctl -w lustre.fail_loc=0x311 + lctl set_param fail_loc=0x311 writes=$(LANG=C dd if=/dev/zero of=$DIR2/$tfile count=1 2>&1) [ $? = 0 ] || error "dd write failed" writes=$(echo $writes | awk -F '+' '/out/ {print $1}') - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 sync zconf_umount `hostname` $MOUNT2 -f reads=$(LANG=C dd if=$DIR/$tfile of=/dev/null 2>&1) diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 5e499c7bd95d3fc34f207d70a6f642830db36abb..97728deab24ae51d879ad759af4b59f71a8aca7b 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -2,12 +2,12 @@ set -e -# bug number: 10124 -ALWAYS_EXCEPT="15c $REPLAY_DUAL_EXCEPT" +# bug number: 10124 16389 +ALWAYS_EXCEPT="15c 20 $REPLAY_DUAL_EXCEPT" SAVE_PWD=$PWD PTLDEBUG=${PTLDEBUG:--1} -LUSTRE=${LUSTRE:-`dirname $0`/..} +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} SETUP=${SETUP:-""} CLEANUP=${CLEANUP:-""} MOUNT_2=${MOUNT_2:-"yes"} @@ -23,6 +23,7 @@ init_test_env $@ build_test_filter cleanup_and_setup_lustre +assert_DIR rm -rf $DIR/[df][0-9]* [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE @@ -141,9 +142,9 @@ test_9() { mcreate $MOUNT1/$tfile-1 mcreate $MOUNT2/$tfile-2 # drop first reint reply - do_facet mds sysctl -w lustre.fail_loc=0x80000119 + do_facet mds lctl set_param fail_loc=0x80000119 fail mds - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 rm $MOUNT1/$tfile-[1,2] || return 1 @@ -157,9 +158,9 @@ test_10() { munlink $MOUNT1/$tfile-1 mcreate $MOUNT2/$tfile-2 # drop first reint reply - do_facet mds sysctl -w 
lustre.fail_loc=0x80000119 + do_facet mds lctl set_param fail_loc=0x80000119 fail mds - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 checkstat $MOUNT1/$tfile-1 && return 1 checkstat $MOUNT1/$tfile-2 || return 2 @@ -177,12 +178,12 @@ test_11() { mcreate $MOUNT2/$tfile-4 mcreate $MOUNT1/$tfile-5 # drop all reint replies for a while - do_facet mds sysctl -w lustre.fail_loc=0x0119 + do_facet mds lctl set_param fail_loc=0x0119 # note that with this fail_loc set, facet_failover df will fail facet_failover mds #sleep for while, let both clients reconnect and timeout sleep $((TIMEOUT * 2)) - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 client_df while [ -z "$(ls $MOUNT1/$tfile-[1-5] 2>/dev/null)" ]; do sleep 5 @@ -202,9 +203,9 @@ test_12() { MULTIPID=$! #define OBD_FAIL_LDLM_ENQUEUE 0x302 - do_facet mds sysctl -w lustre.fail_loc=0x80000302 + do_facet mds lctl set_param fail_loc=0x80000302 facet_failover mds - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 df $MOUNT || { kill -USR1 $MULTIPID && return 1; } ls $DIR/$tfile @@ -227,9 +228,9 @@ test_13() { wait $MULTIPID || return 4 # drop close - do_facet mds sysctl -w lustre.fail_loc=0x80000115 + do_facet mds lctl set_param fail_loc=0x80000115 facet_failover mds - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 df $MOUNT || return 1 ls $DIR/$tfile @@ -260,7 +261,7 @@ test_14() { } run_test 14 "timeouts waiting for lost client during replay" -test_15() { +test_15a() { # was test_15 replay_barrier mds createmany -o $MOUNT1/$tfile- 25 createmany -o $MOUNT2/$tfile-2- 1 @@ -275,72 +276,7 @@ test_15() { zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" return 0 } -run_test 15 "timeout waiting for lost client during replay, 1 client completes" - -test_15a() { - local ost_last_id="" - local osc_last_id="" - - replay_barrier mds - echo "data" > "$MOUNT2/${tfile}-m2" - - umount 
$MOUNT2 - facet_failover mds - df $MOUNT || return 1 - - ost_last_id=`cat /proc/fs/lustre/obdfilter/*/last_id` - mds_last_id=`cat /proc/fs/lustre/osc/*mds*/last_id` - - echo "Ids after MDS<->OST synchonizing" - echo "--------------------------------" - echo "MDS last_id:" - echo $mds_last_id - echo "OST last_id:" - echo $ost_last_id - - local i=0 - echo $ost_last_id | while read id; do - ost_ids[$i]=$id - ((i++)) - done - - i=0 - echo $mds_last_id | while read id; do - mds_ids[$i]=$id - ((i++)) - done - - local arr_len=${#mds_ids[*]} - for ((i=0;i<$arr_len;i++)); do - mds_id=${mds_ids[i]} - ost_id=${ost_ids[i]} - - test $mds_id -ge $ost_id || { - echo "MDS last id ($mds_id) is smaller than OST one ($ost_id)" - return 2 - } - done - - zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" - return 0 -} -#CROW run_test 15a "OST clear orphans - synchronize ids on MDS and OST" - -test_15b() { - replay_barrier mds - echo "data" > "$MOUNT2/${tfile}-m2" - umount $MOUNT2 - - do_facet ost1 "sysctl -w lustre.fail_loc=0x80000802" - facet_failover mds - - df $MOUNT || return 1 - do_facet ost1 "sysctl -w lustre.fail_loc=0" - - zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" - return 0 -} -#CROW run_test 15b "multiple delayed OST clear orphans" +run_test 15a "timeout waiting for lost client during replay, 1 client completes" test_15c() { replay_barrier mds @@ -409,10 +345,10 @@ test_18() { # bug 3822 - evicting client with enqueued lock statmany -s $MOUNT1/$tdir/f 1 500 & OPENPID=$! 
NOW=`date +%s` - do_facet mds sysctl -w lustre.fail_loc=0x8000030b # hold enqueue + do_facet mds lctl set_param fail_loc=0x8000030b # hold enqueue sleep 1 #define OBD_FAIL_LDLM_BL_CALLBACK 0x305 - do_facet client sysctl -w lustre.fail_loc=0x80000305 # drop cb, evict + do_facet client lctl set_param fail_loc=0x80000305 # drop cb, evict cancel_lru_locks mdc sleep 0.500s # wait to ensure first client is one that will be evicted openfile -f O_RDONLY $MOUNT2/$tdir/f0 @@ -433,6 +369,33 @@ test_19() { # Bug 10991 - resend of open request does not fail assertion. } run_test 19 "resend of open request" +test_20() { #16389 + BEFORE=`date +%s` + replay_barrier mds + touch $MOUNT1/a + touch $MOUNT2/b + umount $MOUNT2 + facet_failover mds + df $MOUNT1 || return 1 + rm $MOUNT1/a + zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" + TIER1=$((`date +%s` - BEFORE)) + BEFORE=`date +%s` + replay_barrier mds + touch $MOUNT1/a + touch $MOUNT2/b + umount $MOUNT2 + facet_failover mds + df $MOUNT1 || return 1 + rm $MOUNT1/a + zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" + TIER2=$((`date +%s` - BEFORE)) + [ $TIER2 -ge $((TIER1 * 2)) ] && \ + error "recovery time is growing $TIER2 > $TIER1" + return 0 +} +run_test 20 "recovery time is not increasing" + equals_msg `basename $0`: test complete, cleaning up SLEEP=$((`date +%s` - $NOW)) [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index 8f5765e9c361d0466d1bdd217d9701c8cc159ff5..cdcd690ad43c4e10f7c7d1ea6878d738f22361a1 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -3,7 +3,7 @@ set -e PTLDEBUG=${PTLDEBUG:--1} -LUSTRE=${LUSTRE:-`dirname $0`/..} +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} SETUP=${SETUP:-""} CLEANUP=${CLEANUP:-""} . 
$LUSTRE/tests/test-framework.sh @@ -22,7 +22,7 @@ CPU=`awk '/model/ {print $4}' /proc/cpuinfo` ALWAYS_EXCEPT="$REPLAY_OST_SINGLE_EXCEPT" # -[ "$SLOW" = "no" ] && EXCEPT_SLOW="" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="5" # It is replay-ost-single, after all OSTCOUNT=1 @@ -30,13 +30,14 @@ OSTCOUNT=1 build_test_filter REFORMAT=--reformat cleanup_and_setup_lustre +assert_DIR rm -rf $DIR/[df][0-9]* test_0a() { zconf_umount `hostname` $MOUNT -f # needs to run during initial client->OST connection #define OBD_FAIL_OST_ALL_REPLY_NET 0x211 - do_facet ost "sysctl -w lustre.fail_loc=0x80000211" + do_facet ost "lctl set_param fail_loc=0x80000211" zconf_mount `hostname` $MOUNT && df $MOUNT || error "0a mount fail" } run_test 0a "target handle mismatch (bug 5317) `date +%H:%M:%S`" @@ -96,11 +97,12 @@ test_4() { run_test 4 "Fail OST during read, with verification" test_5() { - [ -z "`which iozone 2> /dev/null`" ] && log "iozone missing" && return - FREE=`df -P -h $DIR | tail -n 1 | awk '{ print $3 }'` - case $FREE in - *T|*G) FREE=1G;; - esac + [ -z "`which iozone 2> /dev/null`" ] && skip "iozone missing" && return 0 + FREE=`df -P $DIR | tail -n 1 | awk '{ print $4/2 }'` + GB=1048576 # 1048576KB == 1GB + if (( FREE > GB )); then + FREE=$GB + fi IOZONE_OPTS="-i 0 -i 1 -i 2 -+d -r 4 -s $FREE" iozone $IOZONE_OPTS -f $DIR/$tfile & PID=$! 
@@ -130,7 +132,7 @@ test_6() { sleep 2 # ensure we have a fresh statfs sync #define OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds "sysctl -w lustre.fail_loc=0x80000119" + do_facet mds "lctl set_param fail_loc=0x80000119" after_dd=`kbytesfree` log "before: $before after_dd: $after_dd" (( $before > $after_dd )) || return 1 diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 1ce5c2cb1646e495aa622bd1cfada18d065aad20..fd74e0167a2e3a5725382f7ab4cdea32b1a62729 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -7,7 +7,7 @@ set -e # This test needs to be run on the client # SAVE_PWD=$PWD -LUSTRE=${LUSTRE:-`dirname $0`/..} +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} SETUP=${SETUP:-} CLEANUP=${CLEANUP:-} . $LUSTRE/tests/test-framework.sh @@ -17,11 +17,11 @@ CHECK_GRANT=${CHECK_GRANT:-"yes"} GRANT_CHECK_LIST=${GRANT_CHECK_LIST:-""} # Skip these tests -# bug number: +# bug number: ALWAYS_EXCEPT="$REPLAY_SINGLE_EXCEPT" -# 63 min 7 min AT AT AT AT" -[ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 6b 12 16 44 44b 65 66 67 68" +# 63 min 7 min AT AT AT AT" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="1 2 3 4 6 12 16 44a 44b 65 66 67 68" build_test_filter @@ -29,13 +29,14 @@ cleanup_and_setup_lustre mkdir -p $DIR +assert_DIR rm -rf $DIR/[df][0-9]* -test_0() { +test_0a() { # was test_0 replay_barrier mds fail mds } -run_test 0 "empty replay" +run_test 0a "empty replay" test_0b() { # this test attempts to trigger a race in the precreation code, @@ -55,50 +56,6 @@ test_1() { } run_test 1 "simple create" -test_1a() { - do_facet ost1 "sysctl -w lustre.fail_loc=0" - - rm -fr $DIR/$tfile - local old_last_id=`cat $LPROC/obdfilter/*/last_id` - touch -o $DIR/$tfile 1 - sync - local new_last_id=`cat $LPROC/obdfilter/*/last_id` - - test "$old_last_id" = "$new_last_id" || { - echo "OST object create is caused by MDS" - return 1 - } - - old_last_id=`cat $LPROC/obdfilter/*/last_id` - echo "data" > $DIR/$tfile - sync - new_last_id=`cat 
$LPROC/obdfilter/*/last_id` - test "$old_last_id" = "$new_last_id "&& { - echo "CROW does not work on write" - return 1 - } - - rm -fr $DIR/$tfile - -#define OBD_FAIL_OST_CROW_EIO | OBD_FAIL_ONCE - do_facet ost1 "sysctl -w lustre.fail_loc=0x80000801" - - rm -fr $DIR/1a1 - old_last_id=`cat $LPROC/obdfilter/*/last_id` - echo "data" > $DIR/1a1 - sync - new_last_id=`cat $LPROC/obdfilter/*/last_id` - test "$old_last_id" = "$new_last_id" || { - echo "CROW does work with fail_loc=0x80000801" - return 1 - } - - rm -fr $DIR/1a1 - - do_facet ost1 "sysctl -w lustre.fail_loc=0" -} -#CROW run_test 1a "CROW object create (check OST last_id)" - test_2a() { replay_barrier mds touch $DIR/$tfile @@ -131,9 +88,9 @@ run_test 3a "replay failed open(O_DIRECTORY)" test_3b() { replay_barrier mds #define OBD_FAIL_MDS_OPEN_PACK | OBD_FAIL_ONCE - do_facet mds "sysctl -w lustre.fail_loc=0x80000114" + do_facet mds "lctl set_param fail_loc=0x80000114" touch $DIR/$tfile - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" fail mds $CHECKSTAT -t file $DIR/$tfile && return 2 return 0 @@ -143,9 +100,9 @@ run_test 3b "replay failed open -ENOMEM" test_3c() { replay_barrier mds #define OBD_FAIL_MDS_ALLOC_OBDO | OBD_FAIL_ONCE - do_facet mds "sysctl -w lustre.fail_loc=0x80000128" + do_facet mds "lctl set_param fail_loc=0x80000128" touch $DIR/$tfile - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" fail mds $CHECKSTAT -t file $DIR/$tfile && return 2 @@ -153,7 +110,7 @@ test_3c() { } run_test 3c "replay failed open -ENOMEM" -test_4() { +test_4a() { # was test_4 replay_barrier mds for i in `seq 10`; do echo "tag-$i" > $DIR/$tfile-$i @@ -163,7 +120,7 @@ test_4() { grep -q "tag-$i" $DIR/$tfile-$i || error "$tfile-$i" done } -run_test 4 "|x| 10 open(O_CREAT)s" +run_test 4a "|x| 10 open(O_CREAT)s" test_4b() { replay_barrier mds @@ -191,7 +148,8 @@ test_5() { run_test 5 "|x| 220 open(O_CREAT)" -test_6() { +test_6a() { # was test_6 + mkdir 
-p $DIR/$tdir replay_barrier mds mcreate $DIR/$tdir/$tfile fail mds @@ -200,9 +158,10 @@ test_6() { sleep 2 # waiting for log process thread } -run_test 6 "mkdir + contained create" +run_test 6a "mkdir + contained create" test_6b() { + mkdir -p $DIR/$tdir replay_barrier mds rm -rf $DIR/$tdir fail mds @@ -211,6 +170,7 @@ test_6b() { run_test 6b "|X| rmdir" test_7() { + mkdir -p $DIR/$tdir replay_barrier mds mcreate $DIR/$tdir/$tfile fail mds @@ -401,7 +361,7 @@ test_19() { } run_test 19 "|X| mcreate, open, write, rename " -test_20() { +test_20a() { # was test_20 replay_barrier mds multiop_bg_pause $DIR/$tfile O_tSc || return 3 pid=$! @@ -413,7 +373,7 @@ test_20() { [ -e $DIR/$tfile ] && return 2 return 0 } -run_test 20 "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)" +run_test 20a "|X| open(O_CREAT), unlink, replay, close (test mds_cleanup_orphans)" test_20b() { # bug 10480 BEFOREUSED=`df -P $DIR | tail -1 | awk '{ print $3 }'` @@ -673,7 +633,7 @@ test_32() { run_test 32 "close() notices client eviction; close() after client eviction" # Abort recovery before client complete -test_33() { +test_33a() { # was test_33 replay_barrier mds createmany -o $DIR/$tfile-%d 100 fail_abort mds @@ -682,7 +642,7 @@ test_33() { unlinkmany $DIR/$tfile-%d 0 100 return 0 } -run_test 33 "abort recovery before client does replay" +run_test 33a "abort recovery before client does replay" test_34() { multiop_bg_pause $DIR/$tfile O_c || return 2 @@ -703,7 +663,7 @@ test_35() { touch $DIR/$tfile #define OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds "sysctl -w lustre.fail_loc=0x80000119" + do_facet mds "lctl set_param fail_loc=0x80000119" rm -f $DIR/$tfile & sleep 1 sync @@ -772,7 +732,7 @@ test_39() { # bug 4176 run_test 39 "test recovery from unlink llog (test llog_gen_rec) " count_ost_writes() { - awk -vwrites=0 '/ost_write/ { writes += $2 } END { print writes; }' $LPROC/osc/*/stats + lctl get_param -n osc.*.stats | awk -vwrites=0 '/ost_write/ { writes += $2 } END { 
print writes; }' } #b=2477,2532 @@ -785,7 +745,7 @@ test_40(){ sleep 1 facet_failover mds #define OBD_FAIL_MDS_CONNECT_NET 0x117 - do_facet mds "sysctl -w lustre.fail_loc=0x80000117" + do_facet mds "lctl set_param fail_loc=0x80000117" kill -USR1 $PID stat1=`count_ost_writes` sleep $TIMEOUT @@ -829,7 +789,7 @@ test_41() { do_facet client dd if=/dev/zero of=$f bs=4k count=1 || return 3 cancel_lru_locks osc # fail ost2 and read from ost1 - local osc2dev=`grep ${ost2_svc}-osc- $LPROC/devices | awk '{print $1}'` + local osc2dev=`lctl get_param -n devices | grep ${ost2_svc}-osc- | awk '{print $1}'` [ "$osc2dev" ] || return 4 $LCTL --device $osc2dev deactivate || return 1 do_facet client dd if=$f of=/dev/null bs=4k count=1 || return 3 @@ -845,7 +805,7 @@ test_42() { replay_barrier ost1 unlinkmany $DIR/$tfile-%d 0 400 debugsave - sysctl -w lnet.debug=-1 + lctl set_param debug=-1 facet_failover ost1 # osc is evicted, fs is smaller (but only with failout OSTs (bug 7287) @@ -864,19 +824,19 @@ test_43() { # bug 2530 replay_barrier mds # OBD_FAIL_OST_CREATE_NET 0x204 - do_facet ost1 "sysctl -w lustre.fail_loc=0x80000204" + do_facet ost1 "lctl set_param fail_loc=0x80000204" fail mds sleep 10 - do_facet ost1 "sysctl -w lustre.fail_loc=0" + do_facet ost1 "lctl set_param fail_loc=0" return 0 } run_test 43 "mds osc import failure during recovery; don't LBUG" -test_44() { +test_44a() { # was test_44 local at_max_saved=0 - mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices` + mdcdev=`lctl get_param -n devices | awk '/-mdc-/ {print $1}'` [ "$mdcdev" ] || exit 2 # adaptive timeouts slow this way down @@ -887,38 +847,38 @@ test_44() { for i in `seq 1 10`; do echo "$i of 10 ($(date +%s))" - do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts" + do_facet mds "lctl get_param -n mdt.MDS.mds.timeouts | grep service" #define OBD_FAIL_TGT_CONN_RACE 0x701 - do_facet mds "sysctl -w lustre.fail_loc=0x80000701" + do_facet mds "lctl set_param fail_loc=0x80000701" $LCTL --device $mdcdev recover df 
$MOUNT done - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" [ $at_max_saved -ne 0 ] && at_max_set $at_max_saved mds return 0 } -run_test 44 "race in target handle connect" +run_test 44a "race in target handle connect" test_44b() { - mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices` + mdcdev=`lctl get_param -n devices | awk '/-mdc-/ {print $1}'` [ "$mdcdev" ] || exit 2 for i in `seq 1 10`; do echo "$i of 10 ($(date +%s))" - do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts" + do_facet mds "lctl get_param -n mdt.MDS.mds.timeouts | grep service" #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 - do_facet mds "sysctl -w lustre.fail_loc=0x80000704" + do_facet mds "lctl set_param fail_loc=0x80000704" $LCTL --device $mdcdev recover df $MOUNT done - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" return 0 } run_test 44b "race in target handle connect" # Handle failed close test_45() { - mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices` + mdcdev=`lctl get_param -n devices | awk '/-mdc-/ {print $1}'` [ "$mdcdev" ] || exit 2 $LCTL --device $mdcdev recover @@ -959,7 +919,7 @@ test_47() { # bug 2824 # OBD_FAIL_OST_CREATE_NET 0x204 fail ost1 - do_facet ost1 "sysctl -w lustre.fail_loc=0x80000204" + do_facet ost1 "lctl set_param fail_loc=0x80000204" df $MOUNT || return 2 # let the MDS discover the OST failure, attempt to recover, fail @@ -970,7 +930,7 @@ test_47() { # bug 2824 createmany -o $DIR/$tfile 20 || return 3 unlinkmany $DIR/$tfile 20 || return 4 - do_facet ost1 "sysctl -w lustre.fail_loc=0" + do_facet ost1 "lctl set_param fail_loc=0" return 0 } run_test 47 "MDS->OSC failure during precreate cleanup (2824)" @@ -980,19 +940,19 @@ test_48() { createmany -o $DIR/$tfile 20 || return 1 # OBD_FAIL_OST_EROFS 0x216 fail mds - do_facet ost1 "sysctl -w lustre.fail_loc=0x80000216" + do_facet ost1 "lctl set_param fail_loc=0x80000216" df $MOUNT || return 2 createmany -o $DIR/$tfile 20 20 || return 2 unlinkmany 
$DIR/$tfile 40 || return 3 - do_facet ost1 "sysctl -w lustre.fail_loc=0" + do_facet ost1 "lctl set_param fail_loc=0" return 0 } run_test 48 "MDS->OSC failure during precreate cleanup (2824)" test_50() { - local oscdev=`do_facet mds grep \'${ost1_svc}-osc \' $LPROC/devices | awk '{print $1}' | head -1` + local oscdev=`do_facet mds lctl get_param -n devices | grep ${ost1_svc}-osc | awk '{print $1}' | head -1` [ "$oscdev" ] || return 1 do_facet mds $LCTL --device $oscdev recover || return 2 do_facet mds $LCTL --device $oscdev recover || return 3 @@ -1009,9 +969,9 @@ test_52() { multiop $DIR/$tfile s || return 1 replay_barrier mds #define OBD_FAIL_LDLM_REPLY 0x30c - do_facet mds "sysctl -w lustre.fail_loc=0x8000030c" + do_facet mds "lctl set_param fail_loc=0x8000030c" fail mds || return 2 - do_facet mds "sysctl -w lustre.fail_loc=0x0" + do_facet mds "lctl set_param fail_loc=0x0" $CHECKSTAT -t file $DIR/$tfile-* && return 3 || true } @@ -1027,10 +987,10 @@ test_53a() { sleep 1 #define OBD_FAIL_MDS_CLOSE_NET 0x115 - do_facet mds "sysctl -w lustre.fail_loc=0x80000115" + do_facet mds "lctl set_param fail_loc=0x80000115" kill -USR1 $close_pid cancel_lru_locks MDC # force the close - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" mcreate $DIR/${tdir}-2/f || return 1 # close should still be here @@ -1052,12 +1012,12 @@ test_53b() { close_pid=$! #define OBD_FAIL_MDS_REINT_NET 0x107 - do_facet mds "sysctl -w lustre.fail_loc=0x80000107" + do_facet mds "lctl set_param fail_loc=0x80000107" mcreate $DIR/${tdir}-2/f & open_pid=$! sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" kill -USR1 $close_pid cancel_lru_locks MDC # force the close wait $close_pid || return 1 @@ -1080,12 +1040,12 @@ test_53c() { multiop $DIR/${tdir}-1/f O_c & close_pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000107" + do_facet mds "lctl set_param fail_loc=0x80000107" mcreate $DIR/${tdir}-2/f & open_pid=$! 
sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0x80000115" + do_facet mds "lctl set_param fail_loc=0x80000115" kill -USR1 $close_pid cancel_lru_locks MDC # force the close @@ -1095,7 +1055,7 @@ test_53c() { sleep 2 # close should be gone [ -d /proc/$close_pid ] && return 2 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4 @@ -1112,10 +1072,10 @@ test_53d() { sleep 1 # define OBD_FAIL_MDS_CLOSE_NET_REP 0X138 - do_facet mds "sysctl -w lustre.fail_loc=0x8000013b" + do_facet mds "lctl set_param fail_loc=0x8000013b" kill -USR1 $close_pid cancel_lru_locks MDC # force the close - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" mcreate $DIR/${tdir}-2/f || return 1 # close should still be here @@ -1137,12 +1097,12 @@ test_53e() { close_pid=$! #define OBD_FAIL_MDS_REINT_NET_REP 0x119 - do_facet mds "sysctl -w lustre.fail_loc=0x80000119" + do_facet mds "lctl set_param fail_loc=0x80000119" mcreate $DIR/${tdir}-2/f & open_pid=$! sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" kill -USR1 $close_pid cancel_lru_locks MDC # force the close wait $close_pid || return 1 @@ -1165,12 +1125,12 @@ test_53f() { multiop $DIR/${tdir}-1/f O_c & close_pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000119" + do_facet mds "lctl set_param fail_loc=0x80000119" mcreate $DIR/${tdir}-2/f & open_pid=$! 
sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0x8000013b" + do_facet mds "lctl set_param fail_loc=0x8000013b" kill -USR1 $close_pid cancel_lru_locks MDC @@ -1180,7 +1140,7 @@ test_53f() { sleep 2 #close should be gone [ -d /proc/$close_pid ] && return 2 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4 @@ -1194,16 +1154,16 @@ test_53g() { multiop $DIR/${tdir}-1/f O_c & close_pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000119" + do_facet mds "lctl set_param fail_loc=0x80000119" mcreate $DIR/${tdir}-2/f & open_pid=$! sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0x80000115" + do_facet mds "lctl set_param fail_loc=0x80000115" kill -USR1 $close_pid cancel_lru_locks MDC # force the close - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" replay_barrier_nodf mds fail_nodf mds wait $open_pid || return 1 @@ -1223,12 +1183,12 @@ test_53h() { multiop $DIR/${tdir}-1/f O_c & close_pid=$! - do_facet mds "sysctl -w lustre.fail_loc=0x80000107" + do_facet mds "lctl set_param fail_loc=0x80000107" mcreate $DIR/${tdir}-2/f & open_pid=$! 
sleep 1 - do_facet mds "sysctl -w lustre.fail_loc=0x8000013b" + do_facet mds "lctl set_param fail_loc=0x8000013b" kill -USR1 $close_pid cancel_lru_locks MDC # force the close sleep 1 @@ -1239,7 +1199,7 @@ test_53h() { sleep 2 # close should be gone [ -d /proc/$close_pid ] && return 2 - do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "lctl set_param fail_loc=0" $CHECKSTAT -t file $DIR/${tdir}-1/f || return 3 $CHECKSTAT -t file $DIR/${tdir}-2/f || return 4 @@ -1250,11 +1210,11 @@ run_test 53h "|X| open request and close reply while two MDC requests in flight" #b3761 ASSERTION(hash != 0) failed test_55() { # OBD_FAIL_MDS_OPEN_CREATE | OBD_FAIL_ONCE - do_facet mds "sysctl -w lustre.fail_loc=0x8000012b" + do_facet mds "lctl set_param fail_loc=0x8000012b" touch $DIR/$tfile & # give touch a chance to run sleep 5 - do_facet mds "sysctl -w lustre.fail_loc=0x0" + do_facet mds "lctl set_param fail_loc=0x0" rm $DIR/$tfile return 0 } @@ -1273,27 +1233,28 @@ run_test 56 "don't replay a symlink open request (3440)" #recovery one mds-ost setattr from llog test_57() { #define OBD_FAIL_MDS_OST_SETATTR 0x12c - do_facet mds "sysctl -w lustre.fail_loc=0x8000012c" + do_facet mds "lctl set_param fail_loc=0x8000012c" touch $DIR/$tfile replay_barrier mds fail mds sleep 1 $CHECKSTAT -t file $DIR/$tfile || return 1 - do_facet mds "sysctl -w lustre.fail_loc=0x0" + do_facet mds "lctl set_param fail_loc=0x0" rm $DIR/$tfile } run_test 57 "test recovery from llog for setattr op" #recovery many mds-ost setattr from llog test_58() { + mkdir -p $DIR/$tdir #define OBD_FAIL_MDS_OST_SETATTR 0x12c - do_facet mds "sysctl -w lustre.fail_loc=0x8000012c" + do_facet mds "lctl set_param fail_loc=0x8000012c" createmany -o $DIR/$tdir/$tfile-%d 2500 replay_barrier mds fail mds sleep 2 $CHECKSTAT -t file $DIR/$tdir/$tfile-* >/dev/null || return 1 - do_facet mds "sysctl -w lustre.fail_loc=0x0" + do_facet mds "lctl set_param fail_loc=0x0" unlinkmany $DIR/$tdir/$tfile-%d 2500 rmdir $DIR/$tdir } @@ -1302,14 
+1263,15 @@ run_test 58 "test recovery from llog for setattr op (test llog_gen_rec)" # log_commit_thread vs filter_destroy race used to lead to import use after free # bug 11658 test_59() { + mkdir -p $DIR/$tdir createmany -o $DIR/$tdir/$tfile-%d 200 sync unlinkmany $DIR/$tdir/$tfile-%d 200 #define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 - do_facet ost1 "sysctl -w lustre.fail_loc=0x507" + do_facet ost1 "lctl set_param fail_loc=0x507" fail ost1 fail mds - do_facet ost1 "sysctl -w lustre.fail_loc=0x0" + do_facet ost1 "lctl set_param fail_loc=0x0" sleep 20 rmdir $DIR/$tdir } @@ -1318,6 +1280,7 @@ run_test 59 "test log_commit_thread vs filter_destroy race" # race between add unlink llog vs cat log init in post_recovery (only for b1_6) # bug 12086: should no oops and No ctxt error for this test test_60() { + mkdir -p $DIR/$tdir createmany -o $DIR/$tdir/$tfile-%d 200 replay_barrier mds unlinkmany $DIR/$tdir/$tfile-%d 0 100 @@ -1330,16 +1293,17 @@ run_test 60 "test llog post recovery init vs llog unlink" #test race llog recovery thread vs llog cleanup test_61a() { + mkdir -p $DIR/$tdir createmany -o $DIR/$tdir/$tfile-%d 800 replay_barrier ost1 # OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 unlinkmany $DIR/$tdir/$tfile-%d 800 - do_facet ost "sysctl -w lustre.fail_loc=0x80000221" + do_facet ost "lctl set_param fail_loc=0x80000221" facet_failover ost1 sleep 10 fail ost1 sleep 30 - do_facet ost "sysctl -w lustre.fail_loc=0x0" + do_facet ost "lctl set_param fail_loc=0x0" $CHECKSTAT -t file $DIR/$tdir/$tfile-* && return 1 rmdir $DIR/$tdir } @@ -1348,7 +1312,7 @@ run_test 61a "test race llog recovery vs llog cleanup" #test race mds llog sync vs llog cleanup test_61b() { # OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a - do_facet mds "sysctl -w lustre.fail_loc=0x8000013a" + do_facet mds "lctl set_param fail_loc=0x8000013a" facet_failover mds sleep 10 fail mds @@ -1360,19 +1324,46 @@ run_test 61b "test race mds llog sync vs llog cleanup" test_61c() { # OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 touch 
$DIR/$tfile - do_facet ost "sysctl -w lustre.fail_loc=0x80000222" + do_facet ost "lctl set_param fail_loc=0x80000222" rm $DIR/$tfile sleep 10 fail ost1 } run_test 61c "test race mds llog sync vs llog cleanup" +test_61d() { # bug 16002 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 + stop mds + do_facet mds "lctl set_param fail_loc=0x80000605" + start mds $MDSDEV $MDS_MOUNT_OPTS && error "mds start should have failed" + do_facet mds "lctl set_param fail_loc=0" + start mds $MDSDEV $MDS_MOUNT_OPTS || error "cannot restart mds" +} +run_test 61d "error in llog_setup should cleanup the llog context correctly" + +test_62() { # Bug 15756 - don't mis-drop resent replay + mkdir -p $DIR/$tdir + replay_barrier mds + createmany -o $DIR/$tdir/$tfile- 25 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 + do_facet mds "lctl set_param fail_loc=0x80000707" + facet_failover mds + df $MOUNT || return 1 + do_facet mds "lctl set_param fail_loc=0" + unlinkmany $DIR/$tdir/$tfile- 25 || return 2 + return 0 +} +run_test 62 "don't mis-drop resent replay" + #Adaptive Timeouts (bug 3055) AT_MAX_SET=0 at_start() { - at_is_valid || skip "AT env is invalid" + if ! at_is_valid; then + skip "AT env is invalid" + return 1 + fi if ! 
at_is_enabled; then echo "AT is disabled, enable it by force temporarily" @@ -1395,19 +1386,19 @@ test_65a() #bug 3055 at_start || return 0 $LCTL dk > /dev/null debugsave - sysctl -w lnet.debug="+other" + lctl set_param debug="+other" # slow down a request - do_facet mds sysctl -w lustre.fail_val=30000 + do_facet mds lctl set_param fail_val=30000 #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a - do_facet mds sysctl -w lustre.fail_loc=0x8000050a + do_facet mds lctl set_param fail_loc=0x8000050a createmany -o $DIR/$tfile 10 > /dev/null unlinkmany $DIR/$tfile 10 > /dev/null # check for log message $LCTL dk | grep "Early reply #" || error "No early reply" # client should show 30s estimates - grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal sleep 9 - grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal } run_test 65a "AT: verify early replies" @@ -1416,50 +1407,50 @@ test_65b() #bug 3055 at_start || return 0 # turn on D_ADAPTTO debugsave - sysctl -w lnet.debug="+other" + lctl set_param debug="+other" $LCTL dk > /dev/null # slow down bulk i/o - do_facet ost1 sysctl -w lustre.fail_val=30 + do_facet ost1 lctl set_param fail_val=30 #define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 - do_facet ost1 sysctl -w lustre.fail_loc=0x224 + do_facet ost1 lctl set_param fail_loc=0x224 rm -f $DIR/$tfile lfs setstripe $DIR/$tfile --index=0 --count=1 # force some real bulk transfer multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c - do_facet ost1 sysctl -w lustre.fail_loc=0 + do_facet ost1 lctl set_param fail_loc=0 # check for log message $LCTL dk | grep "Early reply #" || error "No early reply" debugrestore # client should show 30s estimates - grep portal $LPROC/osc/${FSNAME}-OST0000-osc-*/timeouts + lctl get_param -n osc.${FSNAME}-OST0000-osc-*.timeouts | grep portal } run_test 65b "AT: verify early replies on packed reply / bulk" test_66a() #bug 3055 { 
at_start || return 0 - grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12" # adjust 5s at a time so no early reply is sent (within deadline) - do_facet mds "sysctl -w lustre.fail_val=5000" + do_facet mds "lctl set_param fail_val=5000" #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a - do_facet mds "sysctl -w lustre.fail_loc=0x8000050a" + do_facet mds "lctl set_param fail_loc=0x8000050a" createmany -o $DIR/$tfile 20 > /dev/null unlinkmany $DIR/$tfile 20 > /dev/null - grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts - do_facet mds "sysctl -w lustre.fail_val=10000" - do_facet mds "sysctl -w lustre.fail_loc=0x8000050a" + lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12" + do_facet mds "lctl set_param fail_val=10000" + do_facet mds "lctl set_param fail_loc=0x8000050a" createmany -o $DIR/$tfile 20 > /dev/null unlinkmany $DIR/$tfile 20 > /dev/null - grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts - do_facet mds "sysctl -w lustre.fail_loc=0" + lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep "portal 12" + do_facet mds "lctl set_param fail_loc=0" sleep 9 createmany -o $DIR/$tfile 20 > /dev/null unlinkmany $DIR/$tfile 20 > /dev/null - grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts | grep "portal 12" - CUR=$(awk '/portal 12/ {print $5}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts) - WORST=$(awk '/portal 12/ {print $7}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts) + lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | grep portal | grep "portal 12" + CUR=$(lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | awk '/portal 12/ {print $5}') + WORST=$(lctl get_param -n mdc.${FSNAME}-MDT0000-mdc-*.timeouts | awk '/portal 12/ {print $7}') echo "Current MDT timeout $CUR, worst $WORST" [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST" } @@ -1468,14 +1459,14 @@ run_test 66a "AT: verify MDT service time 
adjusts with no early replies" test_66b() #bug 3055 { at_start || return 0 - ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts) - sysctl -w lustre.fail_val=$(($ORIG + 5)) + ORIG=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $4}') + lctl set_param fail_val=$(($ORIG + 5)) #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c - sysctl -w lustre.fail_loc=0x50c + lctl set_param fail_loc=0x50c ls $DIR/$tfile > /dev/null 2>&1 - sysctl -w lustre.fail_loc=0 - CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts) - WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts) + lctl set_param fail_loc=0 + CUR=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $4}') + WORST=$(lctl get_param -n mdc.${FSNAME}-*.timeouts | awk '/network/ {print $6}') echo "network timeout orig $ORIG, cur $CUR, worst $WORST" [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG" } @@ -1484,15 +1475,15 @@ run_test 66b "AT: verify net latency adjusts" test_67a() #bug 3055 { at_start || return 0 - CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') # sleeping threads may drive values above this - do_facet ost1 "sysctl -w lustre.fail_val=400" + do_facet ost1 "lctl set_param fail_val=400" #define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a - do_facet ost1 "sysctl -w lustre.fail_loc=0x50a" + do_facet ost1 "lctl set_param fail_loc=0x50a" createmany -o $DIR/$tfile 20 > /dev/null unlinkmany $DIR/$tfile 20 > /dev/null - do_facet ost1 "sysctl -w lustre.fail_loc=0" - CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + do_facet ost1 "lctl set_param fail_loc=0" + CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN2 - $CONN1)) echo "$ATTEMPTS osc reconnect attemps on gradual slow" [ $ATTEMPTS -gt 0 ] && error_ignore 13721 "AT should have prevented 
reconnect" @@ -1503,24 +1494,24 @@ run_test 67a "AT: verify slow request processing doesn't induce reconnects" test_67b() #bug 3055 { at_start || return 0 - CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + CONN1=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') #define OBD_FAIL_OST_PAUSE_CREATE 0x223 - do_facet ost1 "sysctl -w lustre.fail_val=20000" - do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223" + do_facet ost1 "lctl set_param fail_val=20000" + do_facet ost1 "lctl set_param fail_loc=0x80000223" cp /etc/profile $DIR/$tfile || error "cp failed" client_reconnect - cat $LPROC/ost/OSS/ost_create/timeouts + lctl get_param -n ost.OSS.ost_create.timeouts log "phase 2" - CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + CONN2=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN2 - $CONN1)) echo "$ATTEMPTS osc reconnect attemps on instant slow" # do it again; should not timeout - do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223" + do_facet ost1 "lctl set_param fail_loc=0x80000223" cp /etc/profile $DIR/$tfile || error "cp failed" - do_facet ost1 "sysctl -w lustre.fail_loc=0" + do_facet ost1 "lctl set_param fail_loc=0" client_reconnect - cat $LPROC/ost/OSS/ost_create/timeouts - CONN3=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + lctl get_param -n ost.OSS.ost_create.timeouts + CONN3=$(lctl get_param -n osc.*.stats | awk '/_connect/ {total+=$2} END {print total}') ATTEMPTS=$(($CONN3 - $CONN2)) echo "$ATTEMPTS osc reconnect attemps on 2nd slow" [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect" @@ -1538,13 +1529,13 @@ test_68 () #bug 13813 rm -f $DIR/${tfile}_[1-2] lfs setstripe $DIR/$tfile --index=0 --count=1 #define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 - sysctl -w lustre.fail_val=$(($TIMEOUT - 1)) - sysctl -w lustre.fail_loc=0x80000312 + lctl set_param fail_val=$(($TIMEOUT - 1)) + lctl set_param
fail_loc=0x80000312 cp /etc/profile $DIR/${tfile}_1 || error "1st cp failed $?" - sysctl -w lustre.fail_val=$((TIMEOUT * 3 / 2)) - sysctl -w lustre.fail_loc=0x80000312 + lctl set_param fail_val=$((TIMEOUT * 3 / 2)) + lctl set_param fail_loc=0x80000312 cp /etc/profile $DIR/${tfile}_2 || error "2nd cp failed $?" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 echo $ENQ_MIN >> $ldlm_enqueue_min return 0 } @@ -1563,6 +1554,76 @@ fi # end of AT tests includes above lines +# start multi-client tests +test_70a () { + [ -z "$CLIENTS" ] && \ + { skip "Need two or more clients." && return; } + [ $CLIENTCOUNT -lt 2 ] && \ + { skip "Need two or more clients, have $CLIENTCOUNT" && return; } + + echo "mount clients $CLIENTS ..." + zconf_mount_clients $CLIENTS $DIR + + local clients=${CLIENTS//,/ } + echo "Write/read files on $DIR ; clients $CLIENTS ... " + for CLIENT in $clients; do + do_node $CLIENT dd bs=1M count=10 if=/dev/zero \ + of=$DIR/${tfile}_${CLIENT} 2>/dev/null || \ + error "dd failed on $CLIENT" + done + + local prev_client=$(echo $clients | sed 's/^.* \(\w\+\)$/\1/') + for C in ${CLIENTS//,/ }; do + do_node $prev_client dd if=$DIR/${tfile}_${C} of=/dev/null 2>/dev/null || \ + error "dd if=$DIR/${tfile}_${C} failed on $prev_client" + prev_client=$C + done + + ls $DIR + + zconf_umount_clients $CLIENTS $DIR +} +run_test 70a "check multi client t-f" + +test_70b () { + [ -z "$CLIENTS" ] && \ + { skip "Need two or more clients." && return; } + [ $CLIENTCOUNT -lt 2 ] && \ + { skip "Need two or more clients, have $CLIENTCOUNT" && return; } + + zconf_mount_clients $CLIENTS $DIR + + local duration="-t 60" + local cmd="rundbench 1 $duration " + local PID="" + for CLIENT in ${CLIENTS//,/ }; do + $PDSH $CLIENT "set -x; PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests/:${DBENCH_LIB} DBENCH_LIB=${DBENCH_LIB} $cmd" & + PID=$! 
+ echo $PID >pid.$CLIENT + echo "Started load PID=`cat pid.$CLIENT`" + done + + replay_barrier mds + sleep 3 # give clients a time to do operations + + log "$TESTNAME fail mds 1" + fail mds + +# wait for client to reconnect to MDS + sleep $TIMEOUT + + for CLIENT in ${CLIENTS//,/ }; do + PID=`cat pid.$CLIENT` + wait $PID + rc=$? + echo "load on ${CLIENT} returned $rc" + done + + zconf_umount_clients $CLIENTS $DIR +} +run_test 70b "mds recovery; $CLIENTCOUNT clients" +# end multi-client tests + equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh index 0ba4bf52ff5017c86434e45814b7c8708bb9a0f4..3e98d2a01c9cf1392b20af2e854cb850bb6031e0 100644 --- a/lustre/tests/sanity-quota.sh +++ b/lustre/tests/sanity-quota.sh @@ -17,7 +17,7 @@ SRCDIR=`dirname $0` export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/../utils:$PATH:/sbin ONLY=${ONLY:-"$*"} -ALWAYS_EXCEPT="$SANITY_QUOTA_EXCEPT" +ALWAYS_EXCEPT="10 $SANITY_QUOTA_EXCEPT" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! case `uname -r` in @@ -47,7 +47,7 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} init_test_env $@ . 
${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} -[ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11 21" QUOTALOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log} @@ -58,11 +58,11 @@ DIR2=${DIR2:-$MOUNT2} cleanup_and_setup_lustre -LOVNAME=`cat $LPROC/llite/*/lov/common_name | tail -n 1` -OSTCOUNT=`cat $LPROC/lov/$LOVNAME/numobd` +LOVNAME=`lctl get_param -n llite.*.lov.common_name | tail -n 1` +OSTCOUNT=`lctl get_param -n lov.$LOVNAME.numobd` -SHOW_QUOTA_USER="$LFS quota -u $TSTUSR $DIR" -SHOW_QUOTA_GROUP="$LFS quota -g $TSTUSR $DIR" +SHOW_QUOTA_USER="$LFS quota -v -u $TSTUSR $DIR" +SHOW_QUOTA_GROUP="$LFS quota -v -g $TSTUSR $DIR" SHOW_QUOTA_INFO="$LFS quota -t $DIR" # control the time of tests @@ -76,50 +76,38 @@ eval ONLY_99=true # set_blk_tunables(btune_sz) set_blk_tunesz() { - local i + local btune=$(($1 * BLK_SZ)) # set btune size on all obdfilters - do_facet ost1 "set -x; for i in /proc/fs/lustre/obdfilter/*/quota_btune_sz; do - echo $(($1 * BLK_SZ)) >> \\\$i; - done" + do_facet ost1 "lctl set_param obdfilter.*.quota_btune_sz=$btune" # set btune size on mds - do_facet mds "for i in /proc/fs/lustre/mds/${FSNAME}-MDT*/quota_btune_sz; do - echo $(($1 * BLK_SZ)) >> \\\$i; - done" + do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_btune_sz=$btune" } # set_blk_unitsz(bunit_sz) set_blk_unitsz() { - local i - do_facet ost1 "for i in /proc/fs/lustre/obdfilter/*/quota_bunit_sz; do - echo $(($1 * BLK_SZ)) >> \\\$i; - done" - do_facet mds "for i in /proc/fs/lustre/mds/${FSNAME}-MDT*/quota_bunit_sz; do - echo $(($1 * BLK_SZ)) >> \\\$i; - done" + local bunit=$(($1 * BLK_SZ)) + # set bunit size on all obdfilters + do_facet ost1 "lctl set_param obdfilter.*.quota_bunit_sz=$bunit" + # set bunit size on mds + do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_bunit_sz=$bunit" } # set_file_tunesz(itune_sz) set_file_tunesz() { - local i - # set iunit and itune size on all obdfilters - do_facet ost1 "for i in /proc/fs/lustre/obdfilter/*/quota_itune_sz; 
do - echo $1 >> \\\$i; - done" - # set iunit and itune size on mds - do_facet mds "for i in /proc/fs/lustre/mds/${FSNAME}-MDT*/quota_itune_sz; do - echo $1 >> \\\$i; - done" + local itune=$1 + # set itune size on all obdfilters + do_facet ost1 "lctl set_param obdfilter.*.quota_itune_sz=$itune" + # set itune size on mds + do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_itune_sz=$itune" } # set_file_unitsz(iunit_sz) set_file_unitsz() { - local i - do_facet ost1 "for i in /proc/fs/lustre/obdfilter/*/quota_iunit_sz; do - echo $1 >> \\\$i; - done" - do_facet mds "for i in /proc/fs/lustre/mds/${FSNAME}-MDT*/quota_iunit_sz; do - echo $1 >> \\\$i; - done" + local iunit=$1 + # set iunit size on all obdfilters + do_facet ost1 "lctl set_param obdfilter.*.quota_iunit_sz=$iunit" + # set iunit size on mds + do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_iunit_sz=$iunit" } lustre_fail() { @@ -128,15 +116,15 @@ lustre_fail() { case $fail_node in "mds" ) - do_facet mds "sysctl -w lustre.fail_loc=$fail_loc" ;; + do_facet mds "lctl set_param fail_loc=$fail_loc" ;; "ost" ) for num in `seq $OSTCOUNT`; do - do_facet ost$num "sysctl -w lustre.fail_loc=$fail_loc" + do_facet ost$num "lctl set_param fail_loc=$fail_loc" done ;; "mds_ost" ) - do_facet mds "sysctl -w lustre.fail_loc=$fail_loc" ; + do_facet mds "lctl set_param fail_loc=$fail_loc" ; for num in `seq $OSTCOUNT`; do - do_facet ost$num "sysctl -w lustre.fail_loc=$fail_loc" + do_facet ost$num "lctl set_param fail_loc=$fail_loc" done ;; * ) echo "usage: lustre_fail fail_node fail_loc" ; return 1 ;; @@ -155,15 +143,21 @@ test_0() { $LFS quotaoff -ug $DIR $LFS quotacheck -ug $DIR - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR - sysctl -w lnet.debug="+quota" + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR + + lctl set_param debug="+quota" + do_facet mds "lctl set_param debug=+quota" + for num in `seq $OSTCOUNT`; do + do_facet ost$num 
"lctl set_param debug=+quota" + done } run_test 0 "Set quota =============================" # test for specific quota limitation, qunit, qtune $1=block_quota_limit test_1_sub() { LIMIT=$1 + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir TESTFILE="$DIR/$tdir/$tfile-0" @@ -171,7 +165,7 @@ test_1_sub() { # test for user log " User quota (limit: $LIMIT kbytes)" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR sleep 3 $SHOW_QUOTA_USER @@ -196,12 +190,12 @@ test_1_sub() { [ $OST0_QUOTA_USED -ne 0 ] && \ ($SHOW_QUOTA_USER; error "quota deleted isn't released") $SHOW_QUOTA_USER - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR # clear user limit + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR # test for group log "--------------------------------------" log " Group quota (limit: $LIMIT kbytes)" - $LFS setquota -g $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR sleep 3 $SHOW_QUOTA_GROUP TESTFILE="$DIR/$tdir/$tfile-1" @@ -227,7 +221,7 @@ test_1_sub() { [ $OST0_QUOTA_USED -ne 0 ] && \ ($SHOW_QUOTA_USER; error "quota deleted isn't released") $SHOW_QUOTA_GROUP - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR # clear group limit + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } # block hard limit (normal use and out of quota) @@ -252,6 +246,7 @@ run_test 1 "Block hard limit (normal use and out of quota) ===" # test for specific quota limitation, qunit, qtune $1=block_quota_limit test_2_sub() { LIMIT=$1 + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir TESTFILE="$DIR/$tdir/$tfile-0" @@ -259,13 +254,13 @@ test_2_sub() { # test for user log " User quota (limit: $LIMIT files)" - $LFS setquota -u $TSTUSR 0 0 0 $LIMIT $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I $LIMIT $DIR sleep 3 $SHOW_QUOTA_USER log " Create $LIMIT files ..." 
$RUNAS createmany -m ${TESTFILE} $LIMIT || \ - error "(usr) create failure, but except success" + error "(usr) create failure, but expect success" log " Done" log " Create out of file quota ..." $RUNAS touch ${TESTFILE}_xxx && \ @@ -281,19 +276,19 @@ test_2_sub() { [ $MDS_QUOTA_USED -ne 0 ] && \ ($SHOW_QUOTA_USER; error "quota deleted isn't released") $SHOW_QUOTA_USER - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR # clear user limit + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR # test for group log "--------------------------------------" log " Group quota (limit: $LIMIT FILE)" - $LFS setquota -g $TSTUSR 0 0 0 $LIMIT $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I $LIMIT $DIR sleep 3 $SHOW_QUOTA_GROUP TESTFILE=$DIR/$tdir/$tfile-1 log " Create $LIMIT files ..." $RUNAS createmany -m ${TESTFILE} $LIMIT || \ - error "(usr) create failure, but except success" + error "(usr) create failure, but expect success" log " Done" log " Create out of file quota ..." $RUNAS touch ${TESTFILE}_xxx && \ @@ -309,17 +304,24 @@ test_2_sub() { [ $MDS_QUOTA_USED -ne 0 ] && \ ($SHOW_QUOTA_USER; error "quota deleted isn't released") $SHOW_QUOTA_GROUP - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR # clear user limit + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } # file hard limit (normal use and out of quota) test_2() { for i in `seq 1 $cycle`; do - # define ino_qunit is between 10 and 100 - ino_qunit=$(( $RANDOM % 90 + 10 )) - ino_qtune=$(( $RANDOM % $ino_qunit )) - # RANDOM's maxium is 32767 - i_limit=$(( $RANDOM % 990 + 10 )) + if [ $i -eq 1 ]; then + ino_qunit=52 + ino_qtune=41 + i_limit=11 + else + # pick ino_qunit at random in the range 10..99 + ino_qunit=$(( $RANDOM % 90 + 10 )) + ino_qtune=$(( $RANDOM % $ino_qunit )) + # RANDOM's maximum is 32767 + i_limit=$(( $RANDOM % 990 + 10 )) + fi + set_file_tunesz $ino_qtune set_file_unitsz $ino_qunit echo "cycle: $i(total $cycle) iunit:$ino_qunit, itune:$ino_qtune, ilimit:$i_limit" @@ -395,6 +397,7 @@ test_block_soft() { # block soft limit (start
timer, timer goes off, stop timer) test_3() { + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir # 1 bunit on mds and 1 bunit on every ost @@ -407,11 +410,11 @@ test_3() { $LFS setstripe $TESTFILE -c 1 chown $TSTUSR.$TSTUSR $TESTFILE - $LFS setquota -t -u $GRACE $MAX_IQ_TIME $DIR - $LFS setquota -u $TSTUSR $LIMIT 0 0 0 $DIR + $LFS setquota -t -u --block-grace $GRACE --inode-grace $MAX_IQ_TIME $DIR + $LFS setquota -u $TSTUSR -b $LIMIT -B 0 -i 0 -I 0 $DIR test_block_soft $TESTFILE $GRACE - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR echo " Group quota (soft limit: $LIMIT kbytes grace: $GRACE seconds)" TESTFILE=$DIR/$tdir/$tfile-1 @@ -419,11 +422,11 @@ test_3() { $LFS setstripe $TESTFILE -c 1 chown $TSTUSR.$TSTUSR $TESTFILE - $LFS setquota -t -g $GRACE $MAX_IQ_TIME $DIR - $LFS setquota -g $TSTUSR $LIMIT 0 0 0 $DIR + $LFS setquota -t -g --block-grace $GRACE --inode-grace $MAX_IQ_TIME $DIR + $LFS setquota -g $TSTUSR -b $LIMIT -B 0 -i 0 -I 0 $DIR test_block_soft $TESTFILE $GRACE - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } run_test 3 "Block soft limit (start timer, timer goes off, stop timer) ===" @@ -454,7 +457,11 @@ test_file_soft() { $SHOW_QUOTA_INFO echo " Create file after timer goes off" - $RUNAS touch ${TESTFILE}_after ${TESTFILE}_after1 && \ + # the minimum inode qunit is 2, so at most 3 inodes (qunit:2 + qtune:1) + # of quota can still be left here + $RUNAS touch ${TESTFILE}_after ${TESTFILE}_after1 ${TESTFILE}_after2 || true + sync; sleep 1; sync + $RUNAS touch ${TESTFILE}_after3 && \ error "create after timer expired, but expect EDQUOT" sync; sleep 1; sync @@ -478,7 +485,8 @@ test_file_soft() { } # file soft limit (start timer, timer goes off, stop timer) -test_4() { +test_4a() { # was test_4 + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir LIMIT=$(($IUNIT_SZ * 10)) # 10 iunits on mds TESTFILE=$DIR/$tdir/$tfile-0 @@ -486,29 +494,29 @@ test_4() { GRACE=5 echo " User quota (soft limit:
$LIMIT files grace: $GRACE seconds)" - $LFS setquota -t -u $MAX_DQ_TIME $GRACE $DIR - $LFS setquota -u $TSTUSR 0 0 $LIMIT 0 $DIR + $LFS setquota -t -u --block-grace $MAX_DQ_TIME --inode-grace $GRACE $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i $LIMIT -I 0 $DIR $SHOW_QUOTA_USER test_file_soft $TESTFILE $LIMIT $GRACE - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR echo " Group quota (soft limit: $LIMIT files grace: $GRACE seconds)" - $LFS setquota -t -g $MAX_DQ_TIME $GRACE $DIR - $LFS setquota -g $TSTUSR 0 0 $LIMIT 0 $DIR + $LFS setquota -t -g --block-grace $MAX_DQ_TIME --inode-grace $GRACE $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i $LIMIT -I 0 $DIR $SHOW_QUOTA_GROUP TESTFILE=$DIR/$tdir/$tfile-1 test_file_soft $TESTFILE $LIMIT $GRACE - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR # cleanup - $LFS setquota -t -u $MAX_DQ_TIME $MAX_IQ_TIME $DIR - $LFS setquota -t -g $MAX_DQ_TIME $MAX_IQ_TIME $DIR + $LFS setquota -t -u --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR + $LFS setquota -t -g --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR } -run_test 4 "File soft limit (start timer, timer goes off, stop timer) ===" +run_test 4a "File soft limit (start timer, timer goes off, stop timer) ===" -test_4a() { +test_4b() { # was test_4a GR_STR1="1w3d" GR_STR2="1000s" GR_STR3="5s" @@ -520,32 +528,33 @@ test_4a() { # test of valid grace strings handling echo " Valid grace strings test" - $LFS setquota -t -u $GR_STR1 $GR_STR2 $DIR + $LFS setquota -t -u --block-grace $GR_STR1 --inode-grace $GR_STR2 $DIR $LFS quota -u -t $DIR | grep "Block grace time: $GR_STR1" - $LFS setquota -t -g $GR_STR3 $GR_STR4 $DIR + $LFS setquota -t -g --block-grace $GR_STR3 --inode-grace $GR_STR4 $DIR $LFS quota -g -t $DIR | grep "Inode grace time: $GR_STR4" # test of invalid grace strings handling echo " Invalid grace strings test" - ! $LFS setquota -t -u $GR_STR4 $GR_STR5 $DIR - ! 
$LFS setquota -t -g $GR_STR4 $GR_STR6 $DIR + ! $LFS setquota -t -u --block-grace $GR_STR4 --inode-grace $GR_STR5 $DIR + ! $LFS setquota -t -g --block-grace $GR_STR4 --inode-grace $GR_STR6 $DIR # cleanup - $LFS setquota -t -u $MAX_DQ_TIME $MAX_IQ_TIME $DIR - $LFS setquota -t -g $MAX_DQ_TIME $MAX_IQ_TIME $DIR + $LFS setquota -t -u --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR + $LFS setquota -t -g --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR } -run_test 4a "Grace time strings handling ===" +run_test 4b "Grace time strings handling ===" # chown & chgrp (chown & chgrp successfully even out of block/file quota) test_5() { + mkdir -p $DIR/$tdir BLIMIT=$(( $BUNIT_SZ * $((OSTCOUNT + 1)) * 10)) # 10 bunits on each server ILIMIT=$(( $IUNIT_SZ * 10 )) # 10 iunits on mds wait_delete_completed echo " Set quota limit (0 $BLIMIT 0 $ILIMIT) for $TSTUSR.$TSTUSR" - $LFS setquota -u $TSTUSR 0 $BLIMIT 0 $ILIMIT $DIR - $LFS setquota -g $TSTUSR 0 $BLIMIT 0 $ILIMIT $DIR + $LFS setquota -u $TSTUSR -b 0 -B $BLIMIT -i 0 -I $ILIMIT $DIR + $LFS setquota -g $TSTUSR -b 0 -B $BLIMIT -i 0 -I $ILIMIT $DIR $SHOW_QUOTA_USER $SHOW_QUOTA_GROUP @@ -564,8 +573,8 @@ test_5() { unlinkmany $DIR/$tdir/$tfile-0_ $((ILIMIT + 1)) sync; sleep 3; sync; - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } run_test 5 "Chown & chgrp successfully even out of block/file quota ===" @@ -578,6 +587,7 @@ test_6() { wait_delete_completed + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir LIMIT=$((BUNIT_SZ * (OSTCOUNT + 1) * 5)) # 5 bunits per server @@ -585,8 +595,8 @@ test_6() { FILEB="$DIR/$tdir/$tfile-0_b" echo " Set block limit $LIMIT kbytes to $TSTUSR.$TSTUSR" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR - $LFS setquota -g $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR 
$SHOW_QUOTA_USER $SHOW_QUOTA_GROUP @@ -630,8 +640,8 @@ test_6() { rm -f $FILEA sync; sleep 3; sync; - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR return 0 } run_test 6 "Block quota acquire & release =========" @@ -639,6 +649,7 @@ run_test 6 "Block quota acquire & release =========" # quota recovery (block quota only by now) test_7() { + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir remote_mds && skip "remote mds" && return 0 @@ -647,7 +658,7 @@ test_7() LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) )) TESTFILE="$DIR/$tdir/$tfile-0" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR $LFS setstripe $TESTFILE -c 1 chown $TSTUSR.$TSTUSR $TESTFILE @@ -673,7 +684,7 @@ test_7() # check limits PATTERN="`echo $DIR | sed 's/\//\\\\\//g'`" - TOTAL_LIMIT="`$LFS quota -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`" + TOTAL_LIMIT="`$LFS quota -v -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`" [ $TOTAL_LIMIT -eq $LIMIT ] || error "total limits not recovery!" 
echo " total limits = $TOTAL_LIMIT" @@ -684,12 +695,13 @@ test_7() echo " limits on $OST0_UUID = $OST0_LIMIT" # cleanup - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } run_test 7 "Quota recovery (only block limit) ======" # run dbench with quota enabled test_8() { + mkdir -p $DIR/$tdir BLK_LIMIT=$((100 * 1024 * 1024)) # 100G FILE_LIMIT=1000000 DBENCH_LIB=${DBENCH_LIB:-/usr/lib/dbench} @@ -699,9 +711,9 @@ test_8() { wait_delete_completed echo " Set enough high limit for user: $TSTUSR" - $LFS setquota -u $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR + $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR echo " Set enough high limit for group: $TSTUSR" - $LFS setquota -g $USER 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR + $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR TGT=$DIR/$tdir/client.txt SRC=${SRC:-$DBENCH_LIB/client.txt} @@ -754,16 +766,18 @@ test_9() { set_blk_tunesz 512 set_blk_unitsz 1024 + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir TESTFILE="$DIR/$tdir/$tfile-0" BLK_LIMIT=$((100 * KB * KB)) # 100G FILE_LIMIT=1000000 + echo " Set block limit $BLK_LIMIT kbytes to $TSTUSR.$TSTUSR" log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for user: $TSTUSR" - $LFS setquota -u $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR + $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for group: $TSTUSR" - $LFS setquota -g $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR + $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR echo " Set stripe" $LFS setstripe $TESTFILE -c 1 @@ -798,6 +812,7 @@ run_test 9 "run for fixing bug10707(64bit) ===========" # run for fixing bug10707, it need a big room.
test for 32bit test_10() { + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir check_whether_skip && return 0 @@ -815,9 +830,9 @@ test_10() { FILE_LIMIT=1000000 log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for user: $TSTUSR" - $LFS setquota -u $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR + $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for group: $TSTUSR" - $LFS setquota -g $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR + $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR echo " Set stripe" $LFS setstripe $TESTFILE -c 1 @@ -859,14 +874,14 @@ test_11() { #prepare the test block_limit=`(echo 0; df -t lustre -P | awk '{print $(NF - 4)}') | tail -n 1` echo $block_limit - orig_dbr=`cat /proc/sys/vm/dirty_background_ratio` - orig_dec=`cat /proc/sys/vm/dirty_expire_centisecs` - orig_dr=`cat /proc/sys/vm/dirty_ratio` - orig_dwc=`cat /proc/sys/vm/dirty_writeback_centisecs` - echo 1 > /proc/sys/vm/dirty_background_ratio - echo 30 > /proc/sys/vm/dirty_expire_centisecs - echo 1 > /proc/sys/vm/dirty_ratio - echo 50 > /proc/sys/vm/dirty_writeback_centisecs + orig_dbr=`sysctl -n vm.dirty_background_ratio` + orig_dec=`sysctl -n vm.dirty_expire_centisecs` + orig_dr=`sysctl -n vm.dirty_ratio` + orig_dwc=`sysctl -n vm.dirty_writeback_centisecs` + sysctl -w vm.dirty_background_ratio=1 + sysctl -w vm.dirty_expire_centisecs=30 + sysctl -w vm.dirty_ratio=1 + sysctl -w vm.dirty_writeback_centisecs=50 TESTDIR="$DIR/$tdir" local RV=0 @@ -878,12 +893,12 @@ test_11() { local i=1 while [ $i -le $REPS ]; do echo "test: cycle($i of $REPS) start at $(date)" - mkdir -p $DIR/$tdir && chmod 777 $DIR/$tdir + mkdir -p $TESTDIR && chmod 777 $TESTDIR echo -n " create a file for uid " for j in `seq 1 30`; do echo -n "$j " # 30MB per dd for a total of 900MB (if space even permits) - runas -u $j dd if=/dev/zero of=$DIR/$tdir/$tfile bs=$blksize count=15 > /dev/null 2>&1 & + runas -u $j dd if=/dev/zero 
of=$TESTDIR/$tfile bs=$blksize count=15 > /dev/null 2>&1 & done echo "" PROCS=$(ps -ef | grep -v grep | grep "dd if /dev/zero of $TESTDIR" | wc -l) @@ -892,11 +907,11 @@ test_11() { sleep 20 SECS=$((SECS + sleep)) PROCS=$(ps -ef | grep -v grep | grep "dd if /dev/zero of $TESTDIR" | wc -l) - USED=$(du -s $DIR/$tdir | awk '{print $1}') + USED=$(du -s $TESTDIR | awk '{print $1}') PCT=$(($USED * 100 / $block_limit)) echo "${i}/${REPS} ${PCT}% p${PROCS} t${SECS} " if [ $USED -le $LAST_USED ]; then - kill -9 $(ps -ef | grep "dd if /dev/zero of $DIR/$tdir" | grep -v grep | awk '{ print $2 }') + kill -9 $(ps -ef | grep "dd if /dev/zero of $TESTDIR" | grep -v grep | awk '{ print $2 }') i=$REPS RV=2 break @@ -904,17 +919,17 @@ test_11() { LAST_USED=$USED done echo " removing the test files..." - rm -f $DIR/$tdir/$tfile + rm -f $TESTDIR/$tfile echo "cycle $i done at $(date)" i=$[$i+1] done echo "Test took $SECS sec" #clean - echo $orig_dbr > /proc/sys/vm/dirty_background_ratio - echo $orig_dec > /proc/sys/vm/dirty_expire_centisecs - echo $orig_dr > /proc/sys/vm/dirty_ratio - echo $orig_dwc > /proc/sys/vm/dirty_writeback_centisecs + sysctl -w vm.dirty_background_ratio=$orig_dbr + sysctl -w vm.dirty_expire_centisecs=$orig_dec + sysctl -w vm.dirty_ratio=$orig_dr + sysctl -w vm.dirty_writeback_centisecs=$orig_dwc if [ $RV -ne 0 ]; then error "Nothing was written for $SECS sec ... 
aborting" fi @@ -925,6 +940,7 @@ run_test 11 "run for fixing bug10912 ===========" # test a deadlock between quota and journal b=11693 test_12() { + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir [ "$(grep $DIR2 /proc/mounts)" ] || mount_client $DIR2 || \ @@ -942,7 +958,7 @@ test_12() { wait_delete_completed echo " User quota (limit: $LIMIT kbytes)" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR $LFS setstripe $TESTFILE -i 0 -c 1 chown $TSTUSR.$TSTUSR $TESTFILE @@ -979,7 +995,7 @@ test_12() { while [ true ]; do if [ -z `ps -ef | awk '$2 == '${DDPID}' { print $8 }'` ]; then break; fi count=$[count+1] - if [ $count -gt 100 ]; then + if [ $count -gt 150 ]; then error "dd should be finished!" fi sleep 1 @@ -989,20 +1005,21 @@ test_12() { rm -f $TESTFILE $TESTFILE2 sync; sleep 3; sync; - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR # clear user limit + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } run_test 12 "test a deadlock between quota and journal ===" # test multiple clients write block quota b=11693 test_13() { - wait_delete_completed + mkdir -p $DIR/$tdir + wait_delete_completed # one OST * 10 + (mds + other OSTs) LIMIT=$((BUNIT_SZ * 10 + (BUNIT_SZ * OSTCOUNT))) TESTFILE="$DIR/$tdir/$tfile" echo " User quota (limit: $LIMIT kbytes)" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR $SHOW_QUOTA_USER $LFS setstripe $TESTFILE -i 0 -c 1 @@ -1052,12 +1069,12 @@ test_13() { rm -f $TESTFILE $TESTFILE.2 sync; sleep 3; sync; - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR # clear user limit + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } run_test 13 "test multiple clients write block quota ===" check_if_quota_zero(){ - line=`$LFS quota -$1 $2 $DIR | wc -l` + line=`$LFS quota -v -$1 $2 $DIR | wc -l` for i in `seq 3 $line`; do if [ $i -eq 3 ]; then field="3 4 6 7" @@ -1065,29 +1082,27 @@ check_if_quota_zero(){ field="3 5" fi for j in $field; do - tmp=`$LFS quota -$1 $2 $DIR | 
sed -n ${i}p | + tmp=`$LFS quota -v -$1 $2 $DIR | sed -n ${i}p | awk '{print $'"$j"'}'` - [ -n "$tmp" ] && [ $tmp -ne 0 ] && $LFS quota -$1 $2 $DIR && \ + [ -n "$tmp" ] && [ $tmp -ne 0 ] && $LFS quota -v -$1 $2 $DIR && \ error "quota on $2 isn't clean" done done echo "pass check_if_quota_zero" } -pre_test_14 () { +test_14a() { # was test_14 b=12223 -- setting quota on root + TESTFILE="$DIR/$tdir/$tfile" + # reboot the lustre sync; sleep 5; sync - cd $T_PWD; sh llmountcleanup.sh || error "llmountcleanup failed" - sh llmount.sh - run_test 0 "reboot lustre" -} -pre_test_14 + cleanup_and_setup_lustre + test_0 -test_14(){ # b=12223 -- setting quota on root - TESTFILE="$DIR/$tdir/$tfile" + mkdir -p $DIR/$tdir # out of root's file and block quota - $LFS setquota -u root 10 10 10 10 $DIR + $LFS setquota -u root -b 10 -B 10 -i 10 -I 10 $DIR createmany -m ${TESTFILE} 20 || \ error "unexpected: user(root) create files failly!" dd if=/dev/zero of=$TESTFILE bs=4k count=4096 || \ @@ -1103,7 +1118,7 @@ test_14(){ # b=12223 -- setting quota on root # do the check dmesg | tail | grep "\-122" |grep llog_obd_origin_add && error "err -122 not found in dmesg" - $LFS setquota -u root 0 0 0 0 $DIR + $LFS setquota -u root -b 0 -B 0 -i 0 -I 0 $DIR #check_if_quota_zero u root # clean @@ -1111,15 +1126,26 @@ test_14(){ # b=12223 -- setting quota on root rm -f $TESTFILE sync; sleep 3; sync; } -run_test 14 "test setting quota on root ===" +run_test 14a "test setting quota on root ===" +# set quota version (both administrative and operational quotas) quota_set_version() { - do_facet mds "for i in /proc/fs/lustre/mds/${FSNAME}-MDT*/quota_type; do - echo $1 >> \\\$i; - done" + do_facet mds "lctl set_param mds.${FSNAME}-MDT*.quota_type=$1" + for j in `seq $OSTCOUNT`; do + do_facet ost$j "lctl set_param obdfilter.*.quota_type=$1" + done } -test_14a(){ +# save quota version (both administrative and operational quotas) +quota_save_version() { + do_facet mgs "lctl conf_param 
${FSNAME}-MDT*.mdt.quota_type=$1" + do_facet mgs "lctl conf_param ${FSNAME}-OST*.ost.quota_type=$1" +} + +test_14b(){ + local l + local CURSPACE + # 1. check that required users exist # 2. ensure that switch to new mode will start conversion # 3. start quota in old mode and put some entries @@ -1129,7 +1155,7 @@ test_14a(){ MISSING_USERS="" for i in `seq 1 30`; do - check_runas_id_ret quota15_$i "runas -u quota15_$i" + check_runas_id_ret quota15_$i "runas -u quota15_$i" >/dev/null 2>/dev/null if [ "$?" != "0" ]; then MISSING_USERS="$MISSING_USERS quota15_$i" fi @@ -1141,26 +1167,52 @@ test_14a(){ fi $LFS quotaoff -ug $DIR + echo "setting quota version 1" quota_set_version 1 + echo "running quotacheck" $LFS quotacheck -ug $DIR + mkdir -p $DIR/$tdir + chmod 0777 $DIR/$tdir + for i in `seq 1 30`; do + l=$[$i*1024*128] # set limits in 128 Mb units + $LFS setquota -u quota15_$i -b $l -B $l -i $l -I $l $DIR || error "lfs setquota failed" + runas -u quota15_$i dd if=/dev/zero of="$DIR/$tdir/quota15_$i" \ + bs=1048576 count=$[($i+1)/2] || error "dd failed" + done + cancel_lru_locks osc + + echo "saving quota data" for i in `seq 1 30`; do - $LFS setquota -u quota15_$i $i $i $i $i $DIR || error "lfs setquota failed" + CURSPACE[$i]=`$LFS quota -v -u quota15_$i $MOUNT | awk '{if(start) {start=0; sum += $1} if(($1 ~ /OST/) && (NF==1)) {start=1;} + if(($1 ~ /OST/) && (NF != 1)) {sum += $2}; } END { print sum }'` done $LFS quotaoff -ug $DIR - quota_set_version 2 + echo "setting version 3 or 2 (dependent on the kernel support)" + quota_set_version 3 2>&1 | grep "Invalid argument" && quota_set_version 2 + + echo "invalidating quota files" $LFS quotainv -ug $DIR + $LFS quotainv -ugf $DIR $LFS quotacheck -ug $DIR for i in `seq 1 30`; do + l=$[$i*1024*128] # the format is "mntpnt curspace[*] bsoftlimit bhardlimit [time] curinodes[*] isoftlimit ihardlimit" - ($LFS quota -u quota15_$i $DIR | grep -E '^ *'$DIR' *[0-9]+\** *'$i' *'$i' *[0-9]+\** *'$i' *'$i) \ - || error "lfs quota output 
is unexpected" - $LFS setquota -u quota15_$i 0 0 0 0 $DIR || error "ifs setquota clear failed" + echo "checking administrative quota migration results for user quota15_$i" + $LFS quota -v -u quota15_$i $DIR | grep -E '^ *'$MOUNT' *[0-9]+\** *'$l' *'$l' *[0-9]+\** *'$l' *'$l \ + || error "lfs quota output is unexpected" + echo "checking operational quota migration results for user quota15_$i, curspace should be ${CURSPACE[$i]}" + l=`$LFS quota -v -u quota15_$i $MOUNT | awk '{if(start) {start=0; sum += $1} if(($1 ~ /OST/) && (NF==1)) {start=1;} + if(($1 ~ /OST/) && (NF != 1)) {sum += $2}; } END { print sum }'` + echo "...real is $l" + [ "$l" -eq "${CURSPACE[$i]}" ] || error "curspace mismatch" + rm $DIR/$tdir/quota15_$i || error "could not remove quota15_$i" + $LFS setquota -u quota15_$i -b 0 -B 0 -i 0 -I 0 $DIR || error "lfs setquota clear failed" done } -run_test 14a "setting 30 quota entries in quota v1 file before conversion ===" +run_test 14b "setting 30 quota entries in quota v1 file before conversion ===" test_15(){ LIMIT=$((24 * 1024 * 1024 * 1024 * 1024)) # 24 TB @@ -1169,24 +1221,24 @@ test_15(){ wait_delete_completed # test for user - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR || error "failed setting user quota limit $LIMIT" - TOTAL_LIMIT="`$LFS quota -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`" + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR + TOTAL_LIMIT="`$LFS quota -v -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`" [ $TOTAL_LIMIT -eq $LIMIT ] || error " (user)total limits = $TOTAL_LIMIT; limit = $LIMIT, failed!" echo " (user)total limits = $TOTAL_LIMIT; limit = $LIMIT, successful!" 
- $LFS setquota -u $TSTUSR 0 0 0 0 $DIR || error "failed removing user quota limit" + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR # test for group - $LFS setquota -g $TSTUSR 0 $LIMIT 0 0 $DIR || error "failed setting group quota limit $LIMIT" - TOTAL_LIMIT="`$LFS quota -g $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`" + $LFS setquota -g $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR + TOTAL_LIMIT="`$LFS quota -v -g $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`" [ $TOTAL_LIMIT -eq $LIMIT ] || error " (group)total limits = $TOTAL_LIMIT; limit = $LIMIT, failed!" echo " (group)total limits = $TOTAL_LIMIT; limit = $LIMIT, successful!" - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR || error "failed removing group quota limit" + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR $LFS quotaoff -ug $DIR quota_set_version 1 $LFS quotacheck -ug $DIR || error "quotacheck failed" echo "Testing that >4GB quota limits fail on volume with quota v1" - ! $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR + ! 
$LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR } run_test 15 "set block quota more than 4T ===" @@ -1194,15 +1246,16 @@ run_test 15 "set block quota more than 4T ===" test_16_tub() { LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 4)) TESTFILE="$DIR/$tdir/$tfile" + mkdir -p $DIR/$tdir wait_delete_completed echo " User quota (limit: $LIMIT kbytes)" if [ $1 == "u" ]; then - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR $SHOW_QUOTA_USER else - $LFS setquota -g $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR $SHOW_QUOTA_GROUP fi @@ -1228,7 +1281,7 @@ test_16_tub() { rm -f $TESTFILE sync; sleep 3; sync; - $LFS setquota -$1 $TSTUSR 0 0 0 0 $DIR + $LFS setquota -$1 $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } # test without adjusting qunit @@ -1262,13 +1315,14 @@ test_17() { TESTFILE="$DIR/$tdir/$tfile-a" TESTFILE2="$DIR/$tdir/$tfile-b" + mkdir -p $DIR/$tdir BLK_LIMIT=$((100 * 1024)) # 100M log " Set enough high limit(block:$BLK_LIMIT) for user: $TSTUSR" - $LFS setquota -u $TSTUSR 0 $BLK_LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I 0 $DIR log " Set enough high limit(block:$BLK_LIMIT) for group: $TSTUSR" - $LFS setquota -g $TSTUSR 0 $BLK_LIMIT 0 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I 0 $DIR touch $TESTFILE chown $TSTUSR.$TSTUSR $TESTFILE @@ -1311,6 +1365,7 @@ run_test 17 "run for fixing bug14526 ===========" test_18() { LIMIT=$((100 * 1024 * 1024)) # 100G TESTFILE="$DIR/$tdir/$tfile" + mkdir -p $DIR/$tdir wait_delete_completed @@ -1318,7 +1373,7 @@ test_18() { set_blk_unitsz 1024 log " User quota (limit: $LIMIT kbytes)" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $MOUNT + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT $SHOW_QUOTA_USER $LFS setstripe $TESTFILE -i 0 -c 1 @@ -1336,7 +1391,7 @@ test_18() { echo " step2: testing ......" 
count=0 - timeout=$(sysctl -n lustre.timeout) + timeout=$(lctl get_param -n timeout) while [ true ]; do if [ -z `ps -ef | awk '$2 == '${DDPID}' { print $8 }'` ]; then break; fi count=$[count+1] @@ -1355,7 +1410,7 @@ test_18() { rm -f $TESTFILE sync; sleep 3; sync; - $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT # clear user limit + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT set_blk_unitsz $((128 * 1024)) set_blk_tunesz $((128 * 1024 / 2)) @@ -1366,6 +1421,7 @@ run_test 18 "run for fixing bug14840 ===========" test_18a() { LIMIT=$((100 * 1024 * 1024)) # 100G TESTFILE="$DIR/$tdir/$tfile-a" + mkdir -p $DIR/$tdir wait_delete_completed @@ -1373,7 +1429,7 @@ test_18a() { set_blk_unitsz 1024 log " User quota (limit: $LIMIT kbytes)" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $MOUNT + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT $SHOW_QUOTA_USER $LFS setstripe $TESTFILE -i 0 -c 1 @@ -1388,7 +1444,7 @@ test_18a() { echo " step2: testing ......" count=0 - timeout=$(sysctl -n lustre.timeout) + timeout=$(lctl get_param -n timeout) while [ true ]; do if [ -z `ps -ef | awk '$2 == '${DDPID}' { print $8 }'` ]; then break; fi count=$[count+1] @@ -1411,18 +1467,186 @@ test_18a() { rm -f $TESTFILE sync; sleep 3; sync; - $LFS setquota -u $TSTUSR 0 0 0 0 $MOUNT # clear user limit + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT set_blk_unitsz $((128 * 1024)) set_blk_tunesz $((128 * 1024 / 2)) } run_test 18a "run for fixing bug14840 ===========" +test_19() { + # 1 Mb bunit per each MDS/OSS + LIMIT=$((($OSTCOUNT + 1) * 1024)) + TESTFILE="$DIR/$tdir/$tfile" + mkdir -p $DIR/$tdir + + wait_delete_completed + + # set 1 Mb quota unit size + set_blk_tunesz 512 + set_blk_unitsz 1024 + + # bind file to a single OST + $LFS setstripe -c 1 $TESTFILE + chown $TSTUSR.$TSTUSR $TESTFILE + + echo " User quota (limit: $LIMIT kbytes)" + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT + $SHOW_QUOTA_USER + echo " Updating quota limits" + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 
-I 0 $MOUNT + $SHOW_QUOTA_USER + + $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=1028 || true + # for now page cache of TESTFILE may still be dirty, + # let's push it to the corresponding OST, this will also + # cache NOQUOTA on the client from OST's reply + cancel_lru_locks osc + $RUNAS dd if=/dev/zero of=$TESTFILE seek=1028 bs=$BLK_SZ count=1 && \ + error "(usr) write success, should be EDQUOT" + $SHOW_QUOTA_USER + + # cleanup + rm -f $TESTFILE + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT + + set_blk_unitsz $((128 * 1024)) + set_blk_tunesz $((128 * 1024 / 2)) + +} +run_test 19 "test if administrative limits updates do not zero operational limits (14790) ===" + +test_20() +{ + LSTR=(1t 2g 3m 4k) # limits strings + LVAL=($[1*1024*1024*1024] $[2*1024*1024] $[3*1024*1024] $[4*1024]) # limits values + + $LFS setquota -u $TSTUSR --block-softlimit ${LSTR[0]} \ + $MOUNT || error "could not set quota limits" + + $LFS setquota -u $TSTUSR --block-hardlimit ${LSTR[1]} \ + --inode-softlimit ${LSTR[2]} \ + --inode-hardlimit ${LSTR[3]} \ + $MOUNT || error "could not set quota limits" + + ($LFS quota -v -u $TSTUSR $MOUNT | \ + grep -E '^ *'$MOUNT' *[0-9]+\** *'${LVAL[0]}' *'${LVAL[1]}' *[0-9]+\** *'${LVAL[2]}' *'${LVAL[3]}) \ + || error "lfs quota output is unexpected" + + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 \ + $MOUNT || error "could not reset quota limits" + +} +run_test 20 "test if setquota specifiers work properly (15754)" + +test_21_sub() { + local testfile=$1 + local blk_number=$2 + local seconds=$3 + + time=$(($(date +%s) + seconds)) + while [ $(date +%s) -lt $time ]; do + $RUNAS dd if=/dev/zero of=$testfile bs=$BLK_SZ count=$blk_number > /dev/null 2>&1 + rm -f $testfile + done +} + +# run for fixing bug16053, setquota shouldn't fail when writing and +# deleting are happening +test_21() { + set_blk_tunesz 512 + set_blk_unitsz 1024 + + wait_delete_completed + + TESTFILE="$DIR/$tdir/$tfile" + + BLK_LIMIT=$((10 * 1024 * 1024)) # 10G + 
FILE_LIMIT=1000000 + + log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for user: $TSTUSR" + $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $MOUNT + log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for group: $TSTUSR" + $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $MOUNT + + # repeat writing on a 1M file + test_21_sub ${TESTFILE}_1 1024 30 & + DDPID1=$! + # repeat writing on a 128M file + test_21_sub ${TESTFILE}_2 $((1024 * 128)) 30 & + DDPID2=$! + + time=$(($(date +%s) + 30)) + i=1 + while [ $(date +%s) -lt $time ]; do + log " Set quota for $i times" + $LFS setquota -u $TSTUSR -b 0 -B $((BLK_LIMIT + 1024 * i)) -i 0 -I $((FILE_LIMIT + i)) $MOUNT + $LFS setquota -g $TSTUSR -b 0 -B $((BLK_LIMIT + 1024 * i)) -i 0 -I $((FILE_LIMIT + i)) $MOUNT + i=$((i+1)) + sleep 1 + done + + count=0 + while [ true ]; do + if [ $(ps -p ${DDPID1} | wc -l) -eq 1 ]; then break; fi + count=$[count+1] + if [ $count -gt 60 ]; then + error "dd should be finished!" + fi + sleep 1 + done + echo "(dd_pid=$DDPID1, time=$count)successful" + + count=0 + while [ true ]; do + if [ $(ps -p ${DDPID2} | wc -l) -eq 1 ]; then break; fi + count=$[count+1] + if [ $count -gt 60 ]; then + error "dd should be finished!" 
+ fi + sleep 1 + done + echo "(dd_pid=$DDPID2, time=$count)successful" + + set_blk_unitsz $((128 * 1024)) + set_blk_tunesz $((128 * 1024 / 2)) + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT + + return $RC +} +run_test 21 "run for fixing bug16053 ===========" + +test_22() { + local SAVEREFORMAT + + SAVEREFORMAT=$REFORMAT + $LFS quotaoff -ug $DIR || error "could not turn quotas off" + quota_set_version "1" + $LFS quotacheck -ug $DIR || error "quotacheck failed" + + quota_save_version "ug1" + + REFORMAT="reformat" + stopall + mount + setupall + REFORMAT=$SAVEREFORMAT + + echo "checking parameters" + + do_facet mds "lctl get_param mds.${FSNAME}-MDT*.quota_type" | grep "ug1" || error "admin failure" + do_facet ost1 "lctl get_param obdfilter.*.quota_type" | grep "ug1" || error "op failure" + + run_test 0 "reboot lustre" +} +run_test 22 "test if quota_type saved as permanent parameter ====" + # turn off quota test_99() { $LFS quotaoff $DIR - sysctl -w lnet.debug="-quota" + lctl set_param debug="-quota" return 0 } diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 0d3786136cf1826f2824c9e4ed181065fb63c79d..af35b8eaa0f4998c214aa7b1085acdaa480bf630 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -7,8 +7,8 @@ set -e ONLY=${ONLY:-"$*"} -# bug number for skipped test: 13297 2108 9789 3637 9789 3561 12622 13310 10764 -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27u 42a 42b 42c 42d 45 51d 74b 75 $SANITY_EXCEPT" } +# bug number for skipped test: 13297 2108 9789 3637 9789 3561 12622 15528/2330 5188 10764 +ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"27u 42a 42b 42c 42d 45 51d 62 68 75 $SANITY_EXCEPT" } # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! 
# Tests that fail on uml, maybe elsewhere, FIXME @@ -23,8 +23,8 @@ case `uname -r` in *) error "unsupported kernel" ;; esac -SRCDIR=`dirname $0` -export PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH:/sbin +SRCDIR=$(cd $(dirname $0); echo $PWD) +export PATH=$PATH:/sbin TMP=${TMP:-/tmp} @@ -41,7 +41,7 @@ LCTL=${LCTL:-lctl} MCREATE=${MCREATE:-mcreate} OPENFILE=${OPENFILE:-openfile} OPENUNLINK=${OPENUNLINK:-openunlink} -RANDOM_READS=${RANDOM_READS:-"random-reads"} +READS=${READS:-"reads"} TOEXCL=${TOEXCL:-toexcl} TRUNCATE=${TRUNCATE:-truncate} MUNLINK=${MUNLINK:-munlink} @@ -63,7 +63,7 @@ SAVE_PWD=$PWD CLEANUP=${CLEANUP:-:} SETUP=${SETUP:-:} TRACE=${TRACE:-""} -LUSTRE=${LUSTRE:-`dirname $0`/..} +LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} @@ -106,7 +106,7 @@ fi check_and_setup_lustre DIR=${DIR:-$MOUNT} -[ -z "`echo $DIR | grep $MOUNT`" ] && echo "$DIR not in $MOUNT" && exit 99 +assert_DIR LOVNAME=`lctl get_param -n llite.*.lov.common_name | tail -n 1` OSTCOUNT=`lctl get_param -n lov.$LOVNAME.numobd` @@ -134,8 +134,8 @@ echo # add a newline after mke2fs. 
umask 077 -OLDDEBUG="`sysctl -n lnet.debug 2> /dev/null`" -sysctl -w lnet.debug=-1 2> /dev/null || true +OLDDEBUG="`lctl get_param -n debug 2> /dev/null`" +lctl set_param debug=-1 2> /dev/null || true test_0() { touch $DIR/$tfile $CHECKSTAT -t file $DIR/$tfile || error @@ -493,6 +493,7 @@ run_test 21 "write to dangling link ============================" test_22() { WDIR=$DIR/$tdir + mkdir -p $WDIR chown $RUNAS_ID $WDIR (cd $WDIR || error "cd $WDIR failed"; $RUNAS tar cf - /etc/hosts /etc/sysconfig/network | \ @@ -759,6 +760,7 @@ run_test 26e "unlink multiple component recursive symlink ======" # recursive symlinks (bug 7022) test_26f() { + mkdir -p $DIR/$tdir mkdir $DIR/$tdir/$tfile || error "mkdir $DIR/$tdir/$tfile failed" cd $DIR/$tdir/$tfile || error "cd $DIR/$tdir/$tfile failed" mkdir -p lndir/bar1 || error "mkdir lndir/bar1 failed" @@ -894,7 +896,7 @@ reset_enospc() { [ "$1" ] && FAIL_LOC=$1 || FAIL_LOC=0 mkdir -p $DIR/d27/nospc rmdir $DIR/d27/nospc - sysctl -w lustre.fail_loc=$FAIL_LOC + lctl set_param fail_loc=$FAIL_LOC } exhaust_precreations() { @@ -908,7 +910,7 @@ exhaust_precreations() { mkdir -p $DIR/d27/${OST} $SETSTRIPE $DIR/d27/${OST} -i $OSTIDX -c 1 #define OBD_FAIL_OST_ENOSPC 0x215 - sysctl -w lustre.fail_loc=0x215 + lctl set_param fail_loc=0x215 echo "Creating to objid $last_id on ost $OST..." 
createmany -o $DIR/d27/${OST}/f $next_id $((last_id - next_id + 2)) lctl get_param -n osc.${OST}-osc.prealloc* | grep '[0-9]' @@ -1028,10 +1030,10 @@ test_27u() { # bug 4900 #define OBD_FAIL_MDS_OSC_PRECREATE 0x139 - sysctl -w lustre.fail_loc=0x139 + lctl set_param fail_loc=0x139 mkdir -p $DIR/d27u createmany -o $DIR/d27u/t- 1000 - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 TLOG=$DIR/$tfile.getstripe $GETSTRIPE $DIR/d27u > $TLOG @@ -1053,15 +1055,15 @@ test_27v() { # bug 4900 touch $DIR/$tdir/$tfile #define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 - sysctl -w lustre.fail_loc=0x705 + lctl set_param fail_loc=0x705 START=`date +%s` for F in `seq 1 32`; do touch $DIR/$tdir/$tfile.$F done - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 FINISH=`date +%s` - TIMEOUT=`sysctl -n lustre.timeout` + TIMEOUT=`lctl get_param -n timeout` [ $((FINISH - START)) -ge $((TIMEOUT / 2)) ] && \ error "$FINISH - $START >= $TIMEOUT / 2" @@ -1543,7 +1545,7 @@ test_36f() { DATESTR="Dec 20 2000" mkdir -p $DIR/$tdir #define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 - sysctl -w lustre.fail_loc=0x80000214 + lctl set_param fail_loc=0x80000214 date; date +%s cp /etc/hosts $DIR/$tdir/$tfile sync & # write RPC generated with "current" inode timestamp, but delayed @@ -1639,9 +1641,9 @@ start_writeback() { # in 2.6, restore /proc/sys/vm/dirty_writeback_centisecs, # dirty_ratio, dirty_background_ratio if [ -f /proc/sys/vm/dirty_writeback_centisecs ]; then - echo $WRITEBACK_SAVE > /proc/sys/vm/dirty_writeback_centisecs - echo $BG_DIRTY_RATIO_SAVE > /proc/sys/vm/dirty_background_ratio - echo $DIRTY_RATIO_SAVE > /proc/sys/vm/dirty_ratio + sysctl -w vm.dirty_writeback_centisecs=$WRITEBACK_SAVE + sysctl -w vm.dirty_background_ratio=$BG_DIRTY_RATIO_SAVE + sysctl -w vm.dirty_ratio=$DIRTY_RATIO_SAVE else # if file not here, we are a 2.4 kernel kill -CONT `pidof kupdated` @@ -1654,14 +1656,14 @@ stop_writeback() { trap start_writeback EXIT # in 2.6, save and 0 /proc/sys/vm/dirty_writeback_centisecs if [ 
-f /proc/sys/vm/dirty_writeback_centisecs ]; then - WRITEBACK_SAVE=`cat /proc/sys/vm/dirty_writeback_centisecs` - echo 0 > /proc/sys/vm/dirty_writeback_centisecs + WRITEBACK_SAVE=`sysctl -n vm.dirty_writeback_centisecs` + sysctl -w vm.dirty_writeback_centisecs=0 # save and increase /proc/sys/vm/dirty_ratio - DIRTY_RATIO_SAVE=`cat /proc/sys/vm/dirty_ratio` - echo $MAX_DIRTY_RATIO > /proc/sys/vm/dirty_ratio + DIRTY_RATIO_SAVE=`sysctl -n vm.dirty_ratio` + sysctl -w vm.dirty_ratio=$MAX_DIRTY_RATIO # save and increase /proc/sys/vm/dirty_background_ratio - BG_DIRTY_RATIO_SAVE=`cat /proc/sys/vm/dirty_background_ratio` - echo $MAX_BG_DIRTY_RATIO > /proc/sys/vm/dirty_background_ratio + BG_DIRTY_RATIO_SAVE=`sysctl -n vm.dirty_background_ratio` + sysctl -w vm.dirty_background_ratio=$MAX_BG_DIRTY_RATIO else # if file not here, we are a 2.4 kernel kill -STOP `pidof kupdated` @@ -1768,6 +1770,7 @@ test_42d() { run_test 42d "test complete truncate of file with cached dirty data" test_43() { + mkdir -p $DIR/$tdir cp -p /bin/ls $DIR/$tdir/$tfile multiop $DIR/$tdir/$tfile Ow_c & pid=$! 
@@ -1961,7 +1964,7 @@ run_test 48b "Access removed working dir (should return errors)=" test_48c() { # bug 2350 check_kernel_version 36 || return 0 - #sysctl -w lnet.debug=-1 + #lctl set_param debug=-1 #set -vx mkdir -p $DIR/d48c/dir cd $DIR/d48c/dir @@ -1984,7 +1987,7 @@ run_test 48c "Access removed working subdir (should return errors)" test_48d() { # bug 2350 check_kernel_version 36 || return 0 - #sysctl -w lnet.debug=-1 + #lctl set_param debug=-1 #set -vx mkdir -p $DIR/d48d/dir cd $DIR/d48d/dir @@ -2008,7 +2011,7 @@ run_test 48d "Access removed parent subdir (should return errors)" test_48e() { # bug 4134 check_kernel_version 41 || return 0 - #sysctl -w lnet.debug=-1 + #lctl set_param debug=-1 #set -vx mkdir -p $DIR/d48e/dir cd $DIR/d48e/dir @@ -2032,7 +2035,7 @@ test_50() { } run_test 50 "special situations: /proc symlinks ===============" -test_51() { +test_51a() { # was test_51 # bug 1516 - create an empty entry right after ".." then split dir mkdir $DIR/d51 touch $DIR/d51/foo @@ -2048,7 +2051,7 @@ test_51() { echo ls -l $DIR/d51 > /dev/null || error } -run_test 51 "special situations: split htree with empty entry ==" +run_test 51a "special situations: split htree with empty entry ==" export NUMTEST=70000 test_51b() { @@ -2145,11 +2148,11 @@ test_52c() { # 12848 simulating client < 1.4.7 touch $DIR/d52c/foo # skip MDS_BFLAG_EXT_FLAGS in mdc_getattr_pack #define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 - sysctl -w lustre.fail_loc=0x802 + lctl set_param fail_loc=0x802 chattr =i $DIR/d52c/foo || error lsattr $DIR/d52c/foo | egrep -q "^-+i-+ $DIR/d52c/foo" || error chattr -i $DIR/d52c/foo || error - sysctl -w lustre.fail_loc=0 + lctl set_param -n fail_loc=0 rm -fr $DIR/d52c || error } @@ -2266,7 +2269,7 @@ test_55() { } run_test 55 "check iopen_connect_dentry() ======================" -test_56() { +test_56a() { # was test_56 rm -rf $DIR/d56 $SETSTRIPE -d $DIR mkdir $DIR/d56 @@ -2316,7 +2319,7 @@ test_56() { error "lfs getstripe --obd wrong: should not show file on other 
obd" echo "lfs getstripe --obd passed." } -run_test 56 "check lfs getstripe ====================================" +run_test 56a "check lfs getstripe ====================================" NUMFILES=3 NUMDIRS=3 @@ -2464,6 +2467,52 @@ test_56o() { } run_test 56o "check lfs find -mtime for old files ==========================" +test_56p() { + [ $RUNAS_ID -eq $UID ] && skip "RUNAS_ID = UID = $UID -- skipping" && return + + TDIR=$DIR/${tdir}g + rm -rf $TDIR + + setup_56 $NUMFILES $NUMDIRS + + chown $RUNAS_ID $TDIR/file* || error "chown $TDIR/file* failed" + EXPECTED=$NUMFILES + NUMS="`$LFIND -uid $RUNAS_ID $TDIR | wc -l`" + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find -uid $TDIR wrong: found $NUMS, expected $EXPECTED" + + EXPECTED=$(( ($NUMFILES+1) * $NUMDIRS + 1)) + NUMS="`$LFIND ! -uid $RUNAS_ID $TDIR | wc -l`" + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find ! -uid $TDIR wrong: found $NUMS, expected $EXPECTED" + + echo "lfs find -uid and ! -uid passed." +} +run_test 56p "check lfs find -uid and ! -uid ===============================" + +test_56q() { + [ $RUNAS_ID -eq $UID ] && skip "RUNAS_ID = UID = $UID -- skipping" && return + + TDIR=$DIR/${tdir}g + rm -rf $TDIR + + setup_56 $NUMFILES $NUMDIRS + + chgrp $RUNAS_ID $TDIR/file* || error "chgrp $TDIR/file* failed" + EXPECTED=$NUMFILES + NUMS="`$LFIND -gid $RUNAS_ID $TDIR | wc -l`" + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find -gid $TDIR wrong: found $NUMS, expected $EXPECTED" + + EXPECTED=$(( ($NUMFILES+1) * $NUMDIRS + 1)) + NUMS="`$LFIND ! -gid $RUNAS_ID $TDIR | wc -l`" + [ $NUMS -eq $EXPECTED ] || \ + error "lfs find ! -gid $TDIR wrong: found $NUMS, expected $EXPECTED" + + echo "lfs find -gid and ! -gid passed." +} +run_test 56q "check lfs find -gid and !
-gid ===============================" + test_57a() { remote_mds && skip "remote MDS" && return local MNTDEV="mds.*.mntdev" @@ -2536,12 +2585,12 @@ test_59() { run_test 59 "verify cancellation of llog records async =========" TEST60_HEAD="test_60 run $RANDOM" -test_60() { +test_60a() { # was test_60 [ ! -f run-llog.sh ] && skip "missing subtest run-llog.sh" && return log "$TEST60_HEAD - from kernel mode" sh run-llog.sh } -run_test 60 "llog sanity tests run from kernel module ==========" +run_test 60a "llog sanity tests run from kernel module ==========" test_60b() { # bug 6411 dmesg > $DIR/$tfile @@ -2565,28 +2614,27 @@ test_60c() { echo "create 5000 files" createmany -o $DIR/f60c- 5000 #define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 - sysctl -w lustre.fail_loc=0x80000137 + lctl set_param fail_loc=0x80000137 unlinkmany $DIR/f60c- 5000 } run_test 60c "unlink file when mds full" test_60d() { - SAVEPRINTK=$(sysctl -n lnet.printk) + SAVEPRINTK=$(lctl get_param -n printk) # verify "lctl mark" is even working" MESSAGE="test message ID $RANDOM $$" $LCTL mark "$MESSAGE" || error "$LCTL mark failed" dmesg | grep -q "$MESSAGE" || error "didn't find debug marker in log" - sysctl -w lnet.printk=0 || error "set lnet.printk failed" - sysctl -n lnet.printk | grep emerg || error "lnet.printk dropped emerg" - + lctl set_param printk=0 || error "set lnet.printk failed" + lctl get_param -n printk | grep emerg || error "lnet.printk dropped emerg" MESSAGE="new test message ID $RANDOM $$" # Assume here that libcfs_debug_mark_buffer() uses D_WARNING $LCTL mark "$MESSAGE" || error "$LCTL mark failed" dmesg | grep -q "$MESSAGE" && error "D_WARNING wasn't masked" || true - sysctl -w lnet.printk="$SAVEPRINTK" + lctl set_param -n printk="$SAVEPRINTK" } run_test 60d "test printk console message masking" @@ -2605,16 +2653,16 @@ test_62() { echo foo > $f cancel_lru_locks osc #define OBD_FAIL_OSC_MATCH 0x405 - sysctl -w lustre.fail_loc=0x405 + lctl set_param fail_loc=0x405 cat $f && error "cat 
succeeded, expect -EIO" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 } # This test is now irrelevant (as of bug 10718 inclusion), we no longer # match every page all of the time. -#run_test 62 "verify obd_match failure doesn't LBUG (should -EIO)" +run_test 62 "verify obd_match failure doesn't LBUG (should -EIO)" # bug 2319 - oig_wait() interrupted causes crash because of invalid waitq. -test_63() { +test_63a() { # was test_63 MAX_DIRTY_MB=`lctl get_param -n osc.*.max_dirty_mb | head -n 1` lctl set_param -n osc.*.max_dirty_mb 0 for i in `seq 10` ; do @@ -2627,20 +2675,20 @@ test_63() { lctl set_param -n osc.*.max_dirty_mb $MAX_DIRTY_MB rm -f $DIR/f63 || true } -run_test 63 "Verify oig_wait interruption does not crash =======" +run_test 63a "Verify oig_wait interruption does not crash =======" # bug 2248 - async write errors didn't return to application on sync # bug 3677 - async write errors left page locked test_63b() { debugsave - sysctl -w lnet.debug=-1 + lctl set_param debug=-1 # ensure we have a grant to do async writes dd if=/dev/zero of=$DIR/$tfile bs=4k count=1 rm $DIR/$tfile #define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 - sysctl -w lustre.fail_loc=0x80000406 + lctl set_param fail_loc=0x80000406 multiop $DIR/$tfile Owy && \ error "sync didn't return ENOMEM" sync; sleep 2; sync # do a real sync this time to flush page @@ -2795,10 +2843,10 @@ test_66() { } run_test 66 "update inode blocks count on client ===============" -test_67() { # bug 3285 - supplementary group fails on MDS, passes on client +test_67a() { # was test_67 bug 3285 - supplementary group fails on MDS, passes on client [ "$RUNAS_ID" = "$UID" ] && skip "RUNAS_ID = UID = $UID -- skipping" && return check_kernel_version 35 || return 0 - mkdir $DIR/$tdir + mkdir -p $DIR/$tdir chmod 771 $DIR/$tdir chgrp $RUNAS_ID $DIR/$tdir $RUNAS -u $RUNAS_ID -g $(($RUNAS_ID + 1)) -G1,2,$RUNAS_ID ls $DIR/$tdir @@ -2809,7 +2857,7 @@ test_67() { # bug 3285 - supplementary group fails on MDS, passes on client [ 
"$GROUP_UPCALL" != "NONE" -a $RC -ne 0 ] && \ error "upcall failed" || true } -run_test 67 "supplementary group failure (should return error) =" +run_test 67a "supplementary group failure (should return error) =" cleanup_67b() { set +vx @@ -2909,21 +2957,21 @@ test_69() { $DIRECTIO write ${f}.2 0 1 || error "directio write error" #define OBD_FAIL_OST_ENOENT 0x217 - sysctl -w lustre.fail_loc=0x217 + lctl set_param fail_loc=0x217 truncate $f 1 # vmtruncate() will ignore truncate() error. $DIRECTIO write $f 0 2 && error "write succeeded, expect -ENOENT" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 $DIRECTIO write $f 0 2 || error "write error" cancel_lru_locks osc $DIRECTIO read $f 0 1 || error "read error" #define OBD_FAIL_OST_ENOENT 0x217 - sysctl -w lustre.fail_loc=0x217 + lctl set_param fail_loc=0x217 $DIRECTIO read $f 1 1 && error "read succeeded, expect -ENOENT" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 rm -f $f } run_test 69 "verify oa2dentry return -ENOENT doesn't LBUG ======" @@ -2983,10 +3031,10 @@ test_73() { pid1=$! #define OBD_FAIL_MDS_PAUSE_OPEN 0x129 - sysctl -w lustre.fail_loc=0x80000129 + lctl set_param fail_loc=0x80000129 multiop $DIR/d73-1/f73-2 Oc & sleep 1 - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 multiop $DIR/d73-2/f73-3 Oc & pid3=$! @@ -3010,10 +3058,10 @@ test_74a() { # bug 6149, 6184 # very important to OR with OBD_FAIL_ONCE (0x80000000) -- otherwise it # will spin in a tight reconnection loop touch $DIR/f74a - sysctl -w lustre.fail_loc=0x8000030e + lctl set_param fail_loc=0x8000030e # get any lock that won't be difficult - lookup works. 
ls $DIR/f74a - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 true } run_test 74a "ldlm_enqueue freed-export error path, ls (shouldn't LBUG)" @@ -3023,10 +3071,10 @@ test_74b() { # bug 13310 # # very important to OR with OBD_FAIL_ONCE (0x80000000) -- otherwise it # will spin in a tight reconnection loop - sysctl -w lustre.fail_loc=0x8000030e + lctl set_param fail_loc=0x8000030e # get a "difficult" lock touch $DIR/f74b - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 true } run_test 74b "ldlm_enqueue freed-export error path, touch (shouldn't LBUG)" @@ -3179,7 +3227,7 @@ set_checksum_type() [ "$ORIG_CSUM_TYPE" ] || \ ORIG_CSUM_TYPE=`lctl get_param -n osc.*osc-[^mM]*.checksum_type | sed 's/.*\[\(.*\)\].*/\1/g' \ | head -n1` - lctl set_param -n osc.*osc-*.checksum_type $1 + lctl set_param -n osc.*osc-[^mM]*.checksum_type $1 log "set checksum type to $1" return 0 } @@ -3201,11 +3249,11 @@ run_test 77a "normal checksum read/write operation =============" test_77b() { # bug 10889 [ ! -f $F77_TMP ] && setup_f77 #define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 - sysctl -w lustre.fail_loc=0x80000409 + lctl set_param fail_loc=0x80000409 set_checksums 1 dd if=$F77_TMP of=$DIR/f77b bs=1M count=$F77SZ conv=sync || \ error "dd error: $?" 
- sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 set_checksums 0 } run_test 77b "checksum error on client write ====================" @@ -3217,9 +3265,9 @@ test_77c() { # bug 10889 cancel_lru_locks osc set_checksum_type $algo #define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 - sysctl -w lustre.fail_loc=0x80000408 + lctl set_param fail_loc=0x80000408 cmp $F77_TMP $DIR/f77b || error "file compare failed" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 done set_checksums 0 set_checksum_type $ORIG_CSUM_TYPE @@ -3228,11 +3276,11 @@ run_test 77c "checksum error on client read ===================" test_77d() { # bug 10889 #define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 - sysctl -w lustre.fail_loc=0x80000409 + lctl set_param fail_loc=0x80000409 set_checksums 1 directio write $DIR/f77 0 $F77SZ $((1024 * 1024)) || \ error "direct write: rc=$?" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 set_checksums 0 } run_test 77d "checksum error on OST direct write ===============" @@ -3240,12 +3288,12 @@ run_test 77d "checksum error on OST direct write ===============" test_77e() { # bug 10889 [ ! -f $DIR/f77 ] && skip "requires 77d - skipping" && return #define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 - sysctl -w lustre.fail_loc=0x80000408 + lctl set_param fail_loc=0x80000408 set_checksums 1 cancel_lru_locks osc directio read $DIR/f77 0 $F77SZ $((1024 * 1024)) || \ error "direct read: rc=$?" 
- sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 set_checksums 0 } run_test 77e "checksum error on OST direct read ================" @@ -3256,10 +3304,10 @@ test_77f() { # bug 10889 cancel_lru_locks osc set_checksum_type $algo #define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 - sysctl -w lustre.fail_loc=0x409 + lctl set_param fail_loc=0x409 directio write $DIR/f77 0 $F77SZ $((1024 * 1024)) && \ error "direct write succeeded" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 done set_checksum_type $ORIG_CSUM_TYPE set_checksums 0 @@ -3271,11 +3319,11 @@ test_77g() { # bug 10889 skip "remote OST" && return [ ! -f $F77_TMP ] && setup_f77 #define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a - sysctl -w lustre.fail_loc=0x8000021a + lctl set_param fail_loc=0x8000021a set_checksums 1 dd if=$F77_TMP of=$DIR/f77g bs=1M count=$F77SZ || \ error "write error: rc=$?" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 set_checksums 0 } run_test 77g "checksum error on OST write ======================" @@ -3286,19 +3334,19 @@ test_77h() { # bug 10889 [ ! 
-f $DIR/f77g ] && skip "requires 77g - skipping" && return cancel_lru_locks osc #define OBD_FAIL_OST_CHECKSUM_SEND 0x21b - sysctl -w lustre.fail_loc=0x8000021b + lctl set_param fail_loc=0x8000021b set_checksums 1 cmp $F77_TMP $DIR/f77g || error "file compare failed" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 set_checksums 0 } run_test 77h "checksum error on OST read =======================" test_77i() { # bug 13805 #define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b - sysctl -w lustre.fail_loc=0x40b + lctl set_param fail_loc=0x40b remount_client $MOUNT - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 for VALUE in `lctl get_param osc.*osc-[^mM]*.checksum_type`; do param=`echo ${VALUE[0]} | cut -d "=" -f1` algo=`lctl get_param -n $param | sed 's/.*\[\(.*\)\].*/\1/g'` @@ -3310,9 +3358,9 @@ run_test 77i "client not supporting OSD_CONNECT_CKSUM ==========" test_77j() { # bug 13805 #define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c - sysctl -w lustre.fail_loc=0x40c + lctl set_param fail_loc=0x40c remount_client $MOUNT - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 for VALUE in `lctl get_param osc.*osc-[^mM]*.checksum_type`; do param=`echo ${VALUE[0]} | cut -d "=" -f1` algo=`lctl get_param -n $param | sed 's/.*\[\(.*\)\].*/\1/g'` @@ -3528,7 +3576,7 @@ test_101() { # randomly read 10000 of 64K chunks from file 3x 32MB in size # echo "nreads: $nreads file size: $((cache_limit * 3))MB" - $RANDOM_READS -f $DIR/$tfile -s$((cache_limit * 3192 * 1024)) -b65536 -C -n$nreads -t 180 + $READS -f $DIR/$tfile -s$((cache_limit * 3192 * 1024)) -b65536 -C -n$nreads -t 180 discard=0 for s in `lctl get_param -n llite.*.read_ahead_stats | get_named_value 'read but discarded'`; do @@ -3545,6 +3593,79 @@ test_101() { } run_test 101 "check read-ahead for random reads ================" +export SETUP_TEST101=no +setup_test101() { + [ "$SETUP_TEST101" = "yes" ] && return + mkdir -p $DIR/$tdir + STRIPE_SIZE=1048576 + STRIPE_COUNT=$OSTCOUNT + STRIPE_OFFSET=0 + + trap 
cleanup_test101 EXIT + # prepare the read-ahead file + $SETSTRIPE $DIR/$tfile -s $STRIPE_SIZE -i $STRIPE_OFFSET -c $OSTCOUNT + + dd if=/dev/zero of=$DIR/$tfile bs=1024k count=100 2> /dev/null + SETUP_TEST101=yes # flag tested by the guards in setup_test101/cleanup_test101 +} + +cleanup_test101() { + [ "$SETUP_TEST101" = "yes" ] || return + trap 0 + rm -rf $DIR/$tdir + SETUP_TEST101=no # allow a later setup_test101 to run again +} + +calc_total() { + awk 'BEGIN{total=0}; {total+=$1}; END{print total}' +} + +ra_check_101() { + local READ_SIZE=$1 + local STRIPE_SIZE=1048576 + local RA_INC=1048576 + local STRIDE_LENGTH=$((STRIPE_SIZE/READ_SIZE)) + local FILE_LENGTH=$((64*100)) + local discard_limit=$(((((((STRIDE_LENGTH - 1))*3)/(STRIDE_LENGTH*OSTCOUNT))* \ + (STRIDE_LENGTH*OSTCOUNT - STRIDE_LENGTH)))) + + DISCARD=`$LCTL get_param -n llite.*.read_ahead_stats | \ + get_named_value 'read but discarded' | calc_total` + + if [ $DISCARD -gt $discard_limit ]; then + lctl get_param llite.*.read_ahead_stats + error "Too many ($DISCARD) discarded pages with size (${READ_SIZE})" + else + echo "Read-ahead success for size ${READ_SIZE}" + fi +} + +test_101b() { + [ "$OSTCOUNT" -lt "2" ] && skip "skipping stride IO stride-ahead test" && return + local STRIPE_SIZE=1048576 + local STRIDE_SIZE=$((STRIPE_SIZE*OSTCOUNT)) + local FILE_LENGTH=$((STRIPE_SIZE*100)) + local ITERATION=$((FILE_LENGTH/STRIDE_SIZE)) + # prepare the read-ahead file + setup_test101 + cancel_lru_locks osc + for BIDX in 2 4 8 16 32 64 128 256 + do + local BSIZE=$((BIDX*4096)) + local READ_COUNT=$((STRIPE_SIZE/BSIZE)) + local STRIDE_LENGTH=$((STRIDE_SIZE/BSIZE)) + local OFFSET=$((STRIPE_SIZE/BSIZE*(OSTCOUNT - 1))) + $LCTL set_param -n llite.*.read_ahead_stats 0 + $READS -f $DIR/$tfile -l $STRIDE_LENGTH -o $OFFSET \ + -s $FILE_LENGTH -b $STRIPE_SIZE -a $READ_COUNT -n $ITERATION + cancel_lru_locks osc + ra_check_101 $BSIZE + done + cleanup_test101 + true +} +run_test 101b "check stride-io mode read-ahead =================" + export SETUP_TEST102=no setup_test102() { [ "$SETUP_TEST102" = "yes" ] && return @@ -3812,6
+3933,50 @@ test_102g() { } run_test 102g "star copy files, keep osts ===========" +test_102h() { # bug 15777 + [ -z "$(lctl get_param -n mdc.*.connect_flags | grep xattr)" ] && + skip "must have user_xattr" && return + [ -z "$(which setfattr 2>/dev/null)" ] && + skip "could not find setfattr" && return + + XBIG=trusted.big + XSIZE=1024 + touch $DIR/$tfile + VALUE=datadatadatadatadatadatadatadata + while [ $(echo $VALUE | wc -c) -lt $XSIZE ]; do + VALUE="$VALUE$VALUE" + done + log "save $XBIG on $DIR/$tfile" + setfattr -n $XBIG -v "$VALUE" $DIR/$tfile || + error "saving $XBIG on $DIR/$tfile failed" + ORIG=$(getfattr -n $XBIG $DIR/$tfile 2> /dev/null | grep $XBIG) + OSIZE=$(echo $ORIG | wc -c) + [ $OSIZE -lt $XSIZE ] && error "set $XBIG too small ($OSIZE < $XSIZE)" + + XSML=trusted.sml + log "save $XSML on $DIR/$tfile" + setfattr -n $XSML -v val $DIR/$tfile || + error "saving $XSML on $DIR/$tfile failed" + NEW=$(getfattr -n $XBIG $DIR/$tfile 2> /dev/null | grep $XBIG) + if [ "$NEW" != "$ORIG" ]; then + log "orig: $ORIG" + log "new: $NEW" + error "$XBIG different after saving $XSML" + fi + + log "grow $XSML on $DIR/$tfile" + setfattr -n $XSML -v "$VALUE" $DIR/$tfile || + error "growing $XSML on $DIR/$tfile failed" + NEW=$(getfattr -n $XBIG $DIR/$tfile 2> /dev/null | grep $XBIG) + if [ "$NEW" != "$ORIG" ]; then + log "orig: $ORIG" + log "new: $NEW" + error "$XBIG different after growing $XSML" + fi + log "$XBIG still valid after growing $XSML" +} +run_test 102h "grow xattr from inside inode to external block" + run_acl_subtest() { $LUSTRE/tests/acl/run $LUSTRE/tests/acl/$1.test @@ -3903,6 +4068,7 @@ test_105c() { run_test 105c "lockf when mounted without -o flock test ========" test_106() { #bug 10921 + mkdir -p $DIR/$tdir $DIR/$tdir && error "exec $DIR/$tdir succeeded" chmod 777 $DIR/$tdir || error "chmod $DIR/$tdir failed" } @@ -3915,8 +4081,8 @@ test_107() { sleep 60 & SLEEPPID=$!
- file=`cat /proc/sys/kernel/core_pattern` - core_pid=`cat /proc/sys/kernel/core_uses_pid` + file=`sysctl -n kernel.core_pattern` + core_pid=`sysctl -n kernel.core_uses_pid` [ $core_pid -eq 1 ] && file=$file.$SLEEPPID rm -f $file sleep 1 @@ -4067,9 +4233,9 @@ test_117() # bug 10891 { dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 #define OBD_FAIL_OST_SETATTR_CREDITS 0x21e - sysctl -w lustre.fail_loc=0x21e + lctl set_param fail_loc=0x21e > $DIR/$tfile || error "truncate failed" - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 echo "Truncate succeeded." } run_test 117 "verify fsfilt_extend =============================" @@ -4100,8 +4266,8 @@ test_118a() #bug 11710 reset_async multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c - DIRTY=$(lctl get_param -n llite.*.dump_page_cache | grep -c dirty) - WRITEBACK=$(lctl get_param llite.*.dump_page_cache | grep -c writeback) + DIRTY=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c dirty) + WRITEBACK=$(lctl get_param "llite.*.dump_page_cache" | grep -c writeback) if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" @@ -4117,10 +4283,10 @@ test_118b() reset_async #define OBD_FAIL_OST_ENOENT 0x217 - do_facet ost sysctl -w lustre.fail_loc=0x217 + set_nodes_failloc "$(osts_nodes)" 0x217 multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c RC=$? 
- do_facet ost sysctl -w lustre.fail_loc=0 + set_nodes_failloc "$(osts_nodes)" 0 DIRTY=$(lctl get_param llite.*.dump_page_cache | grep -c dirty) WRITEBACK=$(lctl get_param -n llite.*.dump_page_cache | grep -c writeback) @@ -4152,7 +4318,7 @@ test_118c() reset_async #define OBD_FAIL_OST_EROFS 0x216 - do_facet ost sysctl -w lustre.fail_loc=0x216 + set_nodes_failloc "$(osts_nodes)" 0x216 # multiop should block due to fsync until pages are written multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & @@ -4168,7 +4334,7 @@ test_118c() error "No page in writeback, writeback=$WRITEBACK" fi - do_facet ost sysctl -w lustre.fail_loc=0 + set_nodes_failloc "$(osts_nodes)" 0 wait $MULTIPID RC=$? if [[ $RC -ne 0 ]]; then @@ -4194,7 +4360,7 @@ test_118d() reset_async #define OBD_FAIL_OST_BRW_PAUSE_BULK - do_facet ost sysctl -w lustre.fail_loc=0x214 + set_nodes_failloc "$(osts_nodes)" 0x214 # multiop should block due to fsync until pages are written multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & MULTIPID=$! @@ -4210,7 +4376,7 @@ test_118d() fi wait $MULTIPID || error "Multiop fsync failed, rc=$?" 
- do_facet ost sysctl -w lustre.fail_loc=0 + set_nodes_failloc "$(osts_nodes)" 0 DIRTY=$(lctl get_param -n llite.*.dump_page_cache | grep -c dirty) WRITEBACK=$(lctl get_param -n llite.*.dump_page_cache | grep -c writeback) @@ -4228,7 +4394,7 @@ test_118f() { reset_async #define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a - sysctl -w lustre.fail_loc=0x8000040a + lctl set_param fail_loc=0x8000040a # Should simulate EINVAL error which is fatal multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c @@ -4237,11 +4403,11 @@ test_118f() { error "Must return error due to dropped pages, rc=$RC" fi - sysctl -w lustre.fail_loc=0x0 + lctl set_param fail_loc=0x0 - LOCKED=$(lctl get_param -n llite.*.dump_page_cache | grep -c locked) - DIRTY=$(lctl get_param -n llite.*.dump_page_cache | grep -c dirty) - WRITEBACK=$(lctl get_param -n llite.*.dump_page_cache | grep -c writeback) + LOCKED=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c locked) + DIRTY=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c dirty) + WRITEBACK=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c writeback) if [[ $LOCKED -ne 0 ]]; then error "Locked pages remain in cache, locked=$LOCKED" fi @@ -4262,20 +4428,20 @@ test_118g() { reset_async #define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 - sysctl -w lustre.fail_loc=0x406 + lctl set_param fail_loc=0x406 # simulate local -ENOMEM multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c RC=$? 
- sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 if [[ $RC -eq 0 ]]; then error "Must return error due to dropped pages, rc=$RC" fi - LOCKED=$(lctl get_param -n llite.*.dump_page_cache | grep -c locked) - DIRTY=$(lctl get_param -n llite.*.dump_page_cache | grep -c dirty) - WRITEBACK=$(lctl get_param -n llite.*.dump_page_cache | grep -c writeback) + LOCKED=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c locked) + DIRTY=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c dirty) + WRITEBACK=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c writeback) if [[ $LOCKED -ne 0 ]]; then error "Locked pages remain in cache, locked=$LOCKED" fi @@ -4287,7 +4453,7 @@ test_118g() { rm -f $DIR/$tfile echo "No pages locked after fsync" - reset_async + reset_async return 0 } run_test 118g "Don't stay in wait if we got local -ENOMEM ==========" @@ -4298,12 +4464,12 @@ test_118h() { reset_async #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e - do_facet ost sysctl -w lustre.fail_loc=0x20e + set_nodes_failloc "$(osts_nodes)" 0x20e # Should simulate ENOMEM error which is recoverable and should be handled by timeout multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c RC=$? - do_facet ost sysctl -w lustre.fail_loc=0 + set_nodes_failloc "$(osts_nodes)" 0 if [[ $RC -eq 0 ]]; then error "Must return error due to dropped pages, rc=$RC" fi @@ -4332,13 +4498,13 @@ test_118i() { reset_async #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e - do_facet ost sysctl -w lustre.fail_loc=0x20e + set_nodes_failloc "$(osts_nodes)" 0x20e # Should simulate ENOMEM error which is recoverable and should be handled by timeout multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & PID=$! sleep 5 - do_facet ost sysctl -w lustre.fail_loc=0 + set_nodes_failloc "$(osts_nodes)" 0 wait $PID RC=$? 
@@ -4346,9 +4512,9 @@ test_118i() { error "got error, but should be not, rc=$RC" fi - LOCKED=$(lctl get_param -n llite.*.dump_page_cache | grep -c locked) - DIRTY=$(lctl get_param -n llite.*.dump_page_cache | grep -c dirty) - WRITEBACK=$(lctl get_param -n llite.*.dump_page_cache | grep -c writeback) + LOCKED=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c locked) + DIRTY=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c dirty) + WRITEBACK=$(lctl get_param -n "llite.*.dump_page_cache" | grep -c writeback) if [[ $LOCKED -ne 0 ]]; then error "Locked pages remain in cache, locked=$LOCKED" fi @@ -4370,12 +4536,12 @@ test_118j() { reset_async #define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 - do_facet ost sysctl -w lustre.fail_loc=0x220 + set_nodes_failloc "$(osts_nodes)" 0x220 # return -EIO from OST multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c RC=$? - do_facet ost sysctl -w lustre.fail_loc=0x0 + set_nodes_failloc "$(osts_nodes)" 0x0 if [[ $RC -eq 0 ]]; then error "Must return error due to dropped pages, rc=$RC" fi @@ -4402,7 +4568,7 @@ run_test 118j "Simulate unrecoverable OST side error ==========" test_118k() { #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e - do_facet ost sysctl -w lustre.fail_loc=0x20e + set_nodes_failloc "$(osts_nodes)" 0x20e mkdir -p $DIR/$tdir for ((i=0;i<10;i++)); do @@ -4414,7 +4580,7 @@ test_118k() wait $SLEEPPID done - sysctl -w lustre.fail_loc=0 + set_nodes_failloc "$(osts_nodes)" 0 } run_test 118k "bio alloc -ENOMEM and IO TERM handling =========" @@ -4458,7 +4624,33 @@ test_119c() # bug 13099 } run_test 119c "Testing for direct read hitting hole" +test_119d() # bug 15950 +{ + MAX_RPCS_IN_FLIGHT=`$LCTL get_param -n osc.*OST0000-osc-[^mM]*.max_rpcs_in_flight` + $LCTL set_param -n osc.*OST0000-osc-[^mM]*.max_rpcs_in_flight 1 + BSIZE=1048576 + $SETSTRIPE $DIR/$tfile -i 0 -c 1 || error "setstripe failed" + $DIRECTIO write $DIR/$tfile 0 1 $BSIZE || error "first directio failed" + #define OBD_FAIL_OSC_DIO_PAUSE 0x40d + lctl set_param 
fail_loc=0x40d + $DIRECTIO write $DIR/$tfile 1 4 $BSIZE & + pid_dio=$! + sleep 1 + cat $DIR/$tfile > /dev/null & + lctl set_param fail_loc=0 + pid_reads=$! + wait $pid_dio + log "the DIO writes have completed, now wait for the reads (should not block very long)" + sleep 2 + [ -n "`ps h -p $pid_reads -o comm`" ] && \ + error "the read rpcs have not completed in 2s" + rm -f $DIR/$tfile + $LCTL set_param -n osc.*OST0000-osc-[^mM]*.max_rpcs_in_flight $MAX_RPCS_IN_FLIGHT +} +run_test 119d "The DIO path should try to send a new rpc once one is completed" + test_120a() { + mkdir -p $DIR/$tdir [ -z "`lctl get_param -n mdc.*.connect_flags | grep early_lock_cancel`" ] && \ skip "no early lock cancel on server" && return 0 lru_resize_disable mdc @@ -4478,6 +4670,7 @@ test_120a() { run_test 120a "Early Lock Cancel: mkdir test ===================" test_120b() { + mkdir -p $DIR/$tdir [ -z "`lctl get_param -n mdc.*.connect_flags | grep early_lock_cancel`" ] && \ skip "no early lock cancel on server" && return 0 lru_resize_disable mdc @@ -4497,6 +4690,7 @@ test_120b() { run_test 120b "Early Lock Cancel: create test ==================" test_120c() { + mkdir -p $DIR/$tdir [ -z "`lctl get_param -n mdc.*.connect_flags | grep early_lock_cancel`" ] && \ skip "no early lock cancel on server" && return 0 lru_resize_disable mdc @@ -4518,6 +4712,7 @@ test_120c() { run_test 120c "Early Lock Cancel: link test ====================" test_120d() { + mkdir -p $DIR/$tdir [ -z "`lctl get_param -n mdc.*.connect_flags | grep early_lock_cancel`" ] && \ skip "no early lock cancel on server" && return 0 lru_resize_disable mdc @@ -4538,6 +4733,7 @@ test_120d() { run_test 120d "Early Lock Cancel: setattr test =================" test_120e() { + mkdir -p $DIR/$tdir [ -z "`lctl get_param -n mdc.*.connect_flags | grep early_lock_cancel`" ] && \ skip "no early lock cancel on server" && return 0 lru_resize_disable mdc @@ -4562,6 +4758,7 @@ run_test 120e "Early Lock Cancel: unlink test ==================" 
test_120f() { [ -z "`lctl get_param -n mdc.*.connect_flags | grep early_lock_cancel`" ] && \ skip "no early lock cancel on server" && return 0 + mkdir -p $DIR/$tdir lru_resize_disable mdc lru_resize_disable osc mkdir -p $DIR/$tdir/d1 $DIR/$tdir/d2 @@ -4621,20 +4818,20 @@ run_test 120g "Early Lock Cancel: performance test =============" test_121() { #bug 10589 writes=$(LANG=C dd if=/dev/zero of=$DIR/$tfile count=1 2>&1 | awk -F '+' '/out/ {print $1}') - sysctl -w lustre.fail_loc=0x310 + lctl set_param fail_loc=0x310 cancel_lru_locks osc > /dev/null reads=$(LANG=C dd if=$DIR/$tfile of=/dev/null 2>&1 | awk -F '+' '/in/ {print $1}') - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 [ "$reads" -eq "$writes" ] || error "read" $reads "blocks, must be" $writes } run_test 121 "read cancel race =================================" test_122() { #bug 11544 #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 - sysctl -w lustre.fail_loc=0x508 + lctl set_param fail_loc=0x508 dd if=/dev/zero of=$DIR/$tfile count=1 sync - sysctl -w lustre.fail_loc=0 + lctl set_param fail_loc=0 } run_test 122 "fail client bulk callback (shouldn't LBUG) =======" @@ -4643,6 +4840,7 @@ test_123a() { # was test 123, statahead(bug 11401) log "testing on UP system. Performance may be not as good as expected." 
fi + remount_client $MOUNT mkdir -p $DIR/$tdir error=0 NUMFREE=`df -i -P $DIR | tail -n 1 | awk '{ print $4 }'` @@ -4707,10 +4905,10 @@ test_123b () { # statahead(bug 15027) cancel_lru_locks osc #define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 - sysctl -w lustre.fail_loc=0x80000803 + lctl set_param fail_loc=0x80000803 ls -lR $DIR/$tdir > /dev/null log "ls done" - sysctl -w lustre.fail_loc=0x0 + lctl set_param fail_loc=0x0 lctl get_param -n llite.*.statahead_stats rm -r $DIR/$tdir sync @@ -4875,6 +5073,89 @@ test_126() { # bug 12829/13455 } run_test 126 "check that the fsgid provided by the client is taken into account" +test_127() { # bug 15521 + $LSTRIPE -i 0 -c 1 $DIR/$tfile + $LCTL set_param osc.*.stats=0 + FSIZE=$((2048 * 1024)) + dd if=/dev/zero of=$DIR/$tfile bs=$FSIZE count=1 + cancel_lru_locks osc + dd if=$DIR/$tfile of=/dev/null bs=$FSIZE + + $LCTL get_param osc.*0000-osc-*.stats | grep samples > $DIR/${tfile}.tmp + while read NAME COUNT SAMP UNIT MIN MAX SUM SUMSQ; do + eval $NAME=$COUNT + echo "got $COUNT $NAME" + + case $NAME in + ost_read|ost_write) + [ $MIN -lt 4096 ] && error "min is too small: $MIN" + [ $MIN -gt $FSIZE ] && error "min is too big: $MIN" + [ $MAX -lt 4096 ] && error "max is too small: $MAX" + [ $MAX -gt $FSIZE ] && error "max is too big: $MAX" + [ $SUM -ne $FSIZE ] && error "sum is wrong: $SUM" + [ $SUMSQ -lt $(((FSIZE /4096) * (4096 * 4096))) ] && + error "sumsquare is too small: $SUMSQ" + [ $SUMSQ -gt $((FSIZE * FSIZE)) ] && + error "sumsquare is too big: $SUMSQ" + ;; + *) ;; + esac + done < $DIR/${tfile}.tmp + + #check that we actually got some stats + [ "$ost_read" ] || error "no read done" + [ "$ost_write" ] || error "no write done" +} +run_test 127 "verify the client stats are sane" + +test_128() { # bug 15212 + touch $DIR/$tfile + $LFS 2>&1 <<-EOF | tee $TMP/$tfile.log + find $DIR/$tfile + find $DIR/$tfile + EOF + + result=$(grep error $TMP/$tfile.log) + rm -f $DIR/$tfile + [ -z "$result" ] || error "consecutive find's under 
interactive lfs failed" +} +run_test 128 "interactive lfs for 2 consecutive find's" + +test_129() { + [ "$FSTYPE" != "ldiskfs" ] && skip "not needed for FSTYPE=$FSTYPE" && return 0 + + DEV=$(basename $(do_facet mds lctl get_param -n mds.*.mntdev)) + [ -z "$DEV" ] && error "can't access mds mntdev" + EFBIG=27 + LDPROC=/proc/fs/ldiskfs/$DEV/max_dir_size + MAX=16384 + + do_facet mds "echo $MAX > $LDPROC" + + mkdir -p $DIR/$tdir + + I=0 + J=0 + while [ ! $I -gt $MAX ]; do + multiop $DIR/$tdir/$J Oc + rc=$? + if [ $rc -eq $EFBIG ]; then + do_facet mds "echo 0 >$LDPROC" + echo "return code $rc received as expected" + return 0 + elif [ $rc -ne 0 ]; then + do_facet mds "echo 0 >$LDPROC" + error_exit "return code $rc received instead of expected $EFBIG" + fi + J=$((J+1)) + I=$(stat -c%s "$DIR/$tdir") + done + + do_facet mds "echo 0 >$LDPROC" # reset the limit before reporting, as the other failure path does + error "exceeded dir size limit: $I bytes" +} +run_test 129 "test directory size limit ========================" + TMPDIR=$OLDTMPDIR TMP=$OLDTMP HOME=$OLDHOME @@ -4882,7 +5163,7 @@ HOME=$OLDHOME log "cleanup: ======================================================" check_and_cleanup_lustre if [ "$I_MOUNTED" != "yes" ]; then - sysctl -w lnet.debug="$OLDDEBUG" 2> /dev/null || true + lctl set_param debug="$OLDDEBUG" 2> /dev/null || true fi echo '=========================== finished ===============================' diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index dbcb0c4e7ffd8ccefcb7f90b28dd4d83a6052903..16c1e14839c872c895fffdcc4a59474f8bb9dfd8 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -3,8 +3,8 @@ set -e ONLY=${ONLY:-"$*"} -# bug number for skipped test: 3192 12652 9977 -ALWAYS_EXCEPT=" 14b 14c 28 $SANITYN_EXCEPT" +# bug number for skipped test: 3192 12652 15528/3811 9977 15528/11549 +ALWAYS_EXCEPT=" 14b 14c 19 28 29 $SANITYN_EXCEPT" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
# bug number for skipped test: 12652 12652 @@ -60,6 +60,7 @@ LPROC=/proc/fs/lustre LOVNAME=`lctl get_param -n llite.*.lov.common_name | tail -n 1` OSTCOUNT=`lctl get_param -n lov.$LOVNAME.numobd` +assert_DIR rm -rf $DIR1/[df][0-9]* $DIR1/lnk # $RUNAS_ID may get set incorrectly somewhere else @@ -158,11 +159,9 @@ test_6() { run_test 6 "remove of open file on other node ==================" test_7() { - # run_one creates uniq $tdir (bug 13798) - # opendirunlink failes if it exists - rmdir $DIR1/$tdir || true - opendirunlink $DIR1/$tdir $DIR2/$tdir || \ - error "opendirunlink $DIR1/$tdir $DIR2/$tdir" + local dir=d7 + opendirunlink $DIR1/$dir $DIR2/$dir || \ + error "opendirunlink $DIR1/$dir $DIR2/$dir" } run_test 7 "remove of open directory on other node =============" @@ -174,10 +173,11 @@ run_test 8 "remove of open special file on other node ==========" test_9() { MTPT=1 + local dir > $DIR2/f9 for C in a b c d e f g h i j k l; do - DIR=`eval echo \\$DIR$MTPT` - echo -n $C >> $DIR/f9 + dir=`eval echo \\$DIR$MTPT` + echo -n $C >> $dir/f9 [ "$MTPT" -eq 1 ] && MTPT=2 || MTPT=1 done [ "`cat $DIR1/f9`" = "abcdefghijkl" ] || \ @@ -187,11 +187,12 @@ run_test 9 "append of file with sub-page size on multiple mounts" test_10a() { MTPT=1 + local dir OFFSET=0 > $DIR2/f10 for C in a b c d e f g h i j k l; do - DIR=`eval echo \\$DIR$MTPT` - echo -n $C | dd of=$DIR/f10 bs=1 seek=$OFFSET count=1 + dir=`eval echo \\$DIR$MTPT` + echo -n $C | dd of=$dir/f10 bs=1 seek=$OFFSET count=1 [ "$MTPT" -eq 1 ] && MTPT=2 || MTPT=1 OFFSET=`expr $OFFSET + 1` done @@ -201,17 +202,19 @@ test_10a() { run_test 10a "write of file with sub-page size on multiple mounts " test_10b() { - yes "R" | dd of=$DIR1/f10b bs=3k count=1 || error "dd $DIR1" + # create a seed file + yes "R" | head -c 4000 >$TMP/f10b-seed + dd if=$TMP/f10b-seed of=$DIR1/f10b bs=3k count=1 || error "dd $DIR1" truncate $DIR1/f10b 4096 || error "truncate 4096" dd if=$DIR2/f10b of=$TMP/f10b-lustre bs=4k count=1 || error "dd $DIR2" # create 
a test file locally to compare - yes "R" | dd of=$TMP/f10b bs=3k count=1 || error "dd random" + dd if=$TMP/f10b-seed of=$TMP/f10b bs=3k count=1 || error "dd random" truncate $TMP/f10b 4096 || error "truncate 4096" cmp $TMP/f10b $TMP/f10b-lustre || error "file miscompare" - rm $TMP/f10b $TMP/f10b-lustre + rm $TMP/f10b $TMP/f10b-lustre $TMP/f10b-seed } run_test 10b "write of file with sub-page size on multiple mounts " @@ -333,7 +336,7 @@ test_17() { # bug 3513, 3667 cp /etc/termcap $DIR1/f17 cancel_lru_locks osc > /dev/null #define OBD_FAIL_ONCE|OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a - sysctl -w lustre.fail_loc=0x8000030a + lctl set_param fail_loc=0x8000030a ls -ls $DIR1/f17 | awk '{ print $1,$6 }' > $DIR1/f17-1 & \ ls -ls $DIR2/f17 | awk '{ print $1,$6 }' > $DIR2/f17-2 wait @@ -373,7 +376,7 @@ test_19() { # bug3811 done rm $DIR1/f19b } -#run_test 19 "test concurrent uncached read races ===============" +run_test 19 "test concurrent uncached read races ===============" test_20() { mkdir $DIR1/d20 @@ -473,6 +476,7 @@ test_25() { [ `lctl get_param -n mdc.*-mdc-*.connect_flags | grep -c acl` -lt 2 ] && \ skip "must have acl, skipping" && return + mkdir -p $DIR1/$tdir touch $DIR1/$tdir/f1 || error "touch $DIR1/$tdir/f1" chmod 0755 $DIR1/$tdir/f1 || error "chmod 0755 $DIR1/$tdir/f1" @@ -557,16 +561,17 @@ run_test 28 "read/write/truncate file with lost stripes" test_29() { # bug 10999 touch $DIR1/$tfile #define OBD_FAIL_LDLM_GLIMPSE 0x30f - sysctl -w lustre.fail_loc=0x8000030f + lctl set_param fail_loc=0x8000030f ls -l $DIR2/$tfile & sleep 0.500s dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 wait } #bug 11549 - permanently turn test off in b1_5 -#run_test 29 "lock put race between glimpse and enqueue =========" +run_test 29 "lock put race between glimpse and enqueue =========" test_30() { #bug #11110 + mkdir -p $DIR1/$tdir cp -f /bin/bash $DIR1/$tdir/bash /bin/sh -c 'sleep 1; rm -f $DIR2/$tdir/bash; cp /bin/bash $DIR2/$tdir' & err=$($DIR1/$tdir/bash -c 'sleep 2; openfile -f 
O_RDONLY /proc/$$/exe >& /dev/null; echo $?') @@ -582,13 +587,92 @@ test_31() { writes=`LANG=C dd if=/dev/zero of=$DIR/$tdir/$tfile count=1 2>&1 | awk 'BEGIN { FS="+" } /out/ {print $1}'` #define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 - sysctl -w lustre.fail_loc=0x314 + lctl set_param fail_loc=0x314 reads=`LANG=C dd if=$DIR2/$tdir/$tfile of=/dev/null 2>&1 | awk 'BEGIN { FS="+" } /in/ {print $1}'` [ $reads -eq $writes ] || error "read" $reads "blocks, must be" $writes } run_test 31 "voluntary cancel / blocking ast race==============" +# enable/disable lockless truncate feature, depending on the arg 0/1 +enable_lockless_truncate() { + lctl set_param -n llite.*.lockless_truncate $1 +} + +test_32a() { # bug 11270 + local p="$TMP/sanityN-$TESTNAME.parameters" + save_lustre_params $HOSTNAME llite.*.lockless_truncate > $p + cancel_lru_locks osc + clear_llite_stats + enable_lockless_truncate 1 + dd if=/dev/zero of=$DIR1/$tfile count=10 bs=1M > /dev/null 2>&1 + + log "checking cached lockless truncate" + $TRUNCATE $DIR1/$tfile 8000000 + $CHECKSTAT -s 8000000 $DIR2/$tfile || error "wrong file size" + [ $(calc_llite_stats lockless_truncate) -eq 0 ] || + error "lockless truncate doesn't use cached locks" + + log "checking not cached lockless truncate" + $TRUNCATE $DIR2/$tfile 5000000 + $CHECKSTAT -s 5000000 $DIR1/$tfile || error "wrong file size" + [ $(calc_llite_stats lockless_truncate) -ne 0 ] || + error "not cached trancate isn't lockless" + + log "disabled lockless truncate" + enable_lockless_truncate 0 + clear_llite_stats + $TRUNCATE $DIR2/$tfile 3000000 + $CHECKSTAT -s 3000000 $DIR1/$tfile || error "wrong file size" + [ $(calc_llite_stats lockless_truncate) -eq 0 ] || + error "lockless truncate disabling failed" + rm $DIR1/$tfile + # restore lockless_truncate default values + restore_lustre_params < $p + rm -f $p +} +run_test 32a "lockless truncate" + +test_32b() { # bug 11270 + local node + local p="$TMP/sanityN-$TESTNAME.parameters" + save_lustre_params $HOSTNAME 
"llite.*.contention_seconds" > $p + for node in $(osts_nodes); do + save_lustre_params $node "ldlm.namespaces.filter-*.max_nolock_bytes" >> $p + save_lustre_params $node "ldlm.namespaces.filter-*.contended_locks" >> $p + save_lustre_params $node "ldlm.namespaces.filter-*.contention_seconds" >> $p + done + clear_llite_stats + # agressive lockless i/o settings + for node in $(osts_nodes); do + do_node $node 'lctl set_param -n ldlm.namespaces.filter-*.max_nolock_bytes 2000000; lctl set_param -n ldlm.namespaces.filter-*.contended_locks 0; lctl set_param -n ldlm.namespaces.filter-*.contention_seconds 60' + done + lctl set_param -n llite.*.contention_seconds 60 + for i in $(seq 5); do + dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1 + dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1 + done + [ $(calc_llite_stats lockless_write_bytes) -ne 0 ] || error "lockless i/o was not triggered" + # disable lockless i/o (it is disabled by default) + for node in $(osts_nodes); do + do_node $node 'lctl set_param -n ldlm.namespaces.filter-*.max_nolock_bytes 0; lctl set_param -n ldlm.namespaces.filter-*.contended_locks 32; lctl set_param -n ldlm.namespaces.filter-*.contention_seconds 0' + done + # set contention_seconds to 0 at client too, otherwise Lustre still + # remembers lock contention + lctl set_param -n llite.*.contention_seconds 0 + clear_llite_stats + for i in $(seq 5); do + dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1 + dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1 + done + [ $(calc_llite_stats lockless_write_bytes) -eq 0 ] || + error "lockless i/o works when disabled" + rm -f $DIR1/$tfile + restore_lustre_params <$p + rm -f $p +} +run_test 32b "lockless i/o" + log "cleanup: ======================================================" check_and_cleanup_lustre diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 
fb824d3aa90e35c2c6bab225bda38af4f9399d58..de8e911d4586111631b8fade12720dec7cff8cb7 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -24,6 +24,18 @@ assert_env() { [ $failed ] && exit 1 || true } +assert_DIR () { + local failed="" # each DIRn must be its MOUNTn or a path below it + [[ $DIR/ = $MOUNT/* ]] || \ + { failed=1 && echo "DIR not in $MOUNT. Aborting."; } + [[ $DIR1/ = $MOUNT1/* ]] || \ + { failed=1 && echo "DIR1 not in $MOUNT1. Aborting."; } + [[ $DIR2/ = $MOUNT2/* ]] || \ + { failed=1 && echo "DIR2 not in $MOUNT2. Aborting"; } + + [ -n "$failed" ] && exit 99 || true +} + usage() { echo "usage: $0 [-r] [-f cfgfile]" echo " -r: reformat" @@ -66,14 +78,17 @@ print_summary () { init_test_env() { export LUSTRE=`absolute_path $LUSTRE` export TESTSUITE=`basename $0 .sh` - export LTESTDIR=${LTESTDIR:-$LUSTRE/../ltest} [ -d /r ] && export ROOT=${ROOT:-/r} export TMP=${TMP:-$ROOT/tmp} export TESTSUITELOG=${TMP}/${TESTSUITE}.log export HOSTNAME=${HOSTNAME:-`hostname`} - - export PATH=:$PATH:$LUSTRE/utils:$LUSTRE/tests + if ! echo $PATH | grep -q $LUSTRE/utils; then + export PATH=$PATH:$LUSTRE/utils + fi + if ! echo $PATH | grep -q $LUSTRE/tests; then + export PATH=$PATH:$LUSTRE/tests + fi export LCTL=${LCTL:-"$LUSTRE/utils/lctl"} export LFS=${LFS:-"$LUSTRE/utils/lfs"} [ ! 
-f "$LCTL" ] && export LCTL=$(which lctl) @@ -89,6 +104,7 @@ init_test_env() { export LPROC=/proc/fs/lustre export DIR2 export AT_MAX_PATH + export SAVE_PWD=${SAVE_PWD:-$LUSTRE/tests} if [ "$ACCEPTOR_PORT" ]; then export PORT_OPT="--port $ACCEPTOR_PORT" @@ -114,6 +130,7 @@ init_test_env() { ONLY=${ONLY:-$*} [ "$TESTSUITELOG" ] && rm -f $TESTSUITELOG || true + rm -f $TMP/*active } @@ -148,8 +165,8 @@ load_modules() { echo Loading modules from $LUSTRE load_module ../lnet/libcfs/libcfs - [ "$PTLDEBUG" ] && sysctl -w lnet.debug=$PTLDEBUG - [ "$SUBSYSTEM" ] && sysctl -w lnet.subsystem_debug=${SUBSYSTEM# } + [ "$PTLDEBUG" ] && lctl set_param debug=$PTLDEBUG + [ "$SUBSYSTEM" ] && lctl set_param subsystem_debug=${SUBSYSTEM# } [ -f /etc/modprobe.conf ] && MODPROBECONF=/etc/modprobe.conf [ -f /etc/modprobe.d/Lustre ] && MODPROBECONF=/etc/modprobe.d/Lustre [ -z "$LNETOPTS" -a -n "$MODPROBECONF" ] && \ @@ -215,18 +232,26 @@ wait_for_lnet() { done } +unload_dep_module() { + #lsmod output + #libcfs 107852 17 llite_lloop,lustre,obdfilter,ost,... 
+ local MODULE=$1 + local DEPS=$(lsmod | awk '($1 == "'$MODULE'") { print $4 }' | tr ',' ' ') + for SUBMOD in $DEPS; do + unload_dep_module $SUBMOD + done + [ "$MODULE" = "libcfs" ] && $LCTL dk $TMP/debug || true + $RMMOD $MODULE || true +} + unload_modules() { wait_exit_ST client # bug 12845 lsmod | grep libcfs > /dev/null && $LCTL dl - local MODULES=$($LCTL modules | awk '{ print $2 }' | grep -v libcfs) || true - $RMMOD $MODULES > /dev/null 2>&1 || true - # do it again, in case we tried to unload ksocklnd too early - MODULES=$($LCTL modules | awk '{ print $2 }' | grep -v libcfs) || true - [ -n "$MODULES" ] && $RMMOD $MODULES > /dev/null 2>&1 || true - lsmod | grep libcfs > /dev/null && $LCTL dk $TMP/debug - $RMMOD libcfs - MODULES=$($LCTL modules | awk '{ print $2 }') + unload_dep_module $FSTYPE + unload_dep_module libcfs + + local MODULES=$($LCTL modules | awk '{ print $2 }') if [ -n "$MODULES" ]; then echo "Modules still loaded: " echo $MODULES @@ -257,32 +282,42 @@ unload_modules() { } # Facet functions +mount_facet() { + local facet=$1 + shift + local dev=${facet}_dev + local opt=${facet}_opt + echo "Starting ${facet}: ${!opt} $@ ${!dev} ${MOUNT%/*}/${facet}" + do_facet ${facet} mount -t lustre ${!opt} $@ ${!dev} ${MOUNT%/*}/${facet} + RC=${PIPESTATUS[0]} + if [ $RC -ne 0 ]; then + echo "mount -t lustre $@ ${!dev} ${MOUNT%/*}/${facet}" + echo "Start of ${!dev} on ${facet} failed ${RC}" + else + do_facet ${facet} "lctl set_param debug=$PTLDEBUG; \ + lctl set_param subsystem_debug=${SUBSYSTEM# }; \ + lctl set_param debug_mb=${DEBUG_SIZE}; \ + sync" + + label=$(do_facet ${facet} "e2label ${!dev}") + [ -z "$label" ] && echo no label for ${!dev} && exit 1 + eval export ${facet}_svc=${label} + echo Started ${label} + fi + return $RC +} + # start facet device options start() { facet=$1 shift device=$1 shift - echo "Starting ${facet}: $@ ${device} ${MOUNT%/*}/${facet}" + eval export ${facet}_dev=${device} + eval export ${facet}_opt=\"$@\" do_facet ${facet} mkdir -p 
${MOUNT%/*}/${facet} - do_facet ${facet} mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} - RC=${PIPESTATUS[0]} - if [ $RC -ne 0 ]; then - echo "mount -t lustre $@ ${device} ${MOUNT%/*}/${facet}" - echo "Start of ${device} on ${facet} failed ${RC}" - else - do_facet ${facet} "sysctl -w lnet.debug=$PTLDEBUG; \ - sysctl -w lnet.subsystem_debug=${SUBSYSTEM# }; \ - sysctl -w lnet.debug_mb=${DEBUG_SIZE}" - - do_facet ${facet} sync - label=$(do_facet ${facet} "e2label ${device}") - [ -z "$label" ] && echo no label for ${device} && exit 1 - eval export ${facet}_svc=${label} - eval export ${facet}_dev=${device} - eval export ${facet}_opt=\"$@\" - echo Started ${label} - fi + mount_facet ${facet} + RC=$? return $RC } @@ -319,29 +354,64 @@ zconf_mount() { exit 1 fi - echo "Starting client: $OPTIONS $device $mnt" + echo "Starting client: $client: $OPTIONS $device $mnt" do_node $client mkdir -p $mnt do_node $client mount -t lustre $OPTIONS $device $mnt || return 1 + do_node $client "lctl set_param debug=$PTLDEBUG; + lctl set_param subsystem_debug=${SUBSYSTEM# }; + lctl set_param debug_mb=${DEBUG_SIZE}" - do_node $client "sysctl -w lnet.debug=$PTLDEBUG; - sysctl -w lnet.subsystem_debug=${SUBSYSTEM# }; - sysctl -w lnet.debug_mb=${DEBUG_SIZE}" [ -d /r ] && $LCTL modules > /r/tmp/ogdb-$HOSTNAME return 0 } zconf_umount() { - client=$1 - mnt=$2 + local client=$1 + local mnt=$2 [ "$3" ] && force=-f local running=$(do_node $client "grep -c $mnt' ' /proc/mounts") || true if [ $running -ne 0 ]; then - echo "Stopping client $mnt (opts:$force)" + echo "Stopping client $client $mnt (opts:$force)" lsof | grep "$mnt" || true do_node $client umount $force $mnt fi } +zconf_mount_clients() { + local OPTIONS + local clients=$1 + local mnt=$2 + + # Only supply -o to mount if we have options + if [ -n "$MOUNTOPT" ]; then + OPTIONS="-o $MOUNTOPT" + fi + local device=$MGSNID:/$FSNAME + if [ -z "$mnt" -o -z "$FSNAME" ]; then + echo Bad zconf mount command: opt=$OPTIONS dev=$device mnt=$mnt + exit 1 
+ fi + + echo "Starting client $clients: $OPTIONS $device $mnt" + do_nodes $clients mkdir -p $mnt + do_nodes $clients mount -t lustre $OPTIONS $device $mnt || return 1 + + do_nodes $clients "lctl set_param debug=$PTLDEBUG; + lctl set_param subsystem_debug=${SUBSYSTEM# }; + lctl set_param debug_mb=${DEBUG_SIZE};" + + return 0 +} + +zconf_umount_clients() { + local clients=$1 + local mnt=$2 + local force; [ "$3" ] && force=-f # local, so a stale -f from an earlier call is not reused + + echo "Stopping clients: $clients $mnt (opts:$force)" + do_nodes $clients umount $force $mnt +} + shutdown_facet() { facet=$1 if [ "$FAILURE_MODE" = HARD ]; then @@ -393,15 +463,15 @@ cleanup_check() { } wait_delete_completed () { - local TOTALPREV=`awk 'BEGIN{total=0}; {total+=$1}; END{print total}' \ - $LPROC/osc/*/kbytesavail` + local TOTALPREV=`lctl get_param -n osc.*.kbytesavail | \ + awk 'BEGIN{total=0}; {total+=$1}; END{print total}'` local WAIT=0 local MAX_WAIT=20 while [ "$WAIT" -ne "$MAX_WAIT" ]; do sleep 1 - TOTAL=`awk 'BEGIN{total=0}; {total+=$1}; END{print total}' \ - $LPROC/osc/*/kbytesavail` + TOTAL=`lctl get_param -n osc.*.kbytesavail | \ + awk 'BEGIN{total=0}; {total+=$1}; END{print total}'` [ "$TOTAL" -eq "$TOTALPREV" ] && break echo "Waiting delete completed ... 
prev: $TOTALPREV current: $TOTAL " TOTALPREV=$TOTAL @@ -423,14 +493,14 @@ wait_for() { } wait_mds_recovery_done () { - local timeout=`do_facet mds sysctl -n lustre.timeout` + local timeout=`do_facet mds lctl get_param -n timeout` #define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) # as we are in process of changing obd_timeout in different ways # let's set MAX longer than that MAX=$(( timeout * 4 )) WAIT=0 while [ $WAIT -lt $MAX ]; do - STATUS=`do_facet mds grep status /proc/fs/lustre/mds/*-MDT*/recovery_status` + STATUS=`do_facet mds "lctl get_param -n mds.*-MDT*.recovery_status | grep status"` echo $STATUS | grep COMPLETE && return 0 sleep 5 WAIT=$((WAIT + 5)) @@ -491,9 +561,7 @@ facet_failover() { TO=`facet_active_host $facet` echo "Failover $facet to $TO" wait_for $facet - local dev=${facet}_dev - local opt=${facet}_opt - start $facet ${!dev} ${!opt} || error "Restart of $facet failed" + mount_facet $facet || error "Restart of $facet failed" } obd_name() { @@ -523,13 +591,13 @@ replay_barrier_nodf() { } mds_evict_client() { - UUID=`cat /proc/fs/lustre/mdc/${mds_svc}-mdc-*/uuid` - do_facet mds "echo $UUID > /proc/fs/lustre/mds/${mds_svc}/evict_client" + UUID=`lctl get_param -n mdc.${mds_svc}-mdc-*.uuid` + do_facet mds "lctl set_param -n mds.${mds_svc}.evict_client $UUID" } ost_evict_client() { - UUID=`cat /proc/fs/lustre/osc/${ost1_svc}-osc-*/uuid` - do_facet ost1 "echo $UUID > /proc/fs/lustre/obdfilter/${ost1_svc}/evict_client" + UUID=`lctl get_param -n osc.${ost1_svc}-osc-*.uuid` + do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.evict_client $UUID" } fail() { @@ -546,11 +614,7 @@ fail_abort() { local facet=$1 stop $facet change_active $facet - local svc=${facet}_svc - local dev=${facet}_dev - local opt=${facet}_opt - start $facet ${!dev} ${!opt} - do_facet $facet lctl --device %${!svc} abort_recovery + mount_facet $facet -o abort_recovery df $MOUNT || echo "first df failed: $?" sleep 1 df $MOUNT || error "post-failover df: $?" 
@@ -568,6 +632,12 @@ h2gm () { fi } +h2name_or_ip() { + if [ "$1" = "client" -o "$1" = "'*'" ]; then echo \'*\'; else + echo $1"@$2" + fi +} + h2ptl() { if [ "$1" = "client" -o "$1" = "'*'" ]; then echo \'*\'; else ID=`xtprocadmin -n $1 2>/dev/null | egrep -v 'NID' | awk '{print $1}'` @@ -607,6 +677,11 @@ h2openib() { } declare -fx h2openib +h2o2ib() { + h2name_or_ip "$1" "o2ib" +} +declare -fx h2o2ib + facet_host() { local facet=$1 varname=${facet}_HOST @@ -622,8 +697,8 @@ facet_active() { local facet=$1 local activevar=${facet}active - if [ -f ./${facet}active ] ; then - source ./${facet}active + if [ -f $TMP/${facet}active ] ; then + source $TMP/${facet}active fi active=${!activevar} @@ -657,7 +732,7 @@ change_active() { fi # save the active host for this facet activevar=${facet}active - echo "$activevar=${!activevar}" > ./$activevar + echo "$activevar=${!activevar}" > $TMP/$activevar } do_node() { @@ -689,6 +764,46 @@ do_node() { return ${PIPESTATUS[0]} } +do_nodes() { + local nodes=$1 + shift + + nodes=${nodes//,/ } + # split list to local and remote + local rnodes=$(echo " $nodes " | sed -re "s/\s+$HOSTNAME\s+/ /g") + + if [ "$(get_node_count $nodes)" != "$(get_node_count $rnodes)" ]; then + do_node $HOSTNAME $@ + fi + + [ -z "$(echo $rnodes)" ] && return 0 + + # This is part from do_node + local myPDSH=$PDSH + + rnodes=$(comma_list $rnodes) + [ -z "$myPDSH" -o "$myPDSH" = "no_dsh" ] && \ + echo "cannot run remote command on $rnodes with $myPDSH" && return 128 + + if $VERBOSE; then + echo "CMD: $rnodes $@" >&2 + $myPDSH $rnodes $LCTL mark "$@" > /dev/null 2>&1 || : + fi + + if [ "$myPDSH" = "rsh" ]; then +# we need this because rsh does not return exit code of an executed command + local command_status="$TMP/cs" + rsh $rnodes ":> $command_status" + rsh $rnodes "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; + cd $RPWD; sh -c \"$@\") || + echo command failed >$command_status" + [ -n "$($myPDSH $rnodes cat $command_status)" ] && return 1 || true + 
return 0 + fi + $myPDSH $rnodes "(PATH=\$PATH:$RLUSTRE/utils:$RLUSTRE/tests:/sbin:/usr/sbin; cd $RPWD; sh -c \"$@\")" | sed -re "s/\w+:\s//g" + return ${PIPESTATUS[0]} +} + do_facet() { facet=$1 shift @@ -702,7 +817,7 @@ add() { shift # make sure its not already running stop ${facet} -f - rm -f ${facet}active + rm -f $TMP/${facet}active do_facet ${facet} $MKFS $* } @@ -728,10 +843,21 @@ stopall() { # assume client mount is local grep " $MOUNT " /proc/mounts && zconf_umount $HOSTNAME $MOUNT $* grep " $MOUNT2 " /proc/mounts && zconf_umount $HOSTNAME $MOUNT2 $* + + if [ -n "$CLIENTS" ]; then + zconf_umount_clients $CLIENTS $MOUNT "$*" || true + [ -n "$MOUNT2" ] && zconf_umount_clients $CLIENTS $MOUNT2 "$*" || true + fi + [ "$CLIENTONLY" ] && return + # The add fn does rm ${facet}active file, this would be enough + # if we use do_facet <facet> only after the facet added, but + # currently we use do_facet mds in local.sh stop mds -f + rm -f ${TMP}/mdsactive for num in `seq $OSTCOUNT`; do stop ost$num -f + rm -f $TMP/ost${num}active done return 0 } @@ -781,7 +907,7 @@ set_obd_timeout() { do_facet $facet lsmod | grep -q obdclass || \ do_facet $facet "modprobe obdclass" - do_facet $facet "sysctl -w lustre.timeout=$timeout" + do_facet $facet "lctl set_param timeout=$timeout" } setupall() { @@ -792,16 +918,31 @@ setupall() { || do_facet mds "$TUNEFS --writeconf $MDSDEV" set_obd_timeout mds $TIMEOUT start mds $MDSDEV $MDS_MOUNT_OPTS + # We started mds, now we should set failover variable properly. + # Set mdsfailover_HOST if it is not set (the default failnode). + mdsfailover_HOST=$(facet_host mds) + for num in `seq $OSTCOUNT`; do DEVNAME=`ostdevname $num` set_obd_timeout ost$num $TIMEOUT start ost$num $DEVNAME $OST_MOUNT_OPTS + + # We started ost$num, now we should set ost${num}failover variable properly. + # Set ost${num}failover_HOST if it is not set (the default failnode). 
+ varname=ost${num}failover_HOST + if [ -z "${!varname}" ]; then + eval ost${num}failover_HOST=$(facet_host ost${num}) + fi + done fi [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE mount_client $MOUNT + [ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT + if [ "$MOUNT_2" ]; then mount_client $MOUNT2 + [ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT2 fi sleep 5 } @@ -826,7 +967,7 @@ check_and_setup_lustre() { cleanup_and_setup_lustre() { if [ "$ONLY" == "cleanup" -o "`mount | grep $MOUNT`" ]; then - sysctl -w lnet.debug=0 || true + lctl set_param debug=0 || true cleanupall if [ "$ONLY" == "cleanup" ]; then exit 0 @@ -940,64 +1081,64 @@ at_max_set() { drop_request() { # OBD_FAIL_MDS_ALL_REQUEST_NET RC=0 - do_facet mds sysctl -w lustre.fail_loc=0x123 + do_facet mds lctl set_param fail_loc=0x123 do_facet client "$1" || RC=$? - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 return $RC } drop_reply() { # OBD_FAIL_MDS_ALL_REPLY_NET RC=0 - do_facet mds sysctl -w lustre.fail_loc=0x122 + do_facet mds lctl set_param fail_loc=0x122 do_facet client "$@" || RC=$? - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 return $RC } drop_reint_reply() { # OBD_FAIL_MDS_REINT_NET_REP RC=0 - do_facet mds sysctl -w lustre.fail_loc=0x119 + do_facet mds lctl set_param fail_loc=0x119 do_facet client "$@" || RC=$? - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 return $RC } pause_bulk() { #define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 RC=0 - do_facet ost1 sysctl -w lustre.fail_loc=0x214 + do_facet ost1 lctl set_param fail_loc=0x214 do_facet client "$1" || RC=$? 
do_facet client "sync" - do_facet ost1 sysctl -w lustre.fail_loc=0 + do_facet ost1 lctl set_param fail_loc=0 return $RC } drop_ldlm_cancel() { #define OBD_FAIL_LDLM_CANCEL 0x304 RC=0 - do_facet client sysctl -w lustre.fail_loc=0x304 + do_facet client lctl set_param fail_loc=0x304 do_facet client "$@" || RC=$? - do_facet client sysctl -w lustre.fail_loc=0 + do_facet client lctl set_param fail_loc=0 return $RC } drop_bl_callback() { #define OBD_FAIL_LDLM_BL_CALLBACK 0x305 RC=0 - do_facet client sysctl -w lustre.fail_loc=0x305 + do_facet client lctl set_param fail_loc=0x305 do_facet client "$@" || RC=$? - do_facet client sysctl -w lustre.fail_loc=0 + do_facet client lctl set_param fail_loc=0 return $RC } drop_ldlm_reply() { #define OBD_FAIL_LDLM_REPLY 0x30c RC=0 - do_facet mds sysctl -w lustre.fail_loc=0x30c + do_facet mds lctl set_param fail_loc=0x30c do_facet client "$@" || RC=$? - do_facet mds sysctl -w lustre.fail_loc=0 + do_facet mds lctl set_param fail_loc=0 return $RC } @@ -1006,15 +1147,31 @@ clear_failloc() { pause=$2 sleep $pause echo "clearing fail_loc on $facet" - do_facet $facet "sysctl -e -w lustre.fail_loc=0" + do_facet $facet "lctl set_param fail_loc=0 2>/dev/null || true" +} + +set_nodes_failloc () { + local nodes=$1 + local node + + for node in $nodes ; do + do_node $node lctl set_param fail_loc=$2 + done +} + +set_nodes_failloc () { + local nodes=$1 + local node + + for node in $nodes ; do + do_node $node lctl set_param fail_loc=$2 # duplicate definition: bash keeps this later one, so it must match the one above + done } cancel_lru_locks() { $LCTL mark "cancel_lru_locks $1 start" - for d in `find $LPROC/ldlm/namespaces | egrep -i $1`; do - [ -f $d/lru_size ] && echo clear > $d/lru_size - [ -f $d/lock_unused_count ] && grep [1-9] $d/lock_unused_count /dev/null - done + lctl set_param ldlm.namespaces.*$1*.lru_size=clear + lctl get_param ldlm.namespaces.*$1*.lock_unused_count | grep -v '=0' $LCTL mark "cancel_lru_locks $1 stop" } @@ -1027,44 +1184,32 @@ default_lru_size() lru_resize_enable() { - NS=$1 - test "x$NS" = "x" && NS="mdc" - 
for F in $LPROC/ldlm/namespaces/*$NS*/lru_size; do - D=$(dirname $F) - log "Enable lru resize for $(basename $D)" - echo "0" > $F - done + lctl set_param ldlm.namespaces.*$1*.lru_size=0 } lru_resize_disable() { - NS=$1 - test "x$NS" = "x" && NS="mdc" - for F in $LPROC/ldlm/namespaces/*$NS*/lru_size; do - D=$(dirname $F) - log "Disable lru resize for $(basename $D)" - DEFAULT_LRU_SIZE=$(default_lru_size) - echo "$DEFAULT_LRU_SIZE" > $F - done + lctl set_param ldlm.namespaces.*$1*.lru_size $(default_lru_size) } pgcache_empty() { - for a in /proc/fs/lustre/llite/*/dump_page_cache; do - if [ `wc -l $a | awk '{print $1}'` -gt 1 ]; then - echo there is still data in page cache $a ? - cat $a; - return 1; + local FILE + for FILE in `lctl get_param -N "llite.*.dump_page_cache"`; do + if [ `lctl get_param -n $FILE | wc -l` -gt 1 ]; then + echo there is still data in page cache $FILE ? + lctl get_param -n $FILE + return 1 fi done return 0 } debugsave() { - DEBUGSAVE="$(sysctl -n lnet.debug)" + DEBUGSAVE="$(lctl get_param -n debug)" } debugrestore() { - [ -n "$DEBUGSAVE" ] && sysctl -w lnet.debug="${DEBUGSAVE}" + [ -n "$DEBUGSAVE" ] && lctl set_param debug="${DEBUGSAVE}" DEBUGSAVE="" } @@ -1075,7 +1220,7 @@ debugrestore() { error_noexit() { local TYPE=${TYPE:-"FAIL"} local ERRLOG - sysctl -e -w lustre.fail_loc=0 || true + lctl set_param fail_loc=0 2>/dev/null || true log " ${TESTSUITE} ${TESTNAME}: @@@@@@ ${TYPE}: $@ " ERRLOG=$TMP/lustre_${TESTSUITE}_${TESTNAME}.$(date +%s) echo "Dumping lctl log to $ERRLOG" @@ -1102,7 +1247,7 @@ error_exit() { # (like ALWAYS_EXCEPT, but run the test and ignore the results.) # e.g. 
error_ignore 5494 "your message" error_ignore() { - TYPE="IGNORE (bz$1)" + local TYPE="IGNORE (bz$1)" shift error_noexit "$@" } @@ -1118,7 +1263,7 @@ build_test_filter() { eval ONLY_${O}=true done [ "$EXCEPT$ALWAYS_EXCEPT" ] && \ - log "skipping tests: `echo $EXCEPT $ALWAYS_EXCEPT`" + log "excepting tests: `echo $EXCEPT $ALWAYS_EXCEPT`" [ "$EXCEPT_SLOW" ] && \ log "skipping tests SLOW=no: `echo $EXCEPT_SLOW`" for E in $EXCEPT $ALWAYS_EXCEPT; do @@ -1141,6 +1286,8 @@ basetest() { } run_test() { + assert_DIR + export base=`basetest $1` if [ ! -z "$ONLY" ]; then testname=ONLY_$1 @@ -1223,8 +1370,8 @@ pass() { } check_mds() { - FFREE=`cat /proc/fs/lustre/mds/*/filesfree` - FTOTAL=`cat /proc/fs/lustre/mds/*/filestotal` + FFREE=`lctl get_param -n mds.*.filesfree` + FTOTAL=`lctl get_param -n mds.*.filestotal` [ $FFREE -ge $FTOTAL ] && error "files free $FFREE > total $FTOTAL" || true } @@ -1233,7 +1380,7 @@ reset_fail_loc () { local NODE for NODE in $myNODES; do - do_node $NODE sysctl -e -w lustre.fail_loc=0 || true + do_node $NODE "lctl set_param fail_loc=0 2>/dev/null || true" done } @@ -1244,7 +1391,6 @@ run_one() { export tdir=d0.${TESTSUITE}/d${base} local SAVE_UMASK=`umask` umask 0022 - mkdir -p $DIR/$tdir BEFORE=`date +%s` log "== test $testnum: $message ============ `date +%H:%M:%S` ($BEFORE)" @@ -1252,17 +1398,16 @@ run_one() { export TESTNAME=test_$testnum test_${testnum} || error "test_$testnum failed with $?" #check_mds + cd $SAVE_PWD reset_fail_loc check_grant ${testnum} || error "check_grant $testnum failed with $?" 
[ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \ error "LBUG/LASSERT detected" ps auxww | grep -v grep | grep -q multiop && error "multiop still running" pass "($((`date +%s` - $BEFORE))s)" - rmdir ${DIR}/$tdir >/dev/null 2>&1 || true unset TESTNAME unset tdir umask $SAVE_UMASK - cd $SAVE_PWD $CLEANUP } @@ -1281,9 +1426,9 @@ check_grant() { [ "$CHECK_GRANT" == "no" ] && return 0 testname=GCHECK_ONLY_${base} - [ ${!testname}x == x ] && return 0 + [ ${!testname}x == x ] && return 0 - echo -n "checking grant......" + echo -n "checking grant......" cd $SAVE_PWD # write some data to sync client lost_grant rm -f $DIR1/${tfile}_check_grant_* 2>&1 @@ -1292,18 +1437,18 @@ check_grant() { dd if=/dev/zero of=$DIR1/${tfile}_check_grant_$i bs=4k \ count=1 > /dev/null 2>&1 done - # sync all the data and make sure no pending data on server - sync_clients - - #get client grant and server grant - client_grant=0 - for d in ${LPROC}/osc/*/cur_grant_bytes; do - client_grant=$((client_grant + `cat $d`)) - done - server_grant=0 - for d in ${LPROC}/obdfilter/*/tot_granted; do - server_grant=$((server_grant + `cat $d`)) - done + # sync all the data and make sure no pending data on server + sync_clients + + #get client grant and server grant + client_grant=0 + for d in `lctl get_param -n osc.*.cur_grant_bytes`; do + client_grant=$((client_grant + $d)) + done + server_grant=0 + for d in `lctl get_param -n obdfilter.*.tot_granted`; do + server_grant=$((server_grant + $d)) + done # cleanup the check_grant file for i in `seq $OSTCOUNT`; do @@ -1370,6 +1515,9 @@ nodes_list () { local myNODES=$HOSTNAME local myNODES_sort + # CLIENTS (if specified) contains the local client + [ -n "$CLIENTS" ] && myNODES=${CLIENTS//,/ } + if [ "$PDSH" -a "$PDSH" != "no_dsh" ]; then myNODES="$myNODES $(osts_nodes) $mds_HOST" fi @@ -1384,6 +1532,17 @@ is_patchless () lctl get_param version | grep -q patchless } +get_node_count() { + local nodes="$@" + echo $nodes | wc -w || true +} + +mixed_ost_devs () { + 
local nodes=$(osts_nodes) + local osscount=$(get_node_count "$nodes") + [ ! "$OSTCOUNT" = "$osscount" ] +} + check_runas_id_ret() { local myRC=0 local myRUNAS_ID=$1 @@ -1438,3 +1597,33 @@ multiop_bg_pause() { return 0 } + +# reset llite stat counters +clear_llite_stats(){ + lctl set_param -n llite.*.stats 0 +} + +# sum llite stat items +calc_llite_stats() { + local res=$(lctl get_param -n llite.*.stats | + awk 'BEGIN {s = 0} END {print s} /^'"$1"'/ {s += $2}') + echo $res +} + +# save_lustre_params(node, parameter_mask) +# generate a stream of formatted strings (<node> <param name>=<param value>) +save_lustre_params() { + local s + do_node $1 "lctl get_param $2" | while read s; do echo "$1 $s"; done +} + +# restore lustre parameters from input stream, produces by save_lustre_params +restore_lustre_params() { + local node + local name + local val + while IFS=" =" read node name val; do + do_node $node "lctl set_param -n $name $val" + done +} + diff --git a/lustre/utils/.cvsignore b/lustre/utils/.cvsignore index b9424085c5e407106bc838a307676a4b953d1817..2e49b3f39bde799b0fddfceedec32f5050c29132 100644 --- a/lustre/utils/.cvsignore +++ b/lustre/utils/.cvsignore @@ -19,6 +19,7 @@ lr_reader ltrack_stats obdio obdbarrier +ll_recover_lost_found_objs lload llverfs llverdev diff --git a/lustre/utils/Makefile.am b/lustre/utils/Makefile.am index 192b358fff5c97b333650818d21ebe17642fdee2..ca52aec476000c4a5d84a01e5bab9e1637e64e01 100644 --- a/lustre/utils/Makefile.am +++ b/lustre/utils/Makefile.am @@ -10,11 +10,13 @@ sbin_scripts = lrun bin_scripts = llstat llobdstat plot-llstat if UTILS -noinst_PROGRAMS = wirecheck obdio obdbarrier +noinst_PROGRAMS = obdio obdbarrier +EXTRA_PROGRAMS = wirecheck # mount only finds helpers in /sbin rootsbin_PROGRAMS = mount.lustre sbin_PROGRAMS = mkfs.lustre tunefs.lustre lctl wiretest \ - l_getgroups llverfs llverdev llog_reader lr_reader ltrack_stats + l_getgroups llverfs llverdev llog_reader ll_recover_lost_found_objs \ + lr_reader ltrack_stats 
if LIBPTHREAD sbin_PROGRAMS += loadgen endif @@ -69,13 +71,17 @@ llog_reader_SOURCES = llog_reader.c llog_reader_LDADD := $(LIBPTLCTL) llog_reader_DEPENDENCIES := $(LIBPTLCTL) +ll_recover_lost_found_objs_SOURCES = ll_recover_lost_found_objs.c +ll_recover_lost_found_objs_LDADD := $(LIBPTLCTL) +ll_recover_lost_found_objs_DEPENDENCIES := $(LIBPTLCTL) + lr_reader_SOURCES = lr_reader.c -mount_lustre_SOURCES = mount_lustre.c +mount_lustre_SOURCES = mount_lustre.c mount_utils.c mount_utils.h mount_lustre_LDADD := $(LIBPTLCTL) mount_lustre_DEPENDENCIES := $(LIBPTLCTL) -mkfs_lustre_SOURCES = mkfs_lustre.c +mkfs_lustre_SOURCES = mkfs_lustre.c mount_utils.c mount_utils.h mkfs_lustre_CPPFLAGS = -UTUNEFS $(AM_CPPFLAGS) mkfs_lustre_LDADD := $(LIBPTLCTL) mkfs_lustre_DEPENDENCIES := $(LIBPTLCTL) diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index 85e41055c23bce88336f5e87dc6907898dcc1f6e..cb9a8d8ac06557092d2c8790c881f6b06c0f1b2e 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -103,7 +103,7 @@ command_t cmdlist[] = { {"device_list", jt_obd_list, 0, "show all devices\n" "usage: device_list"}, {"dl", jt_obd_list, 0, "show all devices\n" - "usage: dl"}, + "usage: dl [-t]"}, /* Device operations */ {"==== obd device operations ===", jt_noop, 0, "device operations"}, @@ -120,9 +120,10 @@ command_t cmdlist[] = { {"local_param", jt_lcfg_param, 0, "set a temporary, local param\n" "usage: local_param <target.keyword=val> ...\n"}, {"get_param", jt_lcfg_getparam, 0, "get the Lustre or LNET parameter\n" - "usage: get_param [-n] path/to/param/file \n" + "usage: get_param [-n | -N] path/to/param/file \n" "Get the value of Lustre or LNET parameter from the specified path\n" - "Use '-n' to disable printing of the key name when printing values."}, + "Use '-n' to disable printing of the key name when printing values.\n" + "Use '-N' to print only path names and not the values."}, {"set_param", jt_lcfg_setparam, 0, "set the Lustre or LNET parameter\n" "usage: set_param [-n] 
path/to/param/file value\n" "Set the value of the Lustre or LNET parameter at the specified path\n" @@ -208,7 +209,7 @@ command_t cmdlist[] = { {"add_peer", jt_ptl_add_peer, 0, "add an peer entry\n" "usage: add_peer <nid> <host> <port>"}, {"del_peer", jt_ptl_del_peer, 0, "remove an peer entry\n" - "usage: del_autoconn [<nid>] [<host>] [ks]"}, + "usage: del_peer [<nid>] [<ipaddr|pid>]"}, {"add_conn ", jt_lcfg_add_conn, 0, "usage: add_conn <conn_uuid> [priority]\n"}, {"del_conn ", jt_lcfg_del_conn, 0, diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index dba6a473c2acfe855ba04ba336f8ad7f08878ef3..dd87f86c0453cebd1c5d67bc46c26fb3bb7e4b6b 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -97,7 +97,9 @@ command_t cmdlist[] = { "usage: find <dir/file> ... \n" " [[!] --atime|-A [+-]N] [[!] --mtime|-M [+-]N] [[!] --ctime|-C [+-]N]\n" " [--maxdepth|-D N] [[!] --name|-n <pattern>] [--print0|-P]\n" - " [--print|-p] [--obd|-O <uuid>] [[!] --type|-t <filetype>]\n" + " [--print|-p] [--obd|-O <uuid[s]>] [[!] --size|-s [+-]N[bkMGTP]]\n" + " [[!] --type|-t <filetype>] [[!] --gid|-g N] [[!] --group|-G <name>]\n" + " [[!] --uid|-u N] [[!] 
--user|-U <name>]\n" "\t !: used before an option indicates 'NOT' the requested attribute\n" "\t -: used before an value indicates 'AT MOST' the requested value\n" "\t +: used before an option indicates 'AT LEAST' the requested value\n"}, @@ -133,7 +135,19 @@ command_t cmdlist[] = { "usage: quotaoff [ -ug ] <filesystem>"}, {"setquota", lfs_setquota, 0, "Set filesystem quotas.\n" "usage: setquota [ -u | -g ] <name> <block-softlimit> <block-hardlimit> <inode-softlimit> <inode-hardlimit> <filesystem>\n" - " setquota -t [ -u | -g ] <block-grace> <inode-grace> <filesystem>"}, + " setquota -t [ -u | -g ] <block-grace> <inode-grace> <filesystem>\n" + " setquota [ -u | --user | -g | --group ] <name>\n" + " [--block-softlimit <block-softlimit>]\n" + " [--block-hardlimit <block-hardlimit>]\n" + " [--inode-softlimit <inode-softlimit>]\n" + " [--inode-hardlimit <inode-hardlimit>] <filesystem>\n" + " setquota [-t] [ -u | --user | -g | --group ]\n" + " [--block-grace <block-grace>]\n" + " [--inode-grace <inode-grace>] <filesystem>\n" + " -b can be used instead of --block-softlimit/--block-grace\n" + " -B can be used instead of --block-hardlimit\n" + " -i can be used instead of --inode-softlimit/--inode-grace\n" + " -I can be used instead of --inode-hardlimit"}, {"quota", lfs_quota, 0, "Display disk usage and limits.\n" "usage: quota [ -o obd_uuid ] [{-u|-g <name>}|-t] <filesystem>"}, {"quotainv", lfs_quotainv, 0, "Invalidate quota data.\n" @@ -188,9 +202,9 @@ static int lfs_setstripe(int argc, char **argv) stripe_count_arg = argv[4]; optind = 4; } else { + optind = 0; while ((c = getopt_long(argc, argv, "c:di:s:", - long_opts, NULL)) >= 0) - { + long_opts, NULL)) >= 0) { switch (c) { case 0: /* Long options. 
*/ @@ -242,7 +256,7 @@ static int lfs_setstripe(int argc, char **argv) /* get the stripe size */ if (stripe_size_arg != NULL) { - result = parse_size(stripe_size_arg, &st_size, &size_units); + result = parse_size(stripe_size_arg, &st_size, &size_units, 0); if (result) { fprintf(stderr,"error: bad size '%s'\n", stripe_size_arg); @@ -301,6 +315,60 @@ static int set_time(time_t *time, time_t *set, char *str) return res; } +static int name2id(unsigned int *id, char *name, int type) +{ + if (type == USRQUOTA) { + struct passwd *entry; + + if (!(entry = getpwnam(name))) { + if (!errno) + errno = ENOENT; + return -1; + } + + *id = entry->pw_uid; + } else { + struct group *entry; + + if (!(entry = getgrnam(name))) { + if (!errno) + errno = ENOENT; + return -1; + } + + *id = entry->gr_gid; + } + + return 0; +} + +static int id2name(char **name, unsigned int id, int type) +{ + if (type == USRQUOTA) { + struct passwd *entry; + + if (!(entry = getpwuid(id))) { + if (!errno) + errno = ENOENT; + return -1; + } + + *name = entry->pw_name; + } else { + struct group *entry; + + if (!(entry = getgrgid(id))) { + if (!errno) + errno = ENOENT; + return -1; + } + + *name = entry->gr_name; + } + + return 0; +} + static int lfs_find(int argc, char **argv) { int new_fashion = 1; @@ -314,6 +382,10 @@ static int lfs_find(int argc, char **argv) {"ctime", required_argument, 0, 'C'}, {"mtime", required_argument, 0, 'M'}, {"maxdepth", required_argument, 0, 'D'}, + {"gid", required_argument, 0, 'g'}, + {"group", required_argument, 0, 'G'}, + {"uid", required_argument, 0, 'u'}, + {"user", required_argument, 0, 'U'}, {"name", required_argument, 0, 'n'}, /* --obd is considered as a new option. 
*/ {"obd", required_argument, 0, 'O'}, @@ -334,10 +406,12 @@ static int lfs_find(int argc, char **argv) time_t *xtime; int *xsign; int isoption; + char *endptr; time(&t); - while ((c = getopt_long_only(argc, argv, "-A:C:D:M:n:PpO:qrs:t:v", + optind = 0; + while ((c = getopt_long_only(argc, argv, "-A:C:D:g:G:M:n:PpO:qrs:t:u:U:v", long_opts, NULL)) >= 0) { xtime = NULL; xsign = NULL; @@ -404,6 +478,54 @@ static int lfs_find(int argc, char **argv) new_fashion = 1; param.maxdepth = strtol(optarg, 0, 0); break; + case 'g': + new_fashion = 1; + param.gid = strtol(optarg, &endptr, 10); + if (optarg == endptr) { + fprintf(stderr, "Bad gid: %s\n", optarg); + return CMD_HELP; + } + param.exclude_gid = !!neg_opt; + param.check_gid = 1; + break; + case 'G': + new_fashion = 1; + param.gid = strtol(optarg, &endptr, 10); + if (optarg == endptr) { + ret = name2id(¶m.gid, optarg, GRPQUOTA); + if (ret != 0) { + fprintf(stderr, "Group/GID: %s cannot " + "be found.\n", optarg); + return -1; + } + } + param.exclude_gid = !!neg_opt; + param.check_gid = 1; + break; + case 'u': + new_fashion = 1; + param.uid = strtol(optarg, &endptr, 10); + if (optarg == endptr) { + fprintf(stderr, "Bad uid: %s\n", optarg); + return CMD_HELP; + } + param.exclude_uid = !!neg_opt; + param.check_uid = 1; + break; + case 'U': + new_fashion = 1; + param.uid = strtol(optarg, &endptr, 10); + if (optarg == endptr) { + ret = name2id(¶m.uid, optarg, USRQUOTA); + if (ret != 0) { + fprintf(stderr, "User/UID: %s cannot " + "be found.\n", optarg); + return -1; + } + } + param.exclude_uid = !!neg_opt; + param.check_uid = 1; + break; case 'n': new_fashion = 1; param.pattern = (char *)optarg; @@ -495,7 +617,8 @@ static int lfs_find(int argc, char **argv) if (param.size_sign) optarg++; - ret = parse_size(optarg, ¶m.size,¶m.size_units); + ret = parse_size(optarg, ¶m.size, + ¶m.size_units, 0); if (ret) { fprintf(stderr,"error: bad size '%s'\n", optarg); @@ -621,7 +744,6 @@ static int lfs_osts(int argc, char **argv) { FILE 
*fp; struct mntent *mnt = NULL; - struct obd_uuid *obduuid = NULL; struct find_param param; int rc=0; @@ -635,9 +757,8 @@ static int lfs_osts(int argc, char **argv) strerror (errno)); } else { mnt = getmntent(fp); - memset(¶m, 0, sizeof(param)); - param.obduuid = obduuid; while (feof(fp) == 0 && ferror(fp) ==0) { + memset(¶m, 0, sizeof(param)); if (llapi_is_lustre_mnt(mnt)) { rc = llapi_getstripe(mnt->mnt_dir, ¶m); if (rc) @@ -868,7 +989,7 @@ static int lfs_df(int argc, char **argv) FILE *fp; char *path = NULL; struct mntent *mnt = NULL; - char mntdir[PATH_MAX] = {'\0'}; + char *mntdir = NULL; int ishow = 0, cooked = 0; int c, rc = 0; @@ -895,10 +1016,19 @@ static int lfs_df(int argc, char **argv) argv[0], MOUNTED, strerror(errno)); return rc; } + + if ((mntdir = malloc(PATH_MAX)) == NULL) { + fprintf(stderr, "error: cannot allocate %d bytes\n", + PATH_MAX); + return -ENOMEM; + } + memset(mntdir, 0, PATH_MAX); + if (path) { - rc = path2mnt(path, fp, mntdir, sizeof(mntdir)); + rc = path2mnt(path, fp, mntdir, PATH_MAX); if (rc) { endmntent(fp); + free(mntdir); return rc; } @@ -919,6 +1049,7 @@ static int lfs_df(int argc, char **argv) endmntent(fp); } + free(mntdir); return rc; } @@ -1060,6 +1191,7 @@ static int lfs_quotachown(int argc, char **argv) int c,rc; int flag = 0; + optind = 0; while ((c = getopt(argc, argv, "i")) != -1) { switch (c) { case 'i': @@ -1119,7 +1251,6 @@ static int lfs_quotacheck(int argc, char **argv) memset(&qctl, 0, sizeof(qctl)); qctl.qc_cmd = LUSTRE_Q_QUOTAOFF; - qctl.qc_id = QFMT_LDISKFS; qctl.qc_type = check_type; rc = llapi_quotactl(mnt, &qctl); if (rc) { @@ -1144,7 +1275,6 @@ static int lfs_quotacheck(int argc, char **argv) memset(&qctl, 0, sizeof(qctl)); qctl.qc_cmd = LUSTRE_Q_QUOTAON; - qctl.qc_id = QFMT_LDISKFS; qctl.qc_type = check_type; rc = llapi_quotactl(mnt, &qctl); if (rc) { @@ -1169,7 +1299,6 @@ static int lfs_quotaon(int argc, char **argv) memset(&qctl, 0, sizeof(qctl)); qctl.qc_cmd = LUSTRE_Q_QUOTAON; - qctl.qc_id = 
QFMT_LDISKFS; optind = 0; while ((c = getopt(argc, argv, "ugf")) != -1) { @@ -1277,7 +1406,7 @@ static int lfs_quotainv(int argc, char **argv) qctl.qc_cmd = LUSTRE_Q_INVALIDATE; optind = 0; - while ((c = getopt(argc, argv, "ug")) != -1) { + while ((c = getopt(argc, argv, "ugf")) != -1) { switch (c) { case 'u': qctl.qc_type |= 0x01; @@ -1285,6 +1414,9 @@ static int lfs_quotainv(int argc, char **argv) case 'g': qctl.qc_type |= 0x02; break; + case 'f': + qctl.qc_cmd = LUSTRE_Q_FINVALIDATE; + break; default: fprintf(stderr, "error: %s: option '-%c' " "unrecognized\n", argv[0], c); @@ -1311,60 +1443,6 @@ static int lfs_quotainv(int argc, char **argv) return 0; } -static int name2id(unsigned int *id, char *name, int type) -{ - if (type == USRQUOTA) { - struct passwd *entry; - - if (!(entry = getpwnam(name))) { - if (!errno) - errno = ENOENT; - return -1; - } - - *id = entry->pw_uid; - } else { - struct group *entry; - - if (!(entry = getgrnam(name))) { - if (!errno) - errno = ENOENT; - return -1; - } - - *id = entry->gr_gid; - } - - return 0; -} - -static int id2name(char **name, unsigned int id, int type) -{ - if (type == USRQUOTA) { - struct passwd *entry; - - if (!(entry = getpwuid(id))) { - if (!errno) - errno = ENOENT; - return -1; - } - - *name = entry->pw_name; - } else { - struct group *entry; - - if (!(entry = getgrgid(id))) { - if (!errno) - errno = ENOENT; - return -1; - } - - *name = entry->gr_name; - } - - return 0; -} - #define ARG2INT(nr, str, msg) \ do { \ char *endp; \ @@ -1434,93 +1512,278 @@ error: return ULONG_MAX; } -#define ARG2ULL(nr, str, msg) \ +#define ARG2ULL(nr, str, defscale) \ do { \ - char *endp; \ - nr = strtoull(str, &endp, 0); \ - if (*endp) { \ - fprintf(stderr, "error: bad %s: %s\n", msg, str); \ + unsigned long long limit, units = 0; \ + int rc; \ + \ + rc = parse_size(str, &limit, &units, 1); \ + if (rc < 0) { \ + fprintf(stderr, "error: bad limit value %s\n", str); \ return CMD_HELP; \ } \ + nr = ((units == 0) ? 
(defscale) : 1) * limit; \ } while (0) +static inline int has_times_option(int argc, char **argv) +{ + int i; -int lfs_setquota(int argc, char **argv) + for (i = 1; i < argc; i++) + if (!strcmp(argv[i], "-t")) + return 1; + + return 0; +} + +int lfs_setquota_times(int argc, char **argv) { - int c; - char *mnt; + int c, rc; struct if_quotactl qctl; - char *obd_type = (char *)qctl.obd_type; - int rc; + char *mnt, *obd_type = (char *)qctl.obd_type; + struct obd_dqblk *dqb = &qctl.qc_dqblk; + struct obd_dqinfo *dqi = &qctl.qc_dqinfo; + struct option long_opts[] = { + {"user", no_argument, 0, 'u'}, + {"group", no_argument, 0, 'g'}, + {"block-grace", required_argument, 0, 'b'}, + {"inode-grace", required_argument, 0, 'i'}, + {"times", no_argument, 0, 't'}, + {0, 0, 0, 0} + }; memset(&qctl, 0, sizeof(qctl)); - qctl.qc_cmd = LUSTRE_Q_SETQUOTA; + qctl.qc_cmd = LUSTRE_Q_SETINFO; + qctl.qc_type = UGQUOTA; + +#if 1 + /* compatibility syntax: setquota -t -[u|g] t1 t2 mnt */ + if (argc == 6 && !strcmp(argv[1], "-t") && + (!strcmp(argv[2], "-u") || !strcmp(argv[2], "-g")) && + argv[3][0] != '-' && argv[4][0] != '-') { + fprintf(stderr, "warning: using compatibility syntax, it may not" + " be available in future releases!\n"); + + qctl.qc_type = !strcmp(argv[2], "-u") ? 
USRQUOTA : GRPQUOTA; + + if ((dqi->dqi_bgrace = str2sec(argv[3])) == ULONG_MAX) { + fprintf(stderr, "error: bad block-grace: %s\n", optarg); + return CMD_HELP; + } + if ((dqi->dqi_igrace = str2sec(argv[4])) == ULONG_MAX) { + fprintf(stderr, "error: bad inode-grace: %s\n", optarg); + return CMD_HELP; + } + dqb->dqb_valid = QIF_TIMES; + mnt = argv[argc - 1]; + goto quotactl; + } +#endif optind = 0; - while ((c = getopt(argc, argv, "ugt")) != -1) { + while ((c = getopt_long(argc, argv, "ugb:i:t", long_opts, NULL)) != -1) { switch (c) { case 'u': - qctl.qc_type |= 0x01; - break; case 'g': - qctl.qc_type |= 0x02; + if (qctl.qc_type != UGQUOTA) { + fprintf(stderr, "error: -u and -g can't be used " + "more than once\n"); + return CMD_HELP; + } + qctl.qc_type = (c == 'u') ? USRQUOTA : GRPQUOTA; break; - case 't': - qctl.qc_cmd = LUSTRE_Q_SETINFO; + case 'b': + if ((dqi->dqi_bgrace = str2sec(optarg)) == ULONG_MAX) { + fprintf(stderr, "error: bad block-grace: %s\n", + optarg); + return CMD_HELP; + } + dqb->dqb_valid |= QIF_BTIME; break; - default: - fprintf(stderr, "error: %s: option '-%c' " - "unrecognized\n", argv[0], c); + case 'i': + if ((dqi->dqi_igrace = str2sec(optarg)) == ULONG_MAX) { + fprintf(stderr, "error: bad inode-grace: %s\n", + optarg); + return CMD_HELP; + } + dqb->dqb_valid |= QIF_ITIME; + break; + case 't': /* Yes, of course! 
*/ + break; + default: /* getopt prints error message for us when opterr != 0 */ return CMD_HELP; } } - if (qctl.qc_type) - qctl.qc_type--; - if (qctl.qc_type == UGQUOTA) { - fprintf(stderr, "error: user and group quotas can't be set " - "both\n"); + fprintf(stderr, "error: neither -u nor -g specified\n"); + return CMD_HELP; + } + + if (optind != argc - 1) { + fprintf(stderr, "error: unexpected parameters encountered\n"); return CMD_HELP; } - if (qctl.qc_cmd == LUSTRE_Q_SETQUOTA) { - struct obd_dqblk *dqb = &qctl.qc_dqblk; + mnt = argv[optind]; - if (optind + 6 != argc) - return CMD_HELP; +#if 1 +quotactl: +#endif + rc = llapi_quotactl(mnt, &qctl); + if (rc) { + if (*obd_type) + fprintf(stderr, "%s %s ", obd_type, + obd_uuid2str(&qctl.obd_uuid)); + fprintf(stderr, "setquota failed: %s\n", strerror(errno)); + return rc; + } + + return 0; +} - rc = name2id(&qctl.qc_id, argv[optind++], qctl.qc_type); +#define BSLIMIT (1 << 0) +#define BHLIMIT (1 << 1) +#define ISLIMIT (1 << 2) +#define IHLIMIT (1 << 3) + +int lfs_setquota(int argc, char **argv) +{ + int c, rc; + struct if_quotactl qctl; + char *mnt, *obd_type = (char *)qctl.obd_type; + struct obd_dqblk *dqb = &qctl.qc_dqblk; + struct option long_opts[] = { + {"user", required_argument, 0, 'u'}, + {"group", required_argument, 0, 'g'}, + {"block-softlimit", required_argument, 0, 'b'}, + {"block-hardlimit", required_argument, 0, 'B'}, + {"inode-softlimit", required_argument, 0, 'i'}, + {"inode-hardlimit", required_argument, 0, 'I'}, + {0, 0, 0, 0} + }; + unsigned limit_mask = 0; + + if (has_times_option(argc, argv)) + return lfs_setquota_times(argc, argv); + + memset(&qctl, 0, sizeof(qctl)); + qctl.qc_cmd = LUSTRE_Q_SETQUOTA; + qctl.qc_type = UGQUOTA; /* UGQUOTA makes no sense for setquota, + * so it can be used as a marker that qc_type + * isn't reinitialized from command line */ + +#if 1 + /* compatibility syntax: [-u|-g] <user|group> b B i I mount + * will be removed in the future */ + if (argc == 8 && 
(!strcmp(argv[1], "-u") || !strcmp(argv[1], "-g")) && + argv[3][0] != '-' && argv[4][0] != '-' && argv[5][0] != '-' && + argv[6][0] != '-') { + fprintf(stderr, "warning: using compatibility syntax, it may not" + " be available in future releases!\n"); + + qctl.qc_type = !strcmp(argv[1], "-u") ? USRQUOTA : GRPQUOTA; + rc = name2id(&qctl.qc_id, argv[2], qctl.qc_type); if (rc) { - fprintf(stderr, "error: find id for name %s failed: %s\n", - argv[optind - 1], strerror(errno)); + fprintf(stderr, "error: unknown id %s\n", optarg); return CMD_HELP; } - ARG2ULL(dqb->dqb_bsoftlimit, argv[optind++], "block-softlimit"); - ARG2ULL(dqb->dqb_bhardlimit, argv[optind++], "block-hardlimit"); - ARG2ULL(dqb->dqb_isoftlimit, argv[optind++], "inode-softlimit"); - ARG2ULL(dqb->dqb_ihardlimit, argv[optind++], "inode-hardlimit"); + ARG2ULL(dqb->dqb_bsoftlimit, argv[3], 1024); + dqb->dqb_bsoftlimit >>= 10; + ARG2ULL(dqb->dqb_bhardlimit, argv[4], 1024); + dqb->dqb_bhardlimit >>= 10; + ARG2ULL(dqb->dqb_isoftlimit, argv[5], 1); + ARG2ULL(dqb->dqb_ihardlimit, argv[6], 1); dqb->dqb_valid = QIF_LIMITS; - } else { - struct obd_dqinfo *dqi = &qctl.qc_dqinfo; + mnt = argv[argc - 1]; + goto quotactl; + } +#endif - if (optind + 3 != argc) + optind = 0; + while ((c = getopt_long(argc, argv, "u:g:b:B:i:I:", long_opts, NULL)) != -1) { + switch (c) { + case 'u': + case 'g': + if (qctl.qc_type != UGQUOTA) { + fprintf(stderr, "error: -u and -g can't be used" + " more than once\n"); + return CMD_HELP; + } + qctl.qc_type = (c == 'u') ? 
USRQUOTA : GRPQUOTA; + rc = name2id(&qctl.qc_id, optarg, qctl.qc_type); + if (rc) { + fprintf(stderr, "error: unknown id %s\n", + optarg); + return CMD_HELP; + } + break; + case 'b': + ARG2ULL(dqb->dqb_bsoftlimit, optarg, 1024); + dqb->dqb_bsoftlimit >>= 10; + limit_mask |= BSLIMIT; + break; + case 'B': + ARG2ULL(dqb->dqb_bhardlimit, optarg, 1024); + dqb->dqb_bhardlimit >>= 10; + limit_mask |= BHLIMIT; + break; + case 'i': + ARG2ULL(dqb->dqb_isoftlimit, optarg, 1); + limit_mask |= ISLIMIT; + break; + case 'I': + ARG2ULL(dqb->dqb_ihardlimit, optarg, 1); + limit_mask |= IHLIMIT; + break; + default: /* getopt prints error message for us when opterr != 0 */ return CMD_HELP; + } + } + + if (qctl.qc_type == UGQUOTA) { + fprintf(stderr, "error: neither -u nor -g are specified\n"); + return CMD_HELP; + } - if ((dqi->dqi_bgrace = str2sec(argv[optind++])) == ULONG_MAX) { - fprintf(stderr, "error: bad %s: %s\n", "block-grace", argv[optind - 1]); + if (optind != argc - 1) { + fprintf(stderr, "error: unexpected parameters encountered\n"); + return CMD_HELP; + } + + mnt = argv[optind]; + + if ((!(limit_mask & BHLIMIT) ^ !(limit_mask & BSLIMIT)) || + (!(limit_mask & IHLIMIT) ^ !(limit_mask & ISLIMIT))) { + /* sigh, we can't just set blimits/ilimits */ + struct if_quotactl tmp_qctl = {.qc_cmd = LUSTRE_Q_GETQUOTA, + .qc_type = qctl.qc_type, + .qc_id = qctl.qc_id}; + + rc = llapi_quotactl(mnt, &tmp_qctl); + if (rc < 0) { + fprintf(stderr, "error: getquota failed\n"); return CMD_HELP; } - if ((dqi->dqi_igrace = str2sec(argv[optind++])) == ULONG_MAX) { - fprintf(stderr, "error: bad %s: %s\n", "inode-grace", argv[optind - 1]); - return CMD_HELP; - } + + if (!(limit_mask & BHLIMIT)) + dqb->dqb_bhardlimit = tmp_qctl.qc_dqblk.dqb_bhardlimit; + if (!(limit_mask & BSLIMIT)) + dqb->dqb_bsoftlimit = tmp_qctl.qc_dqblk.dqb_bsoftlimit; + if (!(limit_mask & IHLIMIT)) + dqb->dqb_ihardlimit = tmp_qctl.qc_dqblk.dqb_ihardlimit; + if (!(limit_mask & ISLIMIT)) + dqb->dqb_isoftlimit = 
tmp_qctl.qc_dqblk.dqb_isoftlimit; } - mnt = argv[optind]; + dqb->dqb_valid |= (limit_mask & (BHLIMIT | BSLIMIT)) ? QIF_BLIMITS : 0; + dqb->dqb_valid |= (limit_mask & (IHLIMIT | ISLIMIT)) ? QIF_ILIMITS : 0; +#if 1 +quotactl: +#endif rc = llapi_quotactl(mnt, &qctl); if (rc) { if (*obd_type) @@ -1770,21 +2033,30 @@ out: static int lfs_quota(int argc, char **argv) { int c; - char *name = NULL, *mnt; + char *mnt, *name = NULL; struct if_quotactl qctl = { .qc_cmd = LUSTRE_Q_GETQUOTA, - .qc_type = 0x01 }; + .qc_type = UGQUOTA }; char *obd_type = (char *)qctl.obd_type; char *obd_uuid = (char *)qctl.obd_uuid.uuid; - int rc, rc1 = 0, rc2 = 0, rc3 = 0; + int rc, rc1 = 0, rc2 = 0, rc3 = 0, verbose = 0; + int pass = 0; optind = 0; - while ((c = getopt(argc, argv, "ugto:")) != -1) { + while ((c = getopt(argc, argv, "ugto:v")) != -1) { switch (c) { case 'u': - qctl.qc_type = 0x01; + if (qctl.qc_type != UGQUOTA) { + fprintf(stderr, "error: use either -u or -g\n"); + return CMD_HELP; + } + qctl.qc_type = USRQUOTA; break; case 'g': - qctl.qc_type = 0x02; + if (qctl.qc_type != UGQUOTA) { + fprintf(stderr, "error: use either -u or -g\n"); + return CMD_HELP; + } + qctl.qc_type = GRPQUOTA; break; case 't': qctl.qc_cmd = LUSTRE_Q_GETINFO; @@ -1792,6 +2064,9 @@ static int lfs_quota(int argc, char **argv) case 'o': strncpy(obd_uuid, optarg, sizeof(qctl.obd_uuid)); break; + case 'v': + verbose = 1; + break; default: fprintf(stderr, "error: %s: option '-%c' " "unrecognized\n", argv[0], c); @@ -1799,11 +2074,23 @@ static int lfs_quota(int argc, char **argv) } } - if (qctl.qc_type) - qctl.qc_type--; - - - if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) { + /* current uid/gid info for "lfs quota /path/to/lustre/mount" */ + if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA && qctl.qc_type == UGQUOTA && + optind == argc - 1) { +ug_output: + memset(&qctl, 0, sizeof(qctl)); /* spoiled by print_*_quota */ + qctl.qc_cmd = LUSTRE_Q_GETQUOTA; + if (pass++ == 0) { + qctl.qc_type = USRQUOTA; + qctl.qc_id = geteuid(); + } else 
{ + qctl.qc_type = GRPQUOTA; + qctl.qc_id = getegid(); + } + rc = id2name(&name, qctl.qc_id, qctl.qc_type); + if (rc) + name = "<unknown>"; + } else if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) { if (optind + 2 != argc) { fprintf(stderr, "error: missing quota argument(s)\n"); return CMD_HELP; @@ -1816,34 +2103,31 @@ static int lfs_quota(int argc, char **argv) name, strerror(errno)); return CMD_HELP; } - print_quota_title(name, &qctl); } else if (optind + 1 != argc) { fprintf(stderr, "error: missing quota info argument(s)\n"); return CMD_HELP; } + if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) + print_quota_title(name, &qctl); + mnt = argv[optind]; rc1 = llapi_quotactl(mnt, &qctl); if (rc1 == -1 && errno == ESRCH) { fprintf(stderr, "\n%s quotas are not enabled.\n", - qctl.qc_type == 0x00 ? "user" : "group"); - return 0; + qctl.qc_type == USRQUOTA ? "user" : "group"); + goto out; } if (rc1 && *obd_type) fprintf(stderr, "%s %s ", obd_type, obd_uuid); - if (!name) - rc = id2name(&name, getuid(), qctl.qc_type); - - if (*obd_uuid) { + if (*obd_uuid) mnt = ""; - name = obd_uuid; - } print_quota(mnt, &qctl, GENERAL_QUOTA_INFO); - if (!*obd_uuid && qctl.qc_cmd != LUSTRE_Q_GETINFO) { + if (!*obd_uuid && qctl.qc_cmd != LUSTRE_Q_GETINFO && verbose) { rc2 = print_mds_quota(mnt, &qctl); rc3 = print_lov_quota(mnt, &qctl); } @@ -1852,6 +2136,11 @@ static int lfs_quota(int argc, char **argv) printf("Some errors happened when getting quota info. " "Some devices may be not working or deactivated. 
" "The data in \"[]\" is inaccurate.\n"); + +out: + if (pass == 1) + goto ug_output; + return 0; } #endif /* HAVE_QUOTA_SUPPORT */ diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 40f2f205a40d3c4dd7ef04f55b3cc7ce4e88be23..4950022730bf0e0874769b30f7fe47aa5f5afd4b 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -24,7 +24,9 @@ */ /* for O_DIRECTORY */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include <stdlib.h> #include <stdio.h> @@ -143,18 +145,23 @@ void llapi_printf(int level, char *fmt, ...) va_end(args); } +/* size_units is unchanged if no specifier used */ int parse_size(char *optarg, unsigned long long *size, - unsigned long long *size_units) + unsigned long long *size_units, int bytes_spec) { char *end; - *size = strtoul(optarg, &end, 0); + *size = strtoull(optarg, &end, 0); if (*end != '\0') { if ((*end == 'b') && *(end+1) == '\0' && - (*size & (~0ULL << (64 - 9))) == 0) { + (*size & (~0ULL << (64 - 9))) == 0 && + !bytes_spec) { *size <<= 9; *size_units = 1 << 9; + } else if ((*end == 'b') && *(end+1) == '\0' && + bytes_spec) { + *size_units = 1; } else if ((*end == 'k' || *end == 'K') && *(end+1) == '\0' && (*size & (~0ULL << (64 - 10))) == 0) { @@ -322,6 +329,34 @@ static void find_param_fini(struct find_param *param) free(param->lmd); } +int llapi_file_get_lov_fuuid(int fd, struct obd_uuid *lov_name) +{ + int rc = ioctl(fd, OBD_IOC_GETNAME, lov_name); + if (rc) { + rc = errno; + llapi_err(LLAPI_MSG_ERROR, "error: can't get lov name."); + } + return rc; +} + +int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid) +{ + int fd, rc; + + fd = open(path, O_RDONLY); + if (fd < 0) { + rc = errno; + llapi_err(LLAPI_MSG_ERROR, "error opening %s\n", path); + return rc; + } + + rc = llapi_file_get_lov_fuuid(fd, lov_uuid); + + close(fd); + + return rc; +} + /* * If uuidp is NULL, return the number of available obd uuids. * If uuidp is non-NULL, then it will return the uuids of the obds. 
If @@ -330,22 +365,19 @@ static void find_param_fini(struct find_param *param) */ int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count) { - char lov_name[sizeof(struct obd_uuid)]; + struct obd_uuid lov_name; char buf[1024]; FILE *fp; int rc = 0, index = 0; /* Get the lov name */ - rc = ioctl(fd, OBD_IOC_GETNAME, (void *) lov_name); - if (rc) { - rc = errno; - llapi_err(LLAPI_MSG_ERROR, "error: can't get lov name"); + rc = llapi_file_get_lov_fuuid(fd, &lov_name); + if (rc) return rc; - } /* Now get the ost uuids from /proc */ snprintf(buf, sizeof(buf), "/proc/fs/lustre/lov/%s/target_obd", - lov_name); + lov_name.uuid); fp = fopen(buf, "r"); if (fp == NULL) { rc = errno; @@ -374,13 +406,14 @@ int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count) * returned in param->obdindex */ static int setup_obd_uuid(DIR *dir, char *dname, struct find_param *param) { + struct obd_uuid lov_uuid; char uuid[sizeof(struct obd_uuid)]; char buf[1024]; FILE *fp; int rc = 0, index; /* Get the lov name */ - rc = ioctl(dirfd(dir), OBD_IOC_GETNAME, (void *)uuid); + rc = llapi_file_get_lov_fuuid(dirfd(dir), &lov_uuid); if (rc) { if (errno != ENOTTY) { rc = errno; @@ -396,7 +429,7 @@ static int setup_obd_uuid(DIR *dir, char *dname, struct find_param *param) /* Now get the ost uuids from /proc */ snprintf(buf, sizeof(buf), "/proc/fs/lustre/lov/%s/target_obd", - uuid); + lov_uuid.uuid); fp = fopen(buf, "r"); if (fp == NULL) { rc = errno; @@ -1064,6 +1097,26 @@ static int cb_find_init(char *path, DIR *parent, DIR *dir, } } + if (param->check_uid) { + if (st->st_uid == param->uid) { + if (param->exclude_uid) + goto decided; + } else { + if (!param->exclude_uid) + goto decided; + } + } + + if (param->check_gid) { + if (st->st_gid == param->gid) { + if (param->exclude_gid) + goto decided; + } else { + if (!param->exclude_gid) + goto decided; + } + } + /* Check the time on mds. 
*/ if (!decision) { int for_mds; diff --git a/lustre/utils/ll_recover_lost_found_objs.c b/lustre/utils/ll_recover_lost_found_objs.c new file mode 100644 index 0000000000000000000000000000000000000000..33ee9c8e4ad417b80e39c4ad7bde4e9034d73eb7 --- /dev/null +++ b/lustre/utils/ll_recover_lost_found_objs.c @@ -0,0 +1,453 @@ +/* + * Copyright (C) 2008 Sun Microssystems, Inc. + * Author: Rupesh Thakare <rupesh.thakare@sun.com> + * Author: Kalpak Shah <kalpak.shah@sun.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* + * Tool for recovering objects from lost+found that might result from a + * Lustre OST with a corrupted directory. Running e2fsck will fix the + * directory, but puts all of the objects into lost+found, where they are + * inaccessible to Lustre. 
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <dirent.h>
+#include <sys/types.h>
+#include <sys/xattr.h>
+#include <sys/stat.h>
+
+#include <liblustre.h>
+
+#define MAX_GROUPS 64
+
+int verbose = 0;
+
+/* Per-group state: dir_exists is set once "<mount>/O/<group>" is known to exist. */
+struct obd_group_info {
+        int dir_exists;
+};
+struct obd_group_info grp_info[MAX_GROUPS];
+
+/* Print the command line synopsis on stderr and exit with status 1. */
+void usage(char *progname)
+{
+        fprintf(stderr, "Usage: %s [-hv] -d lost+found_directory\n", progname);
+        fprintf(stderr, "You need to mount the corrupted OST filesystem and "
+                "provide the path for the lost+found directory as the -d "
+                "option, for example:\n"
+                "ll_recover_lost_found_objs -d /mnt/ost/lost+found\n");
+        exit(1);
+}
+
+/*
+ * Make sure the object sub-directory \a dest_path exists, creating its parent
+ * group directory "<mount>/O/<ff_group>" first when necessary.
+ *
+ * The caller must have checked that \a ff_group < MAX_GROUPS.  Returns 0 on
+ * success and 1 on failure, with errno left as set by the failing call.
+ */
+int mkdir_p(char *dest_path, char *mount, __u64 ff_group)
+{
+        struct stat stat_buf;
+        char tmp_path[PATH_MAX];
+        int retval;
+        mode_t mode = 0700;
+
+        if (stat(dest_path, &stat_buf) == 0)
+                return 0;
+
+        if (grp_info[ff_group].dir_exists == 0) {
+                snprintf(tmp_path, sizeof(tmp_path), "%s/O/"LPU64,
+                         mount, ff_group);
+                if (stat(tmp_path, &stat_buf) != 0) {
+                        retval = mkdir(tmp_path, mode);
+                        if (retval < 0) {
+                                int saved_errno = errno;
+
+                                fprintf(stderr, "error: creating directory %s: "
+                                        "%s\n", tmp_path, strerror(saved_errno));
+                                /* the caller reports strerror(errno) too */
+                                errno = saved_errno;
+                                return 1;
+                        }
+                }
+                /* Cache the result whether the directory was found or made. */
+                grp_info[ff_group].dir_exists = 1;
+        }
+
+        retval = mkdir(dest_path, mode);
+        if (retval < 0)
+                return 1;
+
+        return 0;
+}
+
+/*
+ * Read the little-endian 64-bit value stored in \a file_path and return it in
+ * CPU byte order.  Returns 0 if the file cannot be opened or fully read, so 0
+ * doubles as the error indication.
+ */
+__u64 read_last_id(char *file_path)
+{
+        __u64 last_id;
+        int fd;
+        int count;
+
+        fd = open(file_path, O_RDONLY);
+        if (fd < 0)
+                return 0;
+
+        count = read(fd, &last_id, sizeof(last_id));
+        if (count < 0) {
+                fprintf(stderr, "error: reading file %s: %s\n", file_path,
+                        strerror(errno));
+                close(fd);
+                return 0;
+        }
+        if (count != sizeof(last_id)) {
+                fprintf(stderr, "error: Could not read full last_id from %s\n",
+                        file_path);
+                close(fd);
+                return 0;
+        }
+
+        close(fd);
+        return le64_to_cpu(last_id);
+}
+
+/* Maps (st_mode & S_IFMT) to the matching dirent d_type value. */
+static unsigned filetype_dir_table[] = {
+        [0]= DT_UNKNOWN,
+        [S_IFIFO]= DT_FIFO,
+        [S_IFCHR] = DT_CHR,
+        [S_IFDIR] = DT_DIR,
+        [S_IFBLK] = DT_BLK,
+        [S_IFREG] = DT_REG,
+        [S_IFLNK] = DT_LNK,
+        [S_IFSOCK]= DT_SOCK,
+#if defined(DT_DOOR) && defined(S_IFDOOR)
+        [S_IFDOOR]= DT_DOOR,
+#endif
+};
+
+/*
+ * Walk \a src_dir recursively and move every regular file that carries a
+ * "trusted.fid" xattr back to "<mount_path>/O/<group>/d<objid % 32>/<objid>".
+ *
+ * \a src_dir is used as scratch space for building child paths, so it must be
+ * a writable buffer of at least PATH_MAX bytes; it is restored before return.
+ * Files without the xattr, with an out-of-range group, with an object ID
+ * above LAST_ID, or whose non-empty target already exists are skipped.
+ *
+ * Returns 0 on success, the errno of a failed opendir(), or otherwise a
+ * non-zero count of failures (failed renames, aborted traversals).
+ */
+static int traverse_lost_found(char *src_dir, char *mount_path)
+{
+        DIR *dir_ptr;
+        struct filter_fid trusted_fid;
+        struct dirent64 *dirent;
+        __u64 ff_group, ff_objid;
+        char file_path[PATH_MAX];
+        char dest_path[PATH_MAX];
+        char last_id_file[PATH_MAX];
+        __u64 last_id[MAX_GROUPS] = {0};
+        __u64 tmp_last_id;
+        struct stat st;
+        int obj_exists, xattr_len;
+        int len, ret = 0, error = 0;
+
+        len = strlen(src_dir);
+
+        dir_ptr = opendir(src_dir);
+        if (!dir_ptr) {
+                ret = errno;    /* fprintf() may clobber errno */
+                fprintf(stderr, "error: opening directory: %s\n",
+                        strerror(ret));
+                return ret;
+        }
+
+        while ((dirent = readdir64(dir_ptr)) != NULL) {
+                if (!strcmp(dirent->d_name, ".") ||
+                    !strcmp(dirent->d_name, ".."))
+                        continue;
+
+                src_dir[len] = 0;
+                if ((len + dirent->d_reclen + 2) > PATH_MAX) {
+                        fprintf(stderr, "error: %s: string buffer is too small\n",
+                                __FUNCTION__);
+                        /* the rest of this directory is not examined */
+                        error++;
+                        break;
+                }
+                strcat(src_dir, "/");
+                strcat(src_dir, dirent->d_name);
+
+                if (dirent->d_type == DT_UNKNOWN) {
+                        /* filesystem gave no type hint: fall back to stat() */
+                        ret = stat(src_dir, &st);
+                        if (ret == 0)
+                                dirent->d_type = filetype_dir_table[st.st_mode &
+                                                                    S_IFMT];
+                }
+
+                switch(dirent->d_type) {
+                case DT_DIR:
+                        ret = traverse_lost_found(src_dir, mount_path);
+                        if (ret) {
+                                /* do not lose the failure of the sub-tree */
+                                error++;
+                                goto out;
+                        }
+                        break;
+
+                case DT_REG:
+                        snprintf(file_path, sizeof(file_path), "%s", src_dir);
+                        xattr_len = getxattr(file_path, "trusted.fid",
+                                             (void *)&trusted_fid,
+                                             sizeof(trusted_fid));
+
+                        if (xattr_len < 0 ||
+                            (size_t)xattr_len < sizeof(trusted_fid)) {
+                                /*
+                                 * Its very much possible that we dont find fid
+                                 * on precreated files, LAST_ID
+                                 */
+                                continue;
+                        }
+
+                        ff_group = le64_to_cpu(trusted_fid.ff_group);
+                        ff_objid = le64_to_cpu(trusted_fid.ff_objid);
+
+                        if (ff_group >= MAX_GROUPS) {
+                                fprintf(stderr, "error: invalid group "LPU64" likely "
+                                        "indicates a corrupt xattr for file %s.\n",
+                                        ff_group, file_path);
+                                continue;
+                        }
+
+                        /* might need to create the parent directories for this object */
+                        snprintf(dest_path, sizeof(dest_path),
+                                 "%s/O/"LPU64"/d"LPU64, mount_path, ff_group,
+                                 ff_objid % 32);
+
+                        ret = mkdir_p(dest_path, mount_path, ff_group);
+                        if (ret) {
+                                fprintf(stderr, "error: creating directory %s : %s\n",
+                                        dest_path, strerror(errno));
+                                /* make sure the caller sees a failure */
+                                error++;
+                                goto out;
+                        }
+
+                        /*
+                         * Object ID needs to be verified against last_id.
+                         * LAST_ID file may not be present in the group directory
+                         * due to corruption. In case of any error try to recover
+                         * as many objects as possible by setting last_id to ~0ULL.
+                         */
+                        if (last_id[ff_group] == 0) {
+                                snprintf(last_id_file, sizeof(last_id_file),
+                                         "%s/O/"LPU64"/LAST_ID",
+                                         mount_path, ff_group);
+                                tmp_last_id = read_last_id(last_id_file);
+
+                                if (tmp_last_id == 0)
+                                        tmp_last_id = ~0ULL;
+                                last_id[ff_group] = tmp_last_id;
+                        }
+
+                        if (ff_objid > last_id[ff_group]) {
+                                fprintf(stderr, "error: file skipped because object ID "
+                                        "greater than LAST_ID\nFilename: %s\n"
+                                        "Group: "LPU64"\nObjectid: "LPU64"\n"
+                                        "LAST_ID: "LPU64"\n", file_path, ff_group,
+                                        ff_objid, last_id[ff_group]);
+                                continue;
+                        }
+
+                        /* move file from lost+found to proper object directory */
+                        snprintf(dest_path, sizeof(dest_path),
+                                 "%s/O/"LPU64"/d"LPU64"/"LPU64, mount_path,
+                                 ff_group, ff_objid % 32, ff_objid);
+
+                        /* an empty (precreated) target may be overwritten */
+                        obj_exists = 1;
+                        ret = stat(dest_path, &st);
+                        if (ret == 0) {
+                                if (st.st_size == 0)
+                                        obj_exists = 0;
+                        } else if (ret < 0 && errno == ENOENT) {
+                                obj_exists = 0;
+                        }
+
+                        if (obj_exists) {
+                                fprintf(stderr, "error: target object %s already "
+                                        "exists and will not be replaced.\n",dest_path);
+                                continue;
+                        }
+
+                        if (rename(file_path, dest_path) < 0) {
+                                fprintf(stderr, "error: rename failed for file %s: %s\n",
+                                        file_path, strerror(errno));
+                                error++;
+                                continue;
+                        }
+
+                        printf("Object %s restored.\n", dest_path);
+                        break;
+
+                case DT_UNKNOWN:
+                        continue;
+                }
+        }
+out:
+        /* hand the path back to the caller the way it was passed in */
+        src_dir[len] = 0;
+        closedir(dir_ptr);
+
+        return error;
+}
+
+/*
+ * If LAST_ID file is not present in some group then restore it with the highest
+ * object ID found in that group. By the time we come here all possible objects
+ * have been restored.
+ *
+ * The value is stored little-endian, matching what read_last_id() expects.
+ * Returns 0 on success or a positive errno value on failure.
+ */
+int check_last_id(char *mount_path)
+{
+        char lastid_path[PATH_MAX];
+        char dirname[PATH_MAX], subdirname[PATH_MAX];
+        DIR *groupdir, *subdir;
+        struct stat st;
+        struct dirent *dirent, *objent;
+        unsigned long long group;
+        __u64 max_objid, last_id_le;
+        int fd;
+        int ret;
+
+        for (group = 0; group < MAX_GROUPS; group++) {
+                max_objid = 0;
+                snprintf(dirname, sizeof(dirname), "%s/O/"LPU64,
+                         mount_path, group);
+                snprintf(lastid_path, sizeof(lastid_path), "%s/LAST_ID",
+                         dirname);
+                if (stat(lastid_path, &st) == 0)
+                        continue;
+
+                groupdir = opendir(dirname);
+                if (groupdir == NULL) {
+                        if (errno != ENOENT)
+                                fprintf(stderr, "error: opening %s: %s\n",
+                                        dirname, strerror(errno));
+                        continue;
+                }
+
+                while ((dirent = readdir(groupdir)) != NULL) {
+                        if (!strcmp(dirent->d_name, ".") ||
+                            !strcmp(dirent->d_name, ".."))
+                                continue;
+
+                        snprintf(subdirname, sizeof(subdirname), "%s/%s",
+                                 dirname, dirent->d_name);
+
+                        subdir = opendir(subdirname);
+                        if (subdir == NULL) {
+                                fprintf(stderr, "error: opening %s: %s\n",
+                                        subdirname, strerror(errno));
+                                continue;
+                        }
+
+                        while ((objent = readdir(subdir)) != NULL) {
+                                __u64 objid;
+                                char *end;
+
+                                if (!strcmp(objent->d_name, ".") ||
+                                    !strcmp(objent->d_name, ".."))
+                                        continue;
+
+                                objid = strtoull(objent->d_name, &end, 0);
+                                if (end == objent->d_name || *end != 0) {
+                                        fprintf(stderr, "error: unknown object "
+                                                "ID %s/%s\n", subdirname,
+                                                objent->d_name);
+                                        continue;
+                                }
+                                if (objid > max_objid)
+                                        max_objid = objid;
+                        }
+                        closedir(subdir);
+                }
+                closedir(groupdir);
+
+                fd = open(lastid_path, O_RDWR | O_CREAT, 0700);
+                if (fd < 0) {
+                        ret = errno;    /* there is no descriptor to close */
+                        fprintf(stderr, "error: open \"%s\" failed: %s\n",
+                                lastid_path, strerror(ret));
+                        return ret;
+                }
+
+                last_id_le = cpu_to_le64(max_objid);
+                ret = write(fd, &last_id_le, sizeof(last_id_le));
+                if (ret != (int)sizeof(last_id_le)) {
+                        /* -1 is a real error, anything else a short write */
+                        ret = (ret < 0) ? errno : EIO;
+                        fprintf(stderr, "error: writing \"%s\" failed: %s\n",
+                                lastid_path, strerror(ret));
+                        close(fd);
+                        return ret;
+                }
+
+                close(fd);
+        }
+
+        return 0;
+}
+
+/*
+ * Parse the options, derive the OST mount point from the parent of the
+ * lost+found directory, restore orphan objects and rebuild missing LAST_ID
+ * files.  Exits non-zero on any failure.
+ */
+int main(int argc, char **argv)
+{
+        char *progname;
+        char *src_dir = NULL, *last_dir = NULL;
+        struct stat stat_buf;
+        char src_path[PATH_MAX];
+        char tmp_path[PATH_MAX];
+        char mount_path[PATH_MAX] = {0};
+        int c;          /* getopt() returns int; char may be unsigned */
+        int retval;
+
+        progname = argv[0];
+
+        while ((c = getopt(argc, argv, "d:hv")) != -1) {
+                switch (c) {
+                case 'd':
+                        /*
+                         * traverse_lost_found() appends to this path in place,
+                         * so it needs a private PATH_MAX buffer, not optarg.
+                         */
+                        if (strlen(optarg) >= sizeof(src_path)) {
+                                fprintf(stderr, "%s: path too long: %s\n",
+                                        progname, optarg);
+                                return 1;
+                        }
+                        strcpy(src_path, optarg);
+                        src_dir = src_path;
+                        /* Trim last '/' */
+                        last_dir = strrchr(src_dir, '/');
+                        if (last_dir != strchr(src_dir, '/')) {
+                                if (last_dir != NULL && (*(last_dir + 1) == '\0'))
+                                        *(last_dir) = '\0';
+                        }
+                        fprintf(stdout, "\"lost+found\" directory path: %s\n",
+                                src_dir);
+                        break;
+                case 'v':
+                        verbose = 1;
+                        break;
+                case 'h':
+                        usage(progname);        /* does not return */
+                default:
+                        /* getopt() returned '?': the culprit is in optopt */
+                        fprintf(stderr, "%s: bad option '%c'\n",
+                                progname, optopt);
+                        usage(progname);
+                }
+        }
+
+        if (src_dir == NULL)
+                usage(progname);
+
+        last_dir = strrchr(src_dir, '/');
+        if (last_dir == NULL) {
+                /* Current directory */
+                retval = snprintf(mount_path, sizeof(mount_path), "%s/..",
+                                  src_dir);
+        } else {
+                retval = snprintf(mount_path, sizeof(mount_path), "%.*s",
+                                  (int)(last_dir - src_dir), src_dir);
+        }
+        if (retval < 0 || retval >= (int)sizeof(mount_path)) {
+                fprintf(stderr, "%s: path too long: %s\n", progname, src_dir);
+                return 1;
+        }
+
+        /* Check if 'O' directory exists and create it if needed */
+        retval = snprintf(tmp_path, sizeof(tmp_path), "%s/O", mount_path);
+        if (retval < 0 || retval >= (int)sizeof(tmp_path)) {
+                fprintf(stderr, "%s: path too long: %s\n", progname, mount_path);
+                return 1;
+        }
+        if (stat(tmp_path, &stat_buf) != 0) {
+                retval = mkdir(tmp_path, 0700);
+                if (retval < 0) {
+                        retval = errno;
+                        fprintf(stderr, "error: creating objects directory %s:"
+                                " %s\n", tmp_path, strerror(retval));
+                        return retval;
+                }
+        }
+
+        memset(grp_info, 0, sizeof(grp_info));
+
+        retval = traverse_lost_found(src_dir, mount_path);
+        if (retval) {
+                fprintf(stderr, "error: traversing lost+found looking for "
+                        "orphan objects.\n");
+                return retval;
+        }
+
+        retval = check_last_id(mount_path);
+        if (retval)
+                fprintf(stderr, "error: while checking/restoring LAST_ID.\n");
+
+        return retval;
+}
diff --git a/lustre/utils/lr_reader.c b/lustre/utils/lr_reader.c index f1275b395ce2e615b06eae0965bcc290ea06566c..7c3bb5e4f1db32b1529aa6d24ae217590ef4e68a 100644 --- a/lustre/utils/lr_reader.c +++ b/lustre/utils/lr_reader.c @@ -22,7 +22,9 @@ */ /* Safely read the last_rcvd file from a device */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif
#include <stdlib.h> #include <stdio.h> #include <unistd.h> diff --git a/lustre/utils/lustre_cfg.c b/lustre/utils/lustre_cfg.c index 865d115d4828bac23a748864351bb93d3206307c..9d382a3b5133fc9e5772d58f472063f324c55404 100644 --- a/lustre/utils/lustre_cfg.c +++ b/lustre/utils/lustre_cfg.c @@ -547,13 +547,18 @@ static char *strnchr(const char *p, char c, size_t n) int jt_lcfg_getparam(int argc, char **argv) { int fp; - int rc = 0, i, show_path = 0; + int rc = 0, i, show_path = 0, only_path = 0; char pattern[PATH_MAX]; char *path, *tmp, *buf; glob_t glob_info; - if (argc == 3 && strcmp(argv[1], "-n") == 0) { + if (argc == 3 && (strcmp(argv[1], "-n") == 0 || + strcmp(argv[1], "-N") == 0)) { path = argv[2]; + if (strcmp(argv[1], "-N") == 0) { + only_path = 1; + show_path = 1; + } } else if (argc == 2) { show_path = 1; path = argv[1]; @@ -596,6 +601,10 @@ int jt_lcfg_getparam(int argc, char **argv) char *filename; filename = strdup(glob_info.gl_pathv[i]); valuename = display_name(filename); + if (valuename && only_path) { + printf("%s\n", valuename); + continue; + } } /* Write the contents of file to stdout */ @@ -723,8 +732,8 @@ int jt_lcfg_setparam(int argc, char **argv) rc = write(fp, value, strlen(value)); if (rc < 0) fprintf(stderr, - "error writing to file %s\n", - glob_info.gl_pathv[i]); + "error writing to file %s (%s)\n", + glob_info.gl_pathv[i], strerror(errno)); else rc = 0; close(fp); diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c index d872ecf6363503d83979676a6fa89ae1cb424532..1cd478d430239bce7e4a526561b654f6f95a0713 100644 --- a/lustre/utils/mkfs_lustre.c +++ b/lustre/utils/mkfs_lustre.c @@ -22,7 +22,9 @@ */ /* This source file is compiled into both mkfs.lustre and tunefs.lustre */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include <stdlib.h> #include <stdio.h> #include <unistd.h> @@ -50,6 +52,7 @@ #include <lustre_param.h> #include <lnet/lnetctl.h> #include <lustre_ver.h> +#include "mount_utils.h" #ifndef PATH_MAX #define PATH_MAX 
4096 @@ -73,8 +76,8 @@ struct mkfs_opts { int mo_mgs_failnodes; }; -static char *progname; -static int verbose = 1; +char *progname; +int verbose = 1; static int print_only = 0; static int failover = 0; @@ -125,12 +128,6 @@ void usage(FILE *out) #define vprint if (verbose > 0) printf #define verrprint if (verbose >= 0) printf -static void fatal(void) -{ - verbose = 0; - fprintf(stderr, "\n%s FATAL: ", progname); -} - /*================ utility functions =====================*/ char *strscat(char *dst, char *src, int buflen) { @@ -185,47 +182,6 @@ int get_os_version() return version; } -int run_command(char *cmd, int cmdsz) -{ - char log[] = "/tmp/mkfs_logXXXXXX"; - int fd = -1, rc; - - if ((cmdsz - strlen(cmd)) < 6) { - fatal(); - fprintf(stderr, "Command buffer overflow: %.*s...\n", - cmdsz, cmd); - return ENOMEM; - } - - if (verbose > 1) { - printf("cmd: %s\n", cmd); - } else { - if ((fd = mkstemp(log)) >= 0) { - close(fd); - strcat(cmd, " >"); - strcat(cmd, log); - } - } - strcat(cmd, " 2>&1"); - - /* Can't use popen because we need the rv of the command */ - rc = system(cmd); - if (rc && (fd >= 0)) { - char buf[128]; - FILE *fp; - fp = fopen(log, "r"); - if (fp) { - while (fgets(buf, sizeof(buf), fp) != NULL) { - printf(" %s", buf); - } - fclose(fp); - } - } - if (fd >= 0) - remove(log); - return rc; -} - static int check_mtab_entry(char *spec) { FILE *fp; @@ -441,6 +397,7 @@ static int file_in_dev(char *file_name, char *dev_name) if (strstr(debugfs_cmd, "unsupported feature")) { disp_old_e2fsprogs_msg("an unknown", 0); } + pclose(fp); return -1; } pclose(fp); @@ -526,11 +483,16 @@ static void enable_default_backfs_features(struct mkfs_opts *mop) strscat(mop->mo_mkfsopts, " -O dir_index", sizeof(mop->mo_mkfsopts)); + /* Upstream e2fsprogs called our uninit_groups feature uninit_bg, + * check for both of them when testing e2fsprogs features. 
*/ if (is_e2fsprogs_feature_supp("uninit_groups") == 0) strscat(mop->mo_mkfsopts, ",uninit_groups", sizeof(mop->mo_mkfsopts)); + else if (is_e2fsprogs_feature_supp("uninit_bg") == 0) + strscat(mop->mo_mkfsopts, ",uninit_bg", + sizeof(mop->mo_mkfsopts)); else - disp_old_e2fsprogs_msg("uninit_groups", 1); + disp_old_e2fsprogs_msg("uninit_bg", 1); ret = uname(&uts); if (ret) @@ -886,6 +848,10 @@ int read_local_files(struct mkfs_opts *mop) dev = mop->mo_device; + /* TODO: it's worth observing the get_mountdata() function that is + in mount_utils.c for getting the mountdata out of the + filesystem */ + /* Construct debugfs command line. */ snprintf(cmd, cmdsz, "debugfs -c -R 'dump /%s %s/mountdata' %s", MOUNT_DATA_FILE, tmpdir, dev); @@ -1204,12 +1170,6 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, return 1; rc = add_param(mop->mo_ldd.ldd_params, PARAM_FAILNODE, nids, 0); - /* Combo needs to add MDT failnodes as MGS failnodes - as well */ - if (!rc && IS_MGS(&mop->mo_ldd)) { - rc = add_param(mop->mo_ldd.ldd_params, - PARAM_MGSNODE, nids, 0); - } free(nids); if (rc) return rc; diff --git a/lustre/utils/mount_lustre.c b/lustre/utils/mount_lustre.c index 004890ccad88869893cdf23d4f833982b7b7243d..99d58abecbe65e418746257a7ebde90816f27119 100644 --- a/lustre/utils/mount_lustre.c +++ b/lustre/utils/mount_lustre.c @@ -23,7 +23,9 @@ */ +#ifndef _GNU_SOURCE #define _GNU_SOURCE +#endif #include <stdlib.h> #include <stdio.h> #include <unistd.h> @@ -33,12 +35,12 @@ #include <sys/mount.h> #include <mntent.h> #include <getopt.h> -#include <sys/utsname.h> #include "obdctl.h" #include <lustre_ver.h> #include <glob.h> #include <ctype.h> #include <limits.h> +#include "mount_utils.h" #define MAX_HW_SECTORS_KB_PATH "queue/max_hw_sectors_kb" #define MAX_SECTORS_KB_PATH "queue/max_sectors_kb" @@ -47,7 +49,7 @@ int verbose = 0; int nomtab = 0; int fake = 0; int force = 0; -static char *progname = NULL; +char *progname = NULL; void usage(FILE *out) { @@ -69,6 +71,7 @@ 
void usage(FILE *out) "\t<mntopt>: one or more comma separated of:\n" "\t\t(no)flock,(no)user_xattr,(no)acl\n" "\t\tnosvc: only start MGC/MGS obds\n" + "\t\tnomgs: only start target obds, using existing MGS\n" "\t\texclude=<ostname>[:<ostname>] : colon-separated list of " "inactive OSTs (e.g. lustre-OST0001)\n" ); @@ -552,6 +555,8 @@ int main(int argc, char *const argv[]) " (may cause reduced IO performance)", argv[0], source); + register_service_tags(usource, source, target); + if (!fake) /* flags and target get to lustre_get_sb, but not lustre_fill_super. Lustre ignores the flags, but mount diff --git a/lustre/utils/mount_utils.c b/lustre/utils/mount_utils.c new file mode 100644 index 0000000000000000000000000000000000000000..f2f288556e92a72673c5f9e4cb0439f5589bf18f --- /dev/null +++ b/lustre/utils/mount_utils.c
@@ -0,0 +1,282 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <config.h>
+#include <lustre_disk.h>
+#include <lustre_ver.h>
+#include <sys/stat.h>
+#include <sys/utsname.h>
+
+extern char *progname;
+extern int verbose;
+
+/* Wrapped in do/while (0) so the macros stay safe in unbraced if/else. */
+#define vprint(fmt, arg...) \
+        do { if (verbose > 0) printf(fmt, ##arg); } while (0)
+#define verrprint(fmt, arg...) \
+        do { if (verbose >= 0) fprintf(stderr, fmt, ##arg); } while (0)
+
+/* Silence further verbose output and start a "FATAL" line on stderr. */
+void fatal(void)
+{
+        verbose = 0;
+        fprintf(stderr, "\n%s FATAL: ", progname);
+}
+
+/*
+ * Run \a cmd through system(), appending output redirections to it in place.
+ *
+ * \a cmd must be a writable buffer of \a cmdsz bytes.  Unless verbose > 1,
+ * stdout goes to a temporary log that is replayed only when the command
+ * fails.  Returns the system() status, or ENOMEM when the redirections
+ * would not fit into the buffer.
+ */
+int run_command(char *cmd, int cmdsz)
+{
+        char log[] = "/tmp/run_command_logXXXXXX";
+        int fd = -1, rc;
+        size_t need;
+
+        /* Room for " 2>&1" and the NUL, plus " >" and the log name if used. */
+        need = strlen(cmd) + strlen(" 2>&1") + 1;
+        if (verbose <= 1)
+                need += strlen(" >") + strlen(log);
+        if (cmdsz < 0 || need > (size_t)cmdsz) {
+                fatal();
+                fprintf(stderr, "Command buffer overflow: %.*s...\n",
+                        cmdsz, cmd);
+                return ENOMEM;
+        }
+
+        if (verbose > 1) {
+                printf("cmd: %s\n", cmd);
+        } else {
+                if ((fd = mkstemp(log)) >= 0) {
+                        close(fd);
+                        strcat(cmd, " >");
+                        strcat(cmd, log);
+                }
+        }
+        strcat(cmd, " 2>&1");
+
+        /* Can't use popen because we need the rv of the command */
+        rc = system(cmd);
+        if (rc && (fd >= 0)) {
+                char buf[128];
+                FILE *fp;
+                fp = fopen(log, "r");
+                if (fp) {
+                        while (fgets(buf, sizeof(buf), fp) != NULL) {
+                                printf(" %s", buf);
+                        }
+                        fclose(fp);
+                }
+        }
+        if (fd >= 0)
+                remove(log);
+        return rc;
+}
+
+/*
+ * Read the on-disk mount data of device \a dev into \a mo_ldd.
+ *
+ * The mountdata file is dumped with debugfs into a private temporary
+ * directory, read back, and the directory is removed again.  Returns 0 on
+ * success, otherwise an errno value or the status of the failed command.
+ */
+int get_mountdata(char *dev, struct lustre_disk_data *mo_ldd)
+{
+        char tmpdir[] = "/tmp/lustre_tmp.XXXXXX";
+        char cmd[256];
+        char filepnm[128];
+        FILE *filep;
+        int ret = 0;
+        int ret2 = 0;
+        int cmdsz = sizeof(cmd);
+
+        /* Make a temporary directory to hold Lustre data files. */
+        if (!mkdtemp(tmpdir)) {
+                ret = errno;    /* save it: printing may clobber errno */
+                verrprint("%s: Can't create temporary directory %s: %s\n",
+                          progname, tmpdir, strerror(ret));
+                return ret;
+        }
+
+        ret = snprintf(cmd, cmdsz,
+                       "/sbin/debugfs -c -R 'dump /%s %s/mountdata' %s",
+                       MOUNT_DATA_FILE, tmpdir, dev);
+        if (ret < 0 || ret >= cmdsz) {
+                verrprint("%s: debugfs command for %s is too long\n",
+                          progname, dev);
+                ret = ENAMETOOLONG;
+                goto out_rmdir;
+        }
+
+        ret = run_command(cmd, cmdsz);
+        if (ret) {
+                verrprint("%s: Unable to dump %s dir (%d)\n",
+                          progname, MOUNT_CONFIGS_DIR, ret);
+                goto out_rmdir;
+        }
+
+        snprintf(filepnm, sizeof(filepnm), "%s/mountdata", tmpdir);
+        filep = fopen(filepnm, "r");
+        if (filep == NULL) {
+                /* Nothing to close: go straight to the directory cleanup. */
+                ret = errno ? errno : ENOENT;
+                verrprint("%s: Unable to read %d.%d config %s.\n",
+                          progname, LUSTRE_MAJOR, LUSTRE_MINOR, filepnm);
+                goto out_rmdir;
+        }
+
+        vprint("Reading %s\n", MOUNT_DATA_FILE);
+        /* Zero first so a short file leaves the tail in a known state. */
+        memset(mo_ldd, 0, sizeof(*mo_ldd));
+        if (fread(mo_ldd, 1, sizeof(*mo_ldd), filep) == 0) {
+                verrprint("%s: %s is empty or unreadable\n",
+                          progname, filepnm);
+                ret = EIO;
+        }
+        fclose(filep);
+
+out_rmdir:
+        snprintf(cmd, cmdsz, "rm -rf %s", tmpdir);
+        ret2 = run_command(cmd, cmdsz);
+        if (ret2) {
+                verrprint("Failed to remove temp dir %s (%d)\n", tmpdir, ret2);
+                /* failure return from run_command() is more important
+                 * than the failure to remove a dir */
+                if (!ret)
+                        ret = ret2;
+        }
+
+        return ret;
+}
+
+#define PARENT_URN "urn:uuid:2bb5bdbf-6c4b-11dc-9b8e-080020a9ed93"
+#define PARENT_PRODUCT "Lustre"
+
+/*
+ * Register a service tag for Lustre component \a type ("Client", "MDS",
+ * "MGS" or "OSS") on architecture \a arch, unless the stclient tool already
+ * reports a record for it.  Returns 0 when nothing had to be done, EINVAL
+ * for an unknown \a type, otherwise the status of the stclient command.
+ */
+static int stclient(char *type, char *arch)
+{
+        char product[64];
+        char *urn = NULL;
+        char cmd[1024];
+        FILE *fp;
+        size_t len;
+
+        if (strcmp(type, "Client") == 0)
+                urn = CLIENT_URN;
+        else if (strcmp(type, "MDS") == 0)
+                urn = MDS_URN;
+        else if (strcmp(type, "MGS") == 0)
+                urn = MGS_URN;
+        else if (strcmp(type, "OSS") == 0)
+                urn = OSS_URN;
+        else
+                return EINVAL;
+
+        snprintf(product, sizeof(product), "Lustre %s %d.%d.%d", type,
+                 LUSTRE_MAJOR, LUSTRE_MINOR, LUSTRE_PATCH);
+
+        /* need to see if the entry exists first */
+        snprintf(cmd, sizeof(cmd),
+                 "/opt/sun/servicetag/bin/stclient -f -t '%s' ", urn);
+        fp = popen(cmd, "r");
+        if (!fp) {
+                if (verbose)
+                        fprintf(stderr, "%s: trying to run stclient -f: %s\n",
+                                progname, strerror(errno));
+                return 0;
+        }
+
+        /* Leave room for the terminating NUL written below. */
+        len = fread(cmd, 1, sizeof(cmd) - 1, fp);
+        if (len) {
+                cmd[len] = 0;
+                if (strcmp(cmd, "Record not found\n") != 0) {
+                        /* exists, just return */
+                        pclose(fp);
+                        return 0;
+                }
+        }
+        pclose(fp);
+
+        snprintf(cmd, sizeof(cmd), "/opt/sun/servicetag/bin/stclient -a -p '%s' "
+                 "-e %d.%d.%d -t '%s' -S mount -F '%s' -P '%s' -m SUN "
+                 "-A %s -z global", product, LUSTRE_MAJOR, LUSTRE_MINOR,
+                 LUSTRE_PATCH, urn, PARENT_URN, PARENT_PRODUCT, arch);
+
+        return(run_command(cmd, sizeof(cmd)));
+}
+
+/*
+ * Create service tag inventory records for this mount if the stclient tool
+ * is installed.  A \a usource containing ':' is treated as a client mount;
+ * otherwise the mount data of \a source decides which server tags apply.
+ * Failures are only reported (when verbose), never propagated.
+ */
+void register_service_tags(char *usource, char *source, char *target)
+{
+        struct lustre_disk_data mo_ldd;
+        struct utsname utsname_buf;
+        struct stat stat_buf;
+        char stclient_loc[] = "/opt/sun/servicetag/bin/stclient";
+        int rc;
+
+        rc = stat(stclient_loc, &stat_buf);
+
+        if (rc == 0) {
+                /* call the service tags stclient to show that we use Lustre on
+                   this system */
+
+                rc = uname(&utsname_buf);
+                if (rc) {
+                        if (verbose)
+                                fprintf(stderr,
+                                        "%s: trying to get uname failed: %s, "
+                                        "inventory tags will not be created\n",
+                                        progname, strerror(errno));
+                } else {
+
+                        /* client or server? */
+                        if (strchr(usource, ':')) {
+                                stclient("Client", utsname_buf.machine);
+                        } else {
+                                /* first figure what type of device it is */
+                                rc = get_mountdata(source, &mo_ldd);
+                                if (rc) {
+                                        if (verbose)
+                                                fprintf(stderr,
+                                                        "%s: trying to read mountdata from %s "
+                                                        "failed (rc = %d), inventory tags will not "
+                                                        "be created\n",
+                                                        progname, source, rc);
+                                } else {
+
+                                        if (IS_MDT(&mo_ldd))
+                                                stclient("MDS", utsname_buf.machine);
+
+                                        if (IS_MGS(&mo_ldd))
+                                                stclient("MGS", utsname_buf.machine);
+
+                                        if (IS_OST(&mo_ldd))
+                                                stclient("OSS", utsname_buf.machine);
+                                }
+                        }
+                }
+        } else {
+                if (errno != ENOENT && verbose) {
+                        fprintf(stderr,
+                                "%s: trying to stat stclient failed: %s\n",
+                                progname, strerror(errno));
+                }
+        }
+}
diff --git a/lustre/utils/mount_utils.h b/lustre/utils/mount_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..3311b0fbec721731f876c1f50a5d1409703f0f00 --- /dev/null +++ b/lustre/utils/mount_utils.h @@ -0,0 +1,16 @@ +/* -*- mode: c; c-basic-offset: 8;
indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * This file is part of Lustre, http://www.lustre.org + */ +#ifndef _MOUNT_UTILS_H_ +#define _MOUNT_UTILS_H_ + +#include <lustre_disk.h> + +void fatal(void); +int run_command(char *, int); +int get_mountdata(char *, struct lustre_disk_data *); +void register_service_tags(char *, char *, char *); + +#endif diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index 4b03997b9b5a3bd559673b4e6d4592d684955b89..5311389782aae3f32efd3c76425898760e2dba3e 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -882,6 +882,51 @@ int jt_get_version(int argc, char **argv) return rc; }
+/*
+ * Print an obd device line with the target connection uuid appended when the
+ * obd is an osc (ost_conn_uuid) or mdc (mds_conn_uuid); else print it as is.
+ */
+static void print_obd_line(char *s)
+{
+        char buf[MAX_STRING_SIZE];
+        char obd_name[MAX_OBD_NAME];
+        FILE *fp = NULL;
+        char *ptr;
+
+        /* sscanf() yields EOF, not 0, on empty input: require one match */
+        if (sscanf(s, " %*d %*s osc %s %*s %*d ", obd_name) != 1)
+                goto try_mdc;
+        snprintf(buf, sizeof(buf),
+                 "/proc/fs/lustre/osc/%s/ost_conn_uuid", obd_name);
+        if ((fp = fopen(buf, "r")) == NULL)
+                goto try_mdc;
+        goto got_one;
+
+try_mdc:
+        if (sscanf(s, " %*d %*s mdc %s %*s %*d ", obd_name) != 1)
+                goto fail;
+        snprintf(buf, sizeof(buf),
+                 "/proc/fs/lustre/mdc/%s/mds_conn_uuid", obd_name);
+        if ((fp = fopen(buf, "r")) == NULL)
+                goto fail;
+
+got_one:
+        ptr = fgets(buf, sizeof(buf), fp);
+        fclose(fp);
+        if (ptr == NULL)        /* nothing read: buf still holds the path */
+                goto fail;
+
+        /* trim trailing newlines */
+        buf[strcspn(buf, "\n")] = '\0';
+        s[strcspn(s, "\n")] = '\0';
+
+        printf("%s %s\n", s, buf);
+        return;
+
+fail:
+        printf("%s", s);
+}
+
 /* get device list by ioctl */ int jt_obd_list_ioctl(int argc, char **argv) { @@ -889,7 +934,10 @@ int jt_obd_list_ioctl(int argc, char **argv) char buf[8192]; struct obd_ioctl_data *data = (struct obd_ioctl_data *)buf; - if (argc != 1) + if (argc > 2) + return CMD_HELP; + /* Just ignore a -t option. Only supported with /proc. 
*/ + else if (argc == 2 && strcmp(argv[1], "-t") != 0) return CMD_HELP; for (index = 0;; index++) { @@ -922,9 +970,16 @@ int jt_obd_list(int argc, char **argv) int rc; char buf[MAX_STRING_SIZE]; FILE *fp = NULL; + int print_obd = 0; - if (argc != 1) + if (argc > 2) return CMD_HELP; + else if (argc == 2) { + if (strcmp(argv[1], "-t") == 0) + print_obd = 1; + else + return CMD_HELP; + } fp = fopen(DEVICES_LIST, "r"); if (fp == NULL) { @@ -934,7 +989,10 @@ int jt_obd_list(int argc, char **argv) } while (fgets(buf, sizeof(buf), fp) != NULL) - printf("%s", buf); + if (print_obd) + print_obd_line(buf); + else + printf("%s", buf); fclose(fp); return 0; @@ -2224,7 +2282,7 @@ int jt_blockdev_info(int argc, char **argv) if (ino == 0ULL) fprintf(stdout, "Not attached\n"); else - fprintf(stdout, "attached to inode %llu\n", ino); + fprintf(stdout, "attached to inode "LPU64"\n", ino); out: close(fd); return -rc; diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 12b104f5a8a5907efe859b023bb3d2237b491dc0..f5770a31311605364e3d07aea27b26db1434d66c 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -205,6 +205,7 @@ static void check_obd_connect_data(void) CHECK_CDEFINE(OBD_CONNECT_LRU_RESIZE); CHECK_CDEFINE(OBD_CONNECT_MDS_MDS); CHECK_CDEFINE(OBD_CONNECT_REAL); + CHECK_CDEFINE(OBD_CONNECT_FID); CHECK_CDEFINE(OBD_CONNECT_CKSUM); } @@ -998,17 +999,6 @@ check_qunit_data_old2(void) CHECK_MEMBER(qunit_data_old2, qd_count); } -static void -check_qunit_data_old(void) -{ - BLANK_LINE(); - CHECK_STRUCT(qunit_data_old); - CHECK_MEMBER(qunit_data_old, qd_id); - CHECK_MEMBER(qunit_data_old, qd_type); - CHECK_MEMBER(qunit_data_old, qd_count); - CHECK_MEMBER(qunit_data_old, qd_isblk); -} - static void check_mgs_target_info(void) { @@ -1047,7 +1037,6 @@ check_lustre_disk_data(void) CHECK_MEMBER(lustre_disk_data, ldd_params); } -#ifdef LIBLUSTRE_POSIX_ACL static void check_posix_acl_xattr_entry(void) { @@ -1066,7 +1055,6 @@ check_posix_acl_xattr_header(void) 
CHECK_MEMBER_TYPEDEF(posix_acl_xattr_header, a_version); CHECK_MEMBER_TYPEDEF(posix_acl_xattr_header, a_entries); } -#endif static void check_quota_adjust_qunit(void) @@ -1333,6 +1321,9 @@ main(int argc, char **argv) check_mgs_target_info(); check_lustre_disk_data(); printf("#ifdef LIBLUSTRE_POSIX_ACL\n"); +#ifndef LIBLUSTRE_POSIX_ACL +#error build generator without LIBLUSTRE_POSIX_ACL defined - produce wrong check code. +#endif check_posix_acl_xattr_entry(); check_posix_acl_xattr_header(); printf("#endif\n"); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 82bf91dfe1d1e21a4552c905e315fbed5d54a0e3..2ae5d33ffb29089c881eff4298365c094cc06248 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -145,7 +145,9 @@ void lustre_assert_wire_constants(void) (long long)REINT_RENAME); LASSERTF(REINT_OPEN == 6, " found %lld\n", (long long)REINT_OPEN); - LASSERTF(REINT_MAX == 7, " found %lld\n", + LASSERTF(REINT_SETXATTR == 7, " found %lld\n", + (long long)REINT_SETXATTR); + LASSERTF(REINT_MAX == 8, " found %lld\n", (long long)REINT_MAX); LASSERTF(MGS_CONNECT == 250, " found %lld\n", (long long)MGS_CONNECT); @@ -497,6 +499,7 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_CONNECT_MDS_MDS == 0x04000000ULL); CLASSERT(OBD_CONNECT_REAL == 0x08000000ULL); CLASSERT(OBD_CONNECT_CKSUM == 0x20000000ULL); + CLASSERT(OBD_CONNECT_FID == 0x40000000ULL); /* Checks for struct obdo */ LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n", @@ -2076,26 +2079,6 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct qunit_data_old2 *)0)->qd_count) == 8, " found %lld\n", (long long)(int)sizeof(((struct qunit_data_old2 *)0)->qd_count)); - /* Checks for struct qunit_data_old */ - LASSERTF((int)sizeof(struct qunit_data_old) == 16, " found %lld\n", - (long long)(int)sizeof(struct qunit_data_old)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_id) == 0, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_id)); - 
LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_id) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_id)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_type) == 4, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_type)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_type) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_type)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_count) == 8, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_count)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_count) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_count)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_isblk) == 12, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_isblk)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_isblk) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_isblk)); - /* Checks for struct quota_adjust_qunit */ LASSERTF((int)sizeof(struct quota_adjust_qunit) == 32, " found %lld\n", (long long)(int)sizeof(struct quota_adjust_qunit)); diff --git a/lustrecvs b/lustrecvs index a76333ee53fa97c55e5ea5226461fae19b2068c1..1e86d0f8246979e18841432864328502f039b343 100755 --- a/lustrecvs +++ b/lustrecvs @@ -27,7 +27,14 @@ EOF if [ -z "$LUSTRECVS_UPDATED" ] ; then echo "$progname: updating lustrecvs" - cvs up -l || fatal 1 "Error updating lustrecvs" + + # If checking out a specific tag, make sure all of the files here are also + # checked out with the same tag to avoid later changes breaking things. + case "$1" in + v*|b_release_*) TAG="-r $1" ;; + esac + + cvs update -l $TAG export LUSTRECVS_UPDATED=yes exec "$0" "$@" fi @@ -81,6 +88,8 @@ case "$lustretag" in # this is the branch table # keep this list sorted alphabetically! 
+ *_gate) buildtag="b_build_gate" ;; + *) buildtag="HEAD" ;; @@ -136,6 +145,16 @@ hg_cmd () return fi + if ! which hg &> /dev/null; then + cat <<EOF + +Error: Mercurial is missing, try 'yum install mercurial', 'apt-get install +mercurial' or try http://rpmfind.net/linux/rpm2html/search.php?query=mercurial +EOF + error_modules="$dir $error_modules" + return + fi + url="$base_url/$repository" # create a cvs date format that will survive shell expansion diff --git a/snmp/lustre-snmp-trap.c b/snmp/lustre-snmp-trap.c index 7caf5cea1a753d75acd7fde654d0c211a7c7763c..6fa62fa51ba19e1f67b668983cbb7482976fe00d 100644 --- a/snmp/lustre-snmp-trap.c +++ b/snmp/lustre-snmp-trap.c @@ -33,7 +33,9 @@ */ #include <sys/types.h> +#if defined (__linux__) #include <sys/vfs.h> +#endif #include <dirent.h> #include <sys/stat.h> #include <unistd.h> diff --git a/snmp/lustre-snmp-util.c b/snmp/lustre-snmp-util.c index 9969272244c39ccfa7c4979a9eedce6fd1a87287..c4984269ab1781d7d4ede394f3b7f1f846d1d373 100644 --- a/snmp/lustre-snmp-util.c +++ b/snmp/lustre-snmp-util.c @@ -33,7 +33,9 @@ */ #include <sys/types.h> +#if defined (__linux__) #include <sys/vfs.h> +#endif #include <dirent.h> #include <sys/stat.h> #include <unistd.h> @@ -342,9 +344,9 @@ int read_counter64(const char *file_path, counter64 *c64,int factor) if ((ret_val = read_string(file_path, file_data,sizeof(file_data))) == SUCCESS) { tmp = atoll(file_data) * factor; - c64->low = (ulong) (0x0FFFFFFFF & tmp); + c64->low = (unsigned long) (0x0FFFFFFFF & tmp); tmp >>= 32; /* Shift right by 4 bytes */ - c64->high = (ulong) (0x0FFFFFFFF & tmp); + c64->high = (unsigned long) (0x0FFFFFFFF & tmp); } return ret_val; }