diff --git a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch b/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch index ef5a25365986d54412226279bdc25f2c8905e0cb..2133355ad47ea243278091b9b0e301d33d826c2f 100644 --- a/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch +++ b/ldiskfs/kernel_patches/patches/iopen-2.6-suse.patch @@ -1,4 +1,3 @@ - Documentation/filesystems/ext2.txt | 16 ++ fs/ext3/inode.c | 3 fs/ext3/iopen.c | 239 +++++++++++++++++++++++++++++++++++++ fs/ext3/iopen.h | 15 ++ @@ -7,10 +6,23 @@ include/linux/ext3_fs.h | 2 7 files changed, 304 insertions(+), 1 deletion(-) -Index: linux-2.6.4-51.1/fs/ext3/inode.c +Index: linux-stage/fs/ext3/Makefile =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/inode.c 2004-04-06 00:31:14.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/inode.c 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/fs/ext3/Makefile 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/fs/ext3/Makefile 2004-05-07 16:00:17.000000000 -0400 +@@ -4,7 +4,7 @@ + + obj-$(CONFIG_EXT3_FS) += ext3.o + +-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/fs/ext3/inode.c 2004-05-07 17:21:59.000000000 -0400 @@ -37,6 +37,7 @@ #include <linux/mpage.h> #include <linux/uio.h> @@ -19,22 +31,21 @@ Index: linux-2.6.4-51.1/fs/ext3/inode.c #include "acl.h" /* -@@ -2472,6 +2473,8 @@ +@@ -2472,6 +2473,9 @@ ei->i_acl = EXT3_ACL_NOT_CACHED; ei->i_default_acl = EXT3_ACL_NOT_CACHED; #endif + if (ext3_iopen_get_inode(inode)) + return; ++ if (ext3_get_inode_loc(inode, &iloc, 0)) goto bad_inode; bh = iloc.bh; 
-Index: linux-2.6.4-51.1/fs/ext3/iopen.c +Index: linux-stage/fs/ext3/iopen.c =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/iopen.c 2004-04-06 00:31:24.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/iopen.c 2004-04-06 00:31:24.000000000 -0400 -@@ -0,0 +1,223 @@ -+ -+ +--- linux-stage.orig/fs/ext3/iopen.c 2004-05-07 16:00:17.000000000 -0400 ++++ linux-stage/fs/ext3/iopen.c 2004-05-07 17:22:37.000000000 -0400 +@@ -0,0 +1,272 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -44,6 +55,25 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + * + * This file may be redistributed under the terms of the GNU General + * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. ++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. + */ + +#include <linux/sched.h> @@ -52,6 +82,8 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c +#include <linux/jbd.h> +#include <linux/ext3_fs.h> +#include <linux/smp_lock.h> ++#include <linux/dcache.h> ++#include <linux/security.h> +#include "iopen.h" + +#ifndef assert @@ -63,14 +95,15 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c +/* + * This implements looking up an inode by number. 
+ */ -+static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) +{ -+ struct inode * inode; ++ struct inode *inode; + unsigned long ino; + struct list_head *lp; + struct dentry *alternate; + char buf[IOPEN_NAME_LEN]; -+ ++ + if (dentry->d_name.len >= IOPEN_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + @@ -99,6 +132,9 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + return ERR_PTR(-ENOENT); + } + ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ + /* preferrably return a connected dentry */ + spin_lock(&dcache_lock); + list_for_each(lp, &inode->i_dentry) { @@ -116,9 +152,14 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + return alternate; + } + dentry->d_flags |= DCACHE_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ __d_rehash(dentry, 0); /* d_rehash */ + spin_unlock(&dcache_lock); + -+ d_add(dentry, inode); + return NULL; +} + @@ -126,7 +167,7 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + __typeof__ (x) __tmp = x; \ + x = y; y = __tmp; } while (0) + -+static inline void switch_names(struct dentry * dentry, struct dentry * target) ++static inline void switch_names(struct dentry *dentry, struct dentry *target) +{ + const unsigned char *old_name, *new_name; + @@ -141,20 +182,27 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + dentry->d_name.name = old_name; +} + -+ -+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. 
++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) +{ + struct dentry *tmp, *goal = NULL; + struct list_head *lp; + -+ /* preferrably return a connected dentry */ -+ spin_lock(&dcache_lock); + /* verify this dentry is really new */ -+ assert(!de->d_inode); -+ assert(list_empty(&de->d_subdirs)); -+ assert(list_empty(&de->d_alias)); ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); + ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; + ++ /* preferrably return a connected dentry */ + list_for_each(lp, &inode->i_dentry) { + tmp = list_entry(lp, struct dentry, d_alias); + if (tmp->d_flags & DCACHE_DISCONNECTED) { @@ -165,16 +213,30 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + break; + } + } -+ spin_unlock(&dcache_lock); + + if (!goal) -+ return NULL; ++ goto do_instantiate; + -+ goal->d_flags &= ~DCACHE_DISCONNECTED; -+ d_rehash(de); -+ d_move(goal, de); ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~ DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ __d_rehash(dentry, 0); ++ __d_move(goal, dentry); ++ spin_unlock(&dcache_lock); ++ iput(inode); + + return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; +} + +/* @@ -205,9 +267,9 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + * This function is spliced into ext3_lookup and returns 1 the file + * name is __iopen__ and dentry has been filled in appropriately. 
+ */ -+int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry) ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) +{ -+ struct inode * inode; ++ struct inode *inode; + + if (dir->i_ino != EXT3_ROOT_INO || + !test_opt(dir->i_sb, IOPEN) || @@ -227,7 +289,7 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + * number is the one for /__iopen__, in which case the inode is filled + * in appropriately. Otherwise, this fuction returns 0. + */ -+int ext3_iopen_get_inode(struct inode * inode) ++int ext3_iopen_get_inode(struct inode *inode) +{ + if (inode->i_ino != EXT3_BAD_INO) + return 0; @@ -256,10 +318,10 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + + return 1; +} -Index: linux-2.6.4-51.1/fs/ext3/iopen.h +Index: linux-stage/fs/ext3/iopen.h =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/iopen.h 2004-04-06 00:31:24.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/iopen.h 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/fs/ext3/iopen.h 2004-05-07 16:00:17.000000000 -0400 ++++ linux-stage/fs/ext3/iopen.h 2004-05-07 16:00:17.000000000 -0400 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -272,14 +334,14 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.h + * Public License. 
+ */ + -+extern int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry); -+extern int ext3_iopen_get_inode(struct inode * inode); -+ -+ -Index: linux-2.6.4-51.1/fs/ext3/namei.c ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-stage/fs/ext3/namei.c =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/namei.c 2004-04-06 00:31:11.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/namei.c 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/fs/ext3/namei.c 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/fs/ext3/namei.c 2004-05-07 16:00:17.000000000 -0400 @@ -37,6 +37,7 @@ #include <linux/buffer_head.h> #include <linux/smp_lock.h> @@ -288,47 +350,78 @@ Index: linux-2.6.4-51.1/fs/ext3/namei.c #include "acl.h" /* -@@ -970,15 +971,21 @@ - } - #endif - -+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode); -+ - static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) - { - struct inode * inode; - struct ext3_dir_entry_2 * de; - struct buffer_head * bh; -+ struct dentry *alternate = NULL; - +@@ -979,6 +980,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); -+ if (ext3_check_for_iopen(dir, dentry)) -+ return NULL; ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; + bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -989,8 +996,14 @@ +@@ -989,10 +993,8 @@ if (!inode) return ERR_PTR(-EACCES); } -+ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { -+ iput(inode); -+ return alternate; -+ } +- if (inode) +- return d_splice_alias(inode, dentry); +- d_add(dentry, inode); +- return NULL; + - if (inode) - return d_splice_alias(inode, dentry); ++ return iopen_connect_dentry(dentry, inode, 1); + } 
+ + +@@ -2019,10 +2021,6 @@ + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. */ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); +@@ -2139,6 +2137,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ (void)iopen_connect_dentry(dentry, inode, 0); ++ return 0; ++ } ++ } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} + - d_add(dentry, inode); - return NULL; + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2161,7 +2176,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle,inode); + ext3_journal_stop(handle); + return err; } -Index: linux-2.6.4-51.1/fs/ext3/super.c +Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/super.c 2004-04-06 00:31:14.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/super.c 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/fs/ext3/super.c 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/fs/ext3/super.c 2004-05-07 17:21:59.000000000 -0400 @@ -536,7 +536,7 @@ Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload, Opt_commit, Opt_journal_update, Opt_journal_inum, @@ -353,24 +446,24 @@ Index: linux-2.6.4-51.1/fs/ext3/super.c set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_iopen: -+ set_opt 
(sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); + break; + case Opt_noiopen: + clear_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); + break; + case Opt_iopen_nopriv: -+ set_opt (sbi->s_mount_opt, IOPEN); -+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); + break; case Opt_ignore: break; default: -Index: linux-2.6.4-51.1/include/linux/ext3_fs.h +Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.4-51.1.orig/include/linux/ext3_fs.h 2004-04-06 00:31:11.000000000 -0400 -+++ linux-2.6.4-51.1/include/linux/ext3_fs.h 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/include/linux/ext3_fs.h 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/include/linux/ext3_fs.h 2004-05-07 16:00:17.000000000 -0400 @@ -325,6 +325,8 @@ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ @@ -380,16 +473,3 @@ Index: linux-2.6.4-51.1/include/linux/ext3_fs.h /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H -Index: linux-2.6.4-51.1/fs/ext3/Makefile -=================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/Makefile 2004-04-06 00:27:21.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/Makefile 2004-04-06 00:31:42.000000000 -0400 -@@ -5,7 +5,7 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -- ioctl.o namei.o super.o symlink.o hash.o -+ ioctl.o namei.o super.o symlink.o hash.o iopen.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git 
a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index 958889a07e36c18d4dad5f9c19232858f165a3dc..c55dd373cd65113c28cf622f7e422f586e64a406 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -694,11 +694,15 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data); # endif #endif -/*#ifndef LP_POISON +#if BITS_PER_LONG > 32 # define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) # define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) # define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) -#endif*/ +#else +# define LI_POISON ((int)0x5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a) +# define LP_POISON ((void *)(long)0x5a5a5a5a) +#endif #if defined(__x86_64__) # define LPU64 "%Lu" @@ -706,33 +710,18 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data); # define LPX64 "%#Lx" # define LPSZ "%lu" # define LPSSZ "%ld" -#ifndef LP_POISON -# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) -# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) -# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) -#endif #elif (BITS_PER_LONG == 32 || __WORDSIZE == 32) # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" # define LPSZ "%u" # define LPSSZ "%d" -#ifndef LP_POISON -# define LI_POISON ((int)0x5a5a5a5a) -# define LL_POISON ((long)0x5a5a5a5a) -# define LP_POISON ((void *)(long)0x5a5a5a5a) -#endif #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64) # define LPU64 "%lu" # define LPD64 "%ld" # define LPX64 "%#lx" # define LPSZ "%lu" # define LPSSZ "%ld" -#ifndef LP_POISON -# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) -# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) -# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) -#endif #endif #ifndef LPU64 # error "No word size defined" diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c index 5359ef7590da6af80b4f62fdd27afaa355efd2ec..f4005de8443f0980585e50ffeebfe16b2e4ac612 100644 --- a/lnet/klnds/qswlnd/qswlnd.c +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -108,7 +108,7 @@ kqswnal_yield(nal_t 
*nal, unsigned long *flags, int milliseconds) CDEBUG (D_NET, "yield\n"); if (milliseconds == 0) { - if (current->need_resched) + if (need_resched()) schedule(); return 0; } @@ -817,8 +817,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, /**********************************************************************/ /* Spawn scheduling threads */ - for (i = 0; i < smp_num_cpus; i++) - { + for (i = 0; i < num_online_cpus(); i++) { rc = kqswnal_thread_start (kqswnal_scheduler, NULL); if (rc != 0) { diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index 1cd42db9396b323de5dd83452925d42f3644d63b..6978aa062c407dc050e20c928c76e21fae1bb2b0 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -53,7 +53,11 @@ #include <linux/string.h> #include <linux/stat.h> #include <linux/errno.h> -#include <linux/locks.h> +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include <linux/locks.h> /* wait_on_buffer */ +#else +#include <linux/buffer_head.h> /* wait_on_buffer */ +#endif #include <linux/unistd.h> #include <net/sock.h> #include <linux/uio.h> diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index f92f97474d624d758f1ae45d5a3d440ba074b3a3..2bcb853a4629fe087dc51603f7f1e310ed267268 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -1824,7 +1824,7 @@ kqswnal_scheduler (void *arg) !list_empty(&kqswnal_data.kqn_delayedtxds) || !list_empty(&kqswnal_data.kqn_delayedfwds)); LASSERT (rc == 0); - } else if (current->need_resched) + } else if (need_resched()) schedule (); spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c index a53ea6b41e8bbec1b84ee8090f9e0cb7a91975e7..4e63c8644ed5743b850fe1625e761a96356acab0 100644 --- a/lnet/libcfs/module.c +++ b/lnet/libcfs/module.c @@ -401,14 +401,22 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, err = lwt_control (data->ioc_flags, data->ioc_misc); break; - case 
IOC_PORTAL_LWT_SNAPSHOT: - err = lwt_snapshot (&data->ioc_nid, - &data->ioc_count, &data->ioc_misc, + case IOC_PORTAL_LWT_SNAPSHOT: { + cycles_t now; + int ncpu; + int total_size; + + err = lwt_snapshot (&now, &ncpu, &total_size, data->ioc_pbuf1, data->ioc_plen1); + data->ioc_nid = now; + data->ioc_count = ncpu; + data->ioc_misc = total_size; + if (err == 0 && copy_to_user((char *)arg, data, sizeof (*data))) err = -EFAULT; break; + } case IOC_PORTAL_LWT_LOOKUP_STRING: err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, @@ -421,7 +429,13 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, case IOC_PORTAL_NAL_CMD: { struct portals_cfg pcfg; - LASSERT (data->ioc_plen1 == sizeof(pcfg)); + if (data->ioc_plen1 != sizeof(pcfg)) { + CERROR("Bad ioc_plen1 %d (wanted %d)\n", + data->ioc_plen1, sizeof(pcfg)); + err = -EINVAL; + break; + } + if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, sizeof(pcfg))) { err = -EFAULT; diff --git a/lnet/ulnds/connection.c b/lnet/ulnds/connection.c index ca6999a30093272578ee95cb69ee8948b50938c5..3448460cfaa503cbbd179359f0d1153a35534b52 100644 --- a/lnet/ulnds/connection.c +++ b/lnet/ulnds/connection.c @@ -229,7 +229,7 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); hdr.msg.hello.type = __cpu_to_le32 (type); - hdr.msg.hello.incarnation = 0; + hdr.msg.hello.incarnation = __cpu_to_le64(incarnation); /* Assume sufficient socket buffering for this message */ rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr)); @@ -315,6 +315,8 @@ connection force_tcp_connection(manager m, connection conn; struct sockaddr_in addr; unsigned int id[2]; + struct timeval tv; + __u64 incarnation; port = tcpnal_acceptor_port; @@ -353,8 +355,11 @@ connection force_tcp_connection(manager m, setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); #endif + gettimeofday(&tv, NULL); + incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + /* say hello */ - if 
(tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, 0)) + if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) exit(-1); conn = allocate_connection(m, ip, port, fd); diff --git a/lnet/ulnds/socklnd/connection.c b/lnet/ulnds/socklnd/connection.c index ca6999a30093272578ee95cb69ee8948b50938c5..3448460cfaa503cbbd179359f0d1153a35534b52 100644 --- a/lnet/ulnds/socklnd/connection.c +++ b/lnet/ulnds/socklnd/connection.c @@ -229,7 +229,7 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); hdr.msg.hello.type = __cpu_to_le32 (type); - hdr.msg.hello.incarnation = 0; + hdr.msg.hello.incarnation = __cpu_to_le64(incarnation); /* Assume sufficient socket buffering for this message */ rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr)); @@ -315,6 +315,8 @@ connection force_tcp_connection(manager m, connection conn; struct sockaddr_in addr; unsigned int id[2]; + struct timeval tv; + __u64 incarnation; port = tcpnal_acceptor_port; @@ -353,8 +355,11 @@ connection force_tcp_connection(manager m, setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); #endif + gettimeofday(&tv, NULL); + incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + /* say hello */ - if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, 0)) + if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) exit(-1); conn = allocate_connection(m, ip, port, fd); diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index f3e82c6bc2cd04b893c607b51214071e23fbe3a0..f8107d8068603fa41ade8ec3e9c4404bb20a81cb 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -1565,14 +1565,11 @@ lwt_put_string(char *ustr) static int lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e) { - char whenstr[32]; char *where = lwt_get_string(e->lwte_where); if (where == NULL) return (-1); - sprintf(whenstr, LPU64, (__u64)(e->lwte_when - t0)); - fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n", 
e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4, (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0), @@ -1624,6 +1621,7 @@ jt_ptl_lwt(int argc, char **argv) cycles_t tnow; struct timeval tvnow; int printed_date = 0; + int nlines = 0; FILE *f = stdout; if (argc < 2 || @@ -1773,6 +1771,12 @@ jt_ptl_lwt(int argc, char **argv) rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]); if (rc != 0) break; + + if (++nlines % 10000 == 0 && f != stdout) { + /* show some activity... */ + printf("."); + fflush (stdout); + } } tlast = next_event[cpu]->lwte_when; @@ -1786,8 +1790,10 @@ jt_ptl_lwt(int argc, char **argv) next_event[cpu] = NULL; } - if (f != stdout) + if (f != stdout) { + printf("\n"); fclose(f); + } free(events); return (0); diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 5c86853b49cecfcfaeef824bc433794b102b9e29..1b957a311a292daaa3d631d3e8f1f90b031b7d7c 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -2,10 +2,13 @@ tbd Cluster File Systems, Inc. <info@clusterfs.com> * version 1.2.x * bug fixes - clear page cache after eviction (2766) - - deal with strange write() on x86-64 (3043) - don't dereference NULL peer_ni in ldlm_handle_ast_error (3258) - - clear page->private before handing to FS, better assertion (3119) - - tune the read pipeline (3236) + - don't allow unlinking open directory if it isn't empty (2904) + - handle partial page writes in filter; fix 512b direct IO (3138) + - handle page cache pages in cleanup path for 2.6 (3335) + - leave liblustre's partial write handling to filter (3274) + - chose better nal ids in liblustre (3292) + - initialize liblustre with uid/group membership (2862) * miscellania - drop scimac NAL (unmaintained) @@ -35,9 +38,20 @@ tbd Cluster File Systems, Inc. 
<info@clusterfs.com> - additional checks for oscc recovery before doing precreate (3284) - fix ll_extent_lock() error return code for 64-bit systems (3043) - don't crash in mdc_close for bad permissions on open (3285) + - zero i_rdev for non-device files (3147) + - clear page->private before handing to FS, better assertion (3119) + - fix incorrect decref of invalidated dentry (2350) + - don't hold journal transaction open across create RPC (3313) + - update atime on MDS at close time (3265) + - close LDAP connection when recovering to avoid server load (3315) + - update iopen-2.6 patch with fixes from 2399,2517,2904 (3301) + - don't leak open file on MDS after open resend (3325) + - serialize filter_precreate and filter_destroy_precreated (3329) * miscellania - allow default OST striping configuration per directory (1414) + - fix compilation for qswnal for 2.6 kernels (3125) - increase maximum number of MDS request buffers for large systems + - change liblustreapi to be useful for external progs like lfsck (3098) 2004-03-22 Cluster File Systems, Inc. 
<info@clusterfs.com> * version 1.2.1 diff --git a/lustre/Rules.in b/lustre/Rules.in index 1a3ae5236fb1e2a424a5d58a86a268986591b3b0..293ff3cd65e3958988e769c26e014db90b4f3542 100644 --- a/lustre/Rules.in +++ b/lustre/Rules.in @@ -23,17 +23,6 @@ ifeq ($(PATCHLEVEL),) include autoMakefile -tags: - rm -f $(top_srcdir)/TAGS - ETAGSF=`etags --version | grep -iq exuberant && \ - echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \ - find $(top_srcdir) -name '*.[hc]' | xargs etags $$ETAGSF -a - - rm -f $(top_srcdir)/tags - CTAGSF=`ctags --version | grep -iq exuberant && \ - echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \ - find $(top_srcdir) -name '*.[hc]' | xargs ctags $$CTAGSF -a - else include @LINUX_CONFIG@ diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index 9b829bfc1995987f70fca0d53b0ed3e17711c623..385ddcb950be2c5805a16b463678cca17edfd0bb 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -12,6 +12,27 @@ SUBDIRS = . include portals ldiskfs lvfs obdclass lov ldlm ptlrpc \ EXTRA_DIST = BUGS FDL Rules.in kernel_patches kernel-tests/Makefile \ README.kernel-source +# these empty rules are needed so that automake doesn't add its own +# recursive rules +etags-recursive: + +ctags-recursive: + +tags-recursive: + +TAGS: + +tags: + rm -f $(top_srcdir)/TAGS + ETAGSF=`etags --version | grep -iq exuberant && \ + echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \ + find $(top_srcdir) -name '*.[hc]' | xargs etags $$ETAGSF -a + + rm -f $(top_srcdir)/tags + CTAGSF=`ctags --version | grep -iq exuberant && \ + echo "-I __initdata,__exitdata,EXPORT_SYMBOL"`; \ + find $(top_srcdir) -name '*.[hc]' | xargs ctags $$CTAGSF -a + if MODULES all-am: modules diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index af80f443e794619acc9e62d8dffc9271ce082ba7..da6cc8a41565eb24bc65f2261728c3e2aa1dd436 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -116,9 +116,6 @@ static inline void *kmalloc(int size, int prot) #define 
PTR_ERR(a) ((long)(a)) #define ERR_PTR(a) ((void*)((long)(a))) -#define capable(foo) 1 -#define CAP_SYS_ADMIN 1 - typedef struct { void *cwd; }mm_segment_t; @@ -142,7 +139,7 @@ typedef int (write_proc_t)(struct file *file, const char *buffer, ((unsigned char *)&addr)[1], \ ((unsigned char *)&addr)[2], \ ((unsigned char *)&addr)[3] - + #if defined(__LITTLE_ENDIAN) #define HIPQUAD(addr) \ ((unsigned char *)&addr)[3], \ @@ -305,14 +302,7 @@ static inline void spin_unlock_irqrestore(spinlock_t *a, unsigned long b) {} /* random */ -static inline void get_random_bytes(void *ptr, int size) -{ - int *p = (int *)ptr; - int i, count = size/sizeof(int); - - for (i = 0; i< count; i++) - *p++ = rand(); -} +void get_random_bytes(void *ptr, int size); /* memory */ @@ -366,11 +356,6 @@ static inline int kmem_cache_destroy(kmem_cache_t *a) #define PAGE_CACHE_SHIFT 12 #define PAGE_CACHE_MASK PAGE_MASK -/* XXX - * for this moment, liblusre will not rely OST for non-page-aligned write - */ -#define LIBLUSTRE_HANDLE_UNALIGNED_PAGE - struct page { void *addr; unsigned long index; @@ -380,9 +365,6 @@ struct page { /* internally used by liblustre file i/o */ int _offset; int _count; -#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE - int _managed; -#endif }; /* 2.4 defines */ @@ -578,12 +560,23 @@ struct task_struct { int pid; int fsuid; int fsgid; + int max_groups; + int ngroups; + gid_t *groups; __u32 cap_effective; + + struct fs_struct __fs; }; extern struct task_struct *current; - -#define in_group_p(a) 0 /* FIXME */ +int in_group_p(gid_t gid); +static inline int capable(int cap) +{ + if (current->cap_effective & (1 << cap)) + return 1; + else + return 0; +} #define set_current_state(foo) do { current->state = foo; } while (0) @@ -695,6 +688,33 @@ typedef struct { volatile int counter; } atomic_t; #define unlikely(exp) (exp) #endif +/* FIXME sys/capability will finally included linux/fs.h thus + * cause numerous trouble on x86-64. 
as temporary solution for + * build broken at cary, we copy definition we need from capability.h + * FIXME + */ +struct _cap_struct; +typedef struct _cap_struct *cap_t; +typedef int cap_value_t; +typedef enum { + CAP_EFFECTIVE=0, + CAP_PERMITTED=1, + CAP_INHERITABLE=2 +} cap_flag_t; +typedef enum { + CAP_CLEAR=0, + CAP_SET=1 +} cap_flag_value_t; + +#define CAP_FOWNER 3 +#define CAP_FSETID 4 +#define CAP_SYS_ADMIN 21 + +cap_t cap_get_proc(void); +int cap_get_flag(cap_t, cap_value_t, cap_flag_t, cap_flag_value_t *); + + + /* log related */ static inline int llog_init_commit_master(void) { return 0; } static inline int llog_cleanup_commit_master(int force) { return 0; } diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index ae8b5442b87e5f5466d0c9431b5393436f091996..d83e1491d8279f5407f0ebc8442abdcfc7db91fe 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -79,9 +79,10 @@ struct fsfilt_operations { void *cb_data); int (* fs_statfs)(struct super_block *sb, struct obd_statfs *osfs); int (* fs_sync)(struct super_block *sb); - int (* fs_map_inode_page)(struct inode *inode, struct page *page, - unsigned long *blocks, int *created, - int create); + int (* fs_map_inode_pages)(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int *created, int create, + struct semaphore *sem); int (* fs_prep_san_write)(struct inode *inode, long *blocks, int nblocks, loff_t newsize); int (* fs_write_record)(struct file *, void *, int size, loff_t *, @@ -90,9 +91,9 @@ struct fsfilt_operations { int (* fs_setup)(struct super_block *sb); int (* fs_set_xattr)(struct inode *inode, void *handle, char *name, - void *buffer, int buffer_size); + void *buffer, int buffer_size); int (* fs_get_xattr)(struct inode *inode, char *name, - void *buffer, int buffer_size); + void *buffer, int buffer_size); int (* fs_get_op_len)(int, struct fsfilt_objinfo *, int); }; @@ -209,7 +210,7 @@ 
fsfilt_commit(struct obd_device *obd, struct inode *inode, return fsfilt_commit_ops(obd->obd_fsops, inode, handle, force_sync); } -static inline int +static inline int llog_fsfilt_commit(struct llog_ctxt *ctxt, struct inode *inode, void *handle, int force_sync) { @@ -301,7 +302,7 @@ static inline int fsfilt_setup(struct obd_device *obd, { if (obd->obd_fsops->fs_setup) return obd->obd_fsops->fs_setup(fs); - + return 0; } @@ -345,12 +346,12 @@ fsfilt_putpage(struct obd_device *obd, struct inode *inode, LASSERT(page != NULL); filter = &obd->u.filter; - + if (!obd->obd_fsops->fs_putpage) return -ENOSYS; CDEBUG(D_INFO, "putpage %lx\n", page->index); - + rc = obd->obd_fsops->fs_putpage(inode, page); if (time_after(jiffies, now + 15 * HZ)) @@ -373,9 +374,9 @@ fsfilt_getpage(struct obd_device *obd, struct inode *inode, return ERR_PTR(-ENOSYS); CDEBUG(D_INFO, "getpage %lx\n", index); - + page = obd->obd_fsops->fs_getpage(inode, index); - + if (time_after(jiffies, now + 15 * HZ)) CERROR("long getpage time %lus\n", (jiffies - now) / HZ); @@ -423,13 +424,14 @@ fsfilt_sync(struct obd_device *obd, struct super_block *sb) return obd->obd_fsops->fs_sync(sb); } -static inline int -fsfilt_map_inode_page(struct obd_device *obd, struct inode *inode, - struct page *page, unsigned long *blocks, - int *created, int create) +static inline int fsfilt_map_inode_pages(struct obd_device *obd, + struct inode *inode, + struct page **page, int pages, + unsigned long *blocks, int *created, + int create, struct semaphore *sem) { - return obd->obd_fsops->fs_map_inode_page(inode, page, blocks, - created, create); + return obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks, + created, create, sem); } static inline int @@ -451,7 +453,7 @@ static inline int fsfilt_write_record(struct obd_device *obd, struct file *file, void *buf, loff_t size, loff_t *offs, int force_sync) { - return obd->obd_fsops->fs_write_record(file, buf, size, offs, + return obd->obd_fsops->fs_write_record(file, buf, 
size, offs, force_sync); } diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 7479634ddd711e9f7dd4c0ab69f74567f7dd18c1..a56b1436c89a8deb4f178133820c8094bda23a5c 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -80,21 +80,6 @@ /* * GENERAL STUFF */ -struct obd_uuid { - __u8 uuid[40]; -}; - -static inline int obd_uuid_equals(struct obd_uuid *u1, struct obd_uuid *u2) -{ - return strcmp(u1->uuid, u2->uuid) == 0; -} - -static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp) -{ - strncpy(uuid->uuid, tmp, sizeof(*uuid)); - uuid->uuid[sizeof(*uuid) - 1] = '\0'; -} - /* FOO_REQUEST_PORTAL is for incoming requests on the FOO * FOO_REPLY_PORTAL is for incoming replies on the FOO * FOO_BULK_PORTAL is for incoming bulk on the FOO @@ -525,11 +510,6 @@ struct ll_fid { __u32 f_type; }; -struct ll_recreate_obj { - __u64 lrc_id; - __u32 lrc_ost_idx; -}; - extern void lustre_swab_ll_fid (struct ll_fid *fid); #define MDS_STATUS_CONN 1 @@ -973,33 +953,6 @@ struct llog_lru_rec { struct llog_rec_tail llr_tail; } __attribute__((packed)); -/* got from mds_update_record. FIXME: maybe some attribute in reint_record and - update_record will be changed later. */ -/* XXX BUG 3188 -- must return to one set of structures. 
*/ - -struct update_record { - __u32 ur_opcode; - __u32 ur_fsuid; - __u32 ur_fsgid; - dev_t ur_rdev; - struct iattr ur_iattr; - struct iattr ur_pattr; - __u32 ur_flags; - __u32 ur_len; -}; -struct reint_record { - struct update_record u_rec; - char *rec_data1; - int rec1_size; - char *rec_data2; - int rec2_size; -}; -struct llog_smfs_rec { - struct llog_rec_hdr lsr_hdr; - struct update_record lsr_rec; - struct llog_rec_tail lsr_tail; -}; - /* On-disk header structure of each log object, stored in little endian order */ #define LLOG_CHUNK_SIZE 8192 #define LLOG_HEADER_SIZE (96) diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index b4a59d34c22c1969c3a29bbf7ffd51ee670a31eb..a529860882de589eca70861c38c3820afa6593ed 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -40,14 +40,14 @@ #include <linux/lustre_idl.h> #include <linux/lustre_cfg.h> -#if BITS_PER_LONG > 32 && !defined(__x86_64__) #ifndef LP_POISON +#if BITS_PER_LONG > 32 # define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) # define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) -#endif #else -#ifndef LP_POISON # define LI_POISON ((int)0x5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a) # define LP_POISON ((void *)(long)0x5a5a5a5a) #endif #endif diff --git a/lustre/include/linux/lustre_log.h b/lustre/include/linux/lustre_log.h index 36ec2d615ad196f899c5d8bf7eca8f0fc6a4b22b..fe94cff55cf9952097349f9f9ec8e61156221b10 100644 --- a/lustre/include/linux/lustre_log.h +++ b/lustre/include/linux/lustre_log.h @@ -68,6 +68,37 @@ struct llog_handle { } u; }; +/* got from mds_update_record. + * FIXME: maybe some attribute in reint_record and update_record will be + * changed later. */ +/* XXX BUG 3188 -- must return to one set of structures. 
*/ +/* XXX use fixed-sized fields (__u32) instead of dev_t and iattr->gid_t, etc */ + +struct update_record { + __u32 ur_opcode; + __u32 ur_fsuid; + __u32 ur_fsgid; + dev_t ur_rdev; + struct iattr ur_iattr; + struct iattr ur_pattr; + __u32 ur_flags; + __u32 ur_len; +}; + +struct reint_record { + struct update_record u_rec; + char *rec_data1; + int rec1_size; + char *rec_data2; + int rec2_size; +}; + +struct llog_smfs_rec { + struct llog_rec_hdr lsr_hdr; + struct update_record lsr_rec; + struct llog_rec_tail lsr_tail; +}; + /* llog.c - general API */ typedef int (*llog_cb_t)(struct llog_handle *, struct llog_rec_hdr *, void *); struct llog_handle *llog_alloc_handle(void); diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 8abb4e41bb798d88ead64df78b7385976a540e9b..8370ad583c78e5036925380f2f9fbc87b5b226be 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -103,7 +103,7 @@ */ #define LDLM_NUM_THREADS min(smp_num_cpus * smp_num_cpus * 8, 64) -#define LDLM_NBUF_MAX 256UL +#define LDLM_NBUF_MAX 512UL #define LDLM_BUFSIZE (8 * 1024) #define LDLM_MAXREQSIZE (5 * 1024) #define LDLM_MAXMEM (num_physpages*(PAGE_SIZE/1024)) @@ -359,8 +359,6 @@ struct ptlrpc_request { /* Spare the preprocessor, spoil the bugs. */ #define FLAG(field, str) (field ? str : "") -#define PTLRPC_REQUEST_COMPLETE(req) ((req)->rq_phase > RQ_PHASE_RPC) - #define DEBUG_REQ_FLAGS(req) \ ((req->rq_phase == RQ_PHASE_NEW) ? "New" : \ (req->rq_phase == RQ_PHASE_RPC) ? 
"Rpc" : \ diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 8635862ea52d56228bd0331c3dc0ffc65511afdc..1edd5740c8965842da5ca670ec8b4e20aa156d99 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -103,7 +103,8 @@ struct obd_type { }; struct brw_page { - obd_off off; + obd_off disk_offset; /* modulo PAGE_SIZE */ + obd_off page_offset; /* modulo PAGE_SIZE (obviously) */ struct page *pg; int count; obd_flag flag; @@ -176,6 +177,9 @@ struct filter_obd { unsigned long *fo_last_rcvd_slots; __u64 fo_mount_count; + unsigned int fo_destroy_in_progress:1; + struct semaphore fo_create_lock; + struct file_operations *fo_fop; struct inode_operations *fo_iop; struct address_space_operations *fo_aops; diff --git a/lustre/include/lustre/liblustreapi.h b/lustre/include/lustre/liblustreapi.h index 5aa2de2ce66c58d053a5d39c69d0a0d3b62a36e7..350bd09f436259bd711564695beb6e439237ad17 100644 --- a/lustre/include/lustre/liblustreapi.h +++ b/lustre/include/lustre/liblustreapi.h @@ -29,13 +29,14 @@ #include <lustre/lustre_user.h> /* liblustreapi.c */ -extern int op_create_file(char *name, long stripe_size, int stripe_offset, - int stripe_count); -extern int op_find(char *path, struct obd_uuid *obduuid, int recursive, - int verbose, int quiet); -extern int op_check(int type_num, char **obd_type_p, char *dir); -extern int op_catinfo(char *dir, char *keyword, char *node_name); -extern int get_file_stripe(char *path, struct lov_user_md *lum); +extern int llapi_file_create(char *name, long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern); +extern int llapi_file_get_stripe(char *path, struct lov_user_md *lum); +extern int llapi_find(char *path, struct obd_uuid *obduuid, int recursive, + int verbose, int quiet); +extern int llapi_target_check(int num_types, char **obd_types, char *dir); +extern int llapi_catinfo(char *dir, char *keyword, char *node_name); +extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int 
*ost_count); extern int llapi_is_lustre_mnttype(char *type); #endif diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index e98b204b6b3ea1b369fe71e0c17b24a1291ca2d8..f0a839a0ac33acac32ce333a25ccb626bbf61a3e 100644 --- a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -24,6 +24,11 @@ #ifndef _LUSTRE_USER_H #define _LUSTRE_USER_H #include <asm/types.h> +#ifdef __KERNEL__ +#include <linux/string.h> +#else +#include <string.h> +#endif #define LL_IOC_GETFLAGS _IOR ('f', 151, long) #define LL_IOC_SETFLAGS _IOW ('f', 152, long) @@ -54,9 +59,8 @@ struct lov_user_ost_data_v1 { /* per-stripe data structure */ __u64 l_object_id; /* OST object ID */ __u64 l_object_gr; /* OST object group (creating MDS number) */ - __u32 l_ost_generation; /* generation of this OST index */ - __u16 l_ost_idx; /* OST index in LOV */ - __u16 l_reserved2; + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ } __attribute__((packed)); #define lov_user_md lov_user_md_v1 @@ -71,4 +75,24 @@ struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ } __attribute__((packed)); +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct obd_uuid { + __u8 uuid[40]; +}; + +static inline int obd_uuid_equals(struct obd_uuid *u1, struct obd_uuid *u2) +{ + return strcmp(u1->uuid, u2->uuid) == 0; +} + +static inline void obd_str2uuid(struct obd_uuid *uuid, char *tmp) +{ + strncpy(uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + #endif /* _LUSTRE_USER_H */ diff --git a/lustre/kernel_patches/patches/directio-2.4.24.patch b/lustre/kernel_patches/patches/directio-2.4.24.patch new file mode 100644 index 0000000000000000000000000000000000000000..ba63e7872919e777d15a2c4ce8a00c35d92214fc --- /dev/null +++ b/lustre/kernel_patches/patches/directio-2.4.24.patch @@ -0,0 +1,15 @@ +Index: 
lum/mm/filemap.c +=================================================================== +--- lum.orig/mm/filemap.c 2004-04-25 14:58:10.000000000 -0400 ++++ lum/mm/filemap.c 2004-04-25 16:23:32.000000000 -0400 +@@ -1614,8 +1614,8 @@ + new_iobuf = 1; + } + +- blocksize = 1 << inode->i_blkbits; +- blocksize_bits = inode->i_blkbits; ++ blocksize = 512 /*1 << inode->i_blkbits*/; ++ blocksize_bits = 9 /*inode->i_blkbits*/; + blocksize_mask = blocksize - 1; + chunk_size = KIO_MAX_ATOMIC_IO << 10; + diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos-pdirops.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos-pdirops.patch index c931e8d593a52cc0fb575c438a0603836d6fa5f0..592af936a80f7af65508624237b9bb92af124427 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos-pdirops.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos-pdirops.patch @@ -1762,7 +1762,7 @@ @@ -12,7 +12,8 @@ O_TARGET := ext3.o export-objs := ext3-exports.o - obj-y := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o + ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o \ + extents.o diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch index a88b30df82308dd012511ccd0ddd230c79a4fc4c..a0b423075785666368c47be85a1d043bddde555b 100644 --- a/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.18-chaos.patch @@ -1760,7 +1760,7 @@ Index: linux-2.4.18-chaos/fs/ext3/Makefile @@ -12,7 +12,8 @@ export-objs := ext3-exports.o - obj-y := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ - ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o + ioctl.o 
namei.o super.o symlink.o xattr.o ext3-exports.o \ + extents.o diff --git a/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch b/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch new file mode 100644 index 0000000000000000000000000000000000000000..1122ba4b52274953141fcbb0bb7e53263206997b --- /dev/null +++ b/lustre/kernel_patches/patches/ext3-extents-2.4.24.patch @@ -0,0 +1,2810 @@ +Index: linux-2.4.24-mb34/fs/ext3/extents.c +=================================================================== +--- linux-2.4.24-mb34.orig/fs/ext3/extents.c 1969-12-31 16:00:00.000000000 -0800 ++++ linux-2.4.24-mb34/fs/ext3/extents.c 2004-05-05 14:27:07.000000000 -0700 +@@ -0,0 +1,2346 @@ ++/* ++ * Copyright (C) 2003 Alex Tomas <alex@clusterfs.com> ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++/* ++ * Extents support for EXT3 ++ * ++ * TODO: ++ * - ext3_ext_walk_space() sould not use ext3_ext_find_extent() ++ * - ext3_ext_calc_credits() could take 'mergable' into account ++ * - ext3*_error() should be used in some situations ++ * - find_goal() [to be tested and improved] ++ * - smart tree reduction ++ * - arch-independence ++ * common on-disk format for big/little-endian arch ++ */ ++ ++#include <linux/module.h> ++#include <linux/fs.h> ++#include <linux/time.h> ++#include <linux/ext3_jbd.h> ++#include <linux/jbd.h> ++#include <linux/smp_lock.h> ++#include <linux/highuid.h> ++#include <linux/pagemap.h> ++#include <linux/quotaops.h> ++#include <linux/string.h> ++#include <linux/slab.h> ++#include <linux/locks.h> ++#include <linux/ext3_extents.h> ++#include <asm/uaccess.h> ++ ++static handle_t *ext3_ext_journal_restart(handle_t *handle, int needed) ++{ ++ int err; ++ ++ if (handle->h_buffer_credits > needed) ++ return handle; ++ if (!ext3_journal_extend(handle, needed)) ++ return handle; ++ err = ext3_journal_restart(handle, needed); ++ ++ return handle; ++} ++ ++static int inline ++ext3_ext_get_access_for_root(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->get_write_access) ++ return tree->get_write_access(h,tree->buffer); ++ else ++ return 0; ++} ++ ++static int inline ++ext3_ext_mark_root_dirty(handle_t *h, struct ext3_extents_tree *tree) ++{ ++ if (tree->mark_buffer_dirty) ++ return tree->mark_buffer_dirty(h,tree->buffer); ++ else ++ return 0; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ */ ++static int ext3_ext_get_access(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ ++ if (path->p_bh) { ++ /* path points to block */ ++ err = 
ext3_journal_get_write_access(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_get_access_for_root(handle, tree); ++ } ++ return err; ++} ++ ++/* ++ * could return: ++ * - EROFS ++ * - ENOMEM ++ * - EIO ++ */ ++static int ext3_ext_dirty(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int err; ++ if (path->p_bh) { ++ /* path points to block */ ++ err =ext3_journal_dirty_metadata(handle, path->p_bh); ++ } else { ++ /* path points to leaf/index in inode body */ ++ err = ext3_ext_mark_root_dirty(handle, tree); ++ } ++ return err; ++} ++ ++static int inline ++ext3_ext_new_block(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, struct ext3_extent *ex, ++ int *err) ++{ ++ int goal, depth, newblock; ++ struct inode *inode; ++ ++ EXT_ASSERT(tree); ++ if (tree->new_block) ++ return tree->new_block(handle, tree, path, ex, err); ++ ++ inode = tree->inode; ++ depth = EXT_DEPTH(tree); ++ if (path && depth > 0) { ++ goal = path[depth-1].p_block; ++ } else { ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ ++ bg_start = (ei->i_block_group * ++ EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ goal = bg_start + colour; ++ } ++ ++ newblock = ext3_new_block(handle, inode, goal, 0, 0, err); ++ return newblock; ++} ++ ++static inline void ext3_ext_tree_changed(struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *neh; ++ neh = EXT_ROOT_HDR(tree); ++ neh->e_generation++; ++} ++ ++static inline int ext3_ext_space_block(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 6; ++#endif ++ return size; ++} ++ ++static inline int 
ext3_ext_space_block_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->inode->i_sb->s_blocksize - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 5; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent); ++#ifdef AGRESSIVE_TEST ++ size = 3; ++#endif ++ return size; ++} ++ ++static inline int ext3_ext_space_root_idx(struct ext3_extents_tree *tree) ++{ ++ int size; ++ ++ size = (tree->buffer_len - ++ sizeof(struct ext3_extent_header)) ++ / sizeof(struct ext3_extent_idx); ++#ifdef AGRESSIVE_TEST ++ size = 4; ++#endif ++ return size; ++} ++ ++static void ext3_ext_show_path(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int k, l = path->p_depth; ++ ++ ext_debug(tree, "path:"); ++ for (k = 0; k <= l; k++, path++) { ++ if (path->p_idx) { ++ ext_debug(tree, " %d->%d", path->p_idx->e_block, ++ path->p_idx->e_leaf); ++ } else if (path->p_ext) { ++ ext_debug(tree, " %d:%d:%d", ++ path->p_ext->e_block, ++ path->p_ext->e_num, ++ path->p_ext->e_start); ++ } else ++ ext_debug(tree, " []"); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_show_leaf(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++#ifdef EXT_DEBUG ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *eh; ++ struct ext3_extent *ex; ++ int i; ++ ++ if (!path) ++ return; ++ ++ eh = path[depth].p_hdr; ++ ex = EXT_FIRST_EXTENT(eh); ++ ++ for (i = 0; i < eh->e_num; i++, ex++) { ++ ext_debug(tree, "%d:%d:%d ", ++ ex->e_block, ex->e_num, ex->e_start); ++ } ++ ext_debug(tree, "\n"); ++#endif ++} ++ ++static void ext3_ext_drop_refs(struct ext3_ext_path *path) ++{ ++ int depth = path->p_depth; ++ int i; ++ ++ for (i = 0; i <= depth; i++, path++) ++ if (path->p_bh) { ++ brelse(path->p_bh); 
++ path->p_bh = NULL; ++ } ++} ++ ++/* ++ * binary search for closest index by given block ++ */ ++static inline void ++ext3_ext_binsearch_idx(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent_idx *ix; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->e_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ EXT_ASSERT(eh->e_num > 0); ++ ++ ext_debug(tree, "binsearch for %d(idx): ", block); ++ ++ path->p_idx = ix = EXT_FIRST_INDEX(eh); ++ ++ r = k = eh->e_num; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ix[l + k].e_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ix += l; ++ path->p_idx = ix; ++ ext_debug(tree, " -> %d->%d ", path->p_idx->e_block, path->p_idx->e_leaf); ++ ++ while (l++ < r) { ++ if (block < ix->e_block) ++ break; ++ path->p_idx = ix++; ++ } ++ ext_debug(tree, " -> %d->%d\n", path->p_idx->e_block, ++ path->p_idx->e_leaf); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent_idx *chix; ++ ++ chix = ix = EXT_FIRST_INDEX(eh); ++ for (k = 0; k < eh->e_num; k++, ix++) { ++ if (k != 0 && ix->e_block <= ix[-1].e_block) { ++ printk("k=%d, ix=0x%p, first=0x%p\n", k, ++ ix, EXT_FIRST_INDEX(eh)); ++ printk("%u <= %u\n", ++ ix->e_block,ix[-1].e_block); ++ } ++ EXT_ASSERT(k == 0 || ix->e_block > ix[-1].e_block); ++ if (block < ix->e_block) ++ break; ++ chix = ix; ++ } ++ EXT_ASSERT(chix == path->p_idx); ++ } ++#endif ++ ++} ++ ++/* ++ * binary search for closest extent by given block ++ */ ++static inline void ++ext3_ext_binsearch(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, int block) ++{ ++ struct ext3_extent_header *eh = path->p_hdr; ++ struct ext3_extent *ex; ++ int l = 0, k, r; ++ ++ EXT_ASSERT(eh->e_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ ++ if (eh->e_num == 0) { ++ /* ++ * this leaf is empty yet: ++ * we get such a leaf in split/add case ++ */ ++ return; ++ } 
++ ++ ext_debug(tree, "binsearch for %d: ", block); ++ ++ path->p_ext = ex = EXT_FIRST_EXTENT(eh); ++ ++ r = k = eh->e_num; ++ while (k > 1) { ++ k = (r - l) / 2; ++ if (block < ex[l + k].e_block) ++ r -= k; ++ else ++ l += k; ++ ext_debug(tree, "%d:%d:%d ", k, l, r); ++ } ++ ++ ex += l; ++ path->p_ext = ex; ++ ext_debug(tree, " -> %d:%d:%d ", path->p_ext->e_block, ++ path->p_ext->e_start, path->p_ext->e_num); ++ ++ while (l++ < r) { ++ if (block < ex->e_block) ++ break; ++ path->p_ext = ex++; ++ } ++ ext_debug(tree, " -> %d:%d:%d\n", path->p_ext->e_block, ++ path->p_ext->e_start, path->p_ext->e_num); ++ ++#ifdef CHECK_BINSEARCH ++ { ++ struct ext3_extent *chex; ++ ++ chex = ex = EXT_FIRST_EXTENT(eh); ++ for (k = 0; k < eh->e_num; k++, ex++) { ++ EXT_ASSERT(k == 0 || ex->e_block > ex[-1].e_block); ++ if (block < ex->e_block) ++ break; ++ chex = ex; ++ } ++ EXT_ASSERT(chex == path->p_ext); ++ } ++#endif ++ ++} ++ ++int ext3_extent_tree_init(handle_t *handle, struct ext3_extents_tree *tree) ++{ ++ struct ext3_extent_header *eh; ++ ++ BUG_ON(tree->buffer_len == 0); ++ ext3_ext_get_access_for_root(handle, tree); ++ eh = EXT_ROOT_HDR(tree); ++ eh->e_depth = 0; ++ eh->e_num = 0; ++ eh->e_magic = EXT3_EXT_MAGIC; ++ eh->e_max = ext3_ext_space_root(tree); ++ ext3_ext_mark_root_dirty(handle, tree); ++ return 0; ++} ++ ++struct ext3_ext_path * ++ext3_ext_find_extent(struct ext3_extents_tree *tree, int block, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ struct buffer_head *bh; ++ int depth, i, ppos = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ eh = EXT_ROOT_HDR(tree); ++ EXT_ASSERT(eh); ++ i = depth = EXT_DEPTH(tree); ++ EXT_ASSERT(eh->e_max); ++ EXT_ASSERT(eh->e_magic == EXT3_EXT_MAGIC); ++ EXT_ASSERT(i == 0 || eh->e_num > 0); ++ ++ /* account possible depth increase */ ++ if (!path) { ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 2), ++ GFP_NOFS); ++ if (!path) ++ return ERR_PTR(-ENOMEM); ++ } 
++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[0].p_hdr = eh; ++ ++ /* walk through the tree */ ++ while (i) { ++ ext_debug(tree, "depth %d: num %d, max %d\n", ++ ppos, eh->e_num, eh->e_max); ++ ext3_ext_binsearch_idx(tree, path + ppos, block); ++ path[ppos].p_block = path[ppos].p_idx->e_leaf; ++ path[ppos].p_depth = i; ++ path[ppos].p_ext = NULL; ++ ++ bh = sb_bread(tree->inode->i_sb, path[ppos].p_block); ++ if (!bh) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ return ERR_PTR(-EIO); ++ } ++ eh = EXT_BLOCK_HDR(bh); ++ ppos++; ++ EXT_ASSERT(ppos <= depth); ++ path[ppos].p_bh = bh; ++ path[ppos].p_hdr = eh; ++ i--; ++ } ++ ++ path[ppos].p_depth = i; ++ path[ppos].p_hdr = eh; ++ path[ppos].p_ext = NULL; ++ ++ /* find extent */ ++ ext3_ext_binsearch(tree, path + ppos, block); ++ ++ ext3_ext_show_path(tree, path); ++ ++ return path; ++} ++ ++/* ++ * insert new index [logical;ptr] into the block at cupr ++ * it check where to insert: before curp or after curp ++ */ ++static int ext3_ext_insert_index(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *curp, ++ int logical, int ptr) ++{ ++ struct ext3_extent_idx *ix; ++ int len, err; ++ ++ if ((err = ext3_ext_get_access(handle, tree, curp))) ++ return err; ++ ++ EXT_ASSERT(logical != curp->p_idx->e_block); ++ len = EXT_MAX_INDEX(curp->p_hdr) - curp->p_idx; ++ if (logical > curp->p_idx->e_block) { ++ /* insert after */ ++ if (curp->p_idx != EXT_LAST_INDEX(curp->p_hdr)) { ++ len = (len - 1) * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d after: %d. " ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ (curp->p_idx + 1), (curp->p_idx + 2)); ++ memmove(curp->p_idx + 2, curp->p_idx + 1, len); ++ } ++ ix = curp->p_idx + 1; ++ } else { ++ /* insert before */ ++ len = len * sizeof(struct ext3_extent_idx); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert new index %d before: %d. 
" ++ "move %d from 0x%p to 0x%p\n", ++ logical, ptr, len, ++ curp->p_idx, (curp->p_idx + 1)); ++ memmove(curp->p_idx + 1, curp->p_idx, len); ++ ix = curp->p_idx; ++ } ++ ++ ix->e_block = logical; ++ ix->e_leaf = ptr; ++ curp->p_hdr->e_num++; ++ ++ EXT_ASSERT(curp->p_hdr->e_num <= curp->p_hdr->e_max); ++ EXT_ASSERT(ix <= EXT_LAST_INDEX(curp->p_hdr)); ++ ++ err = ext3_ext_dirty(handle, tree, curp); ++ ext3_std_error(tree->inode->i_sb, err); ++ ++ return err; ++} ++ ++/* ++ * routine inserts new subtree into the path, using free index entry ++ * at depth 'at: ++ * - allocates all needed blocks (new leaf and all intermediate index blocks) ++ * - makes decision where to split ++ * - moves remaining extens and index entries (right to the split point) ++ * into the newly allocated blocks ++ * - initialize subtree ++ */ ++static int ext3_ext_split(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext, int at) ++{ ++ struct buffer_head *bh = NULL; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct ext3_extent *ex; ++ int i = at, k, m, a; ++ unsigned long newblock, oldblock, border; ++ int *ablocks = NULL; /* array of allocated blocks */ ++ int err = 0; ++ ++ /* make decision: where to split? */ ++ /* FIXME: now desicion is simplest: at current extent */ ++ ++ /* if current leaf will be splitted, then we should use ++ * border from split point */ ++ EXT_ASSERT(path[depth].p_ext <= EXT_MAX_EXTENT(path[depth].p_hdr)); ++ if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ border = path[depth].p_ext[1].e_block; ++ ext_debug(tree, "leaf will be splitted." ++ " next leaf starts at %d\n", ++ (int)border); ++ } else { ++ border = newext->e_block; ++ ext_debug(tree, "leaf will be added." ++ " next leaf starts at %d\n", ++ (int)border); ++ } ++ ++ /* ++ * if error occurs, then we break processing ++ * and turn filesystem read-only. 
so, index won't ++ * be inserted and tree will be in consistent ++ * state. next mount will repair buffers too ++ */ ++ ++ /* ++ * get array to track all allocated blocks ++ * we need this to handle errors and free blocks ++ * upon them ++ */ ++ ablocks = kmalloc(sizeof(unsigned long) * depth, GFP_NOFS); ++ if (!ablocks) ++ return -ENOMEM; ++ memset(ablocks, 0, sizeof(unsigned long) * depth); ++ ++ /* allocate all needed blocks */ ++ ext_debug(tree, "allocate %d blocks for indexes/leaf\n", depth - at); ++ for (a = 0; a < depth - at; a++) { ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ goto cleanup; ++ ablocks[a] = newblock; ++ } ++ ++ /* initialize new leaf */ ++ newblock = ablocks[--a]; ++ EXT_ASSERT(newblock); ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->e_num = 0; ++ neh->e_max = ext3_ext_space_block(tree); ++ neh->e_magic = EXT3_EXT_MAGIC; ++ neh->e_depth = 0; ++ ex = EXT_FIRST_EXTENT(neh); ++ ++ /* move remain of path[depth] to the new leaf */ ++ EXT_ASSERT(path[depth].p_hdr->e_num == path[depth].p_hdr->e_max); ++ /* start copy from next extent */ ++ /* TODO: we could do it by single memmove */ ++ m = 0; ++ path[depth].p_ext++; ++ while (path[depth].p_ext <= ++ EXT_MAX_EXTENT(path[depth].p_hdr)) { ++ ext_debug(tree, "move %d:%d:%d in new leaf %lu\n", ++ path[depth].p_ext->e_block, ++ path[depth].p_ext->e_start, ++ path[depth].p_ext->e_num, ++ newblock); ++ memmove(ex++, path[depth].p_ext++, ++ sizeof(struct ext3_extent)); ++ neh->e_num++; ++ m++; ++ } ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old leaf */ ++ if (m) { ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto 
cleanup; ++ path[depth].p_hdr->e_num -= m; ++ if ((err = ext3_ext_dirty(handle, tree, path + depth))) ++ goto cleanup; ++ ++ } ++ ++ /* create intermediate indexes */ ++ k = depth - at - 1; ++ EXT_ASSERT(k >= 0); ++ if (k) ++ ext_debug(tree, "create %d intermediate indices\n", k); ++ /* insert new index into current index block */ ++ /* current depth stored in i var */ ++ i = depth - 1; ++ while (k--) { ++ oldblock = newblock; ++ newblock = ablocks[--a]; ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ goto cleanup; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) ++ goto cleanup; ++ ++ neh = EXT_BLOCK_HDR(bh); ++ neh->e_num = 1; ++ neh->e_magic = EXT3_EXT_MAGIC; ++ neh->e_max = ext3_ext_space_block_idx(tree); ++ neh->e_depth = depth - i; ++ fidx = EXT_FIRST_INDEX(neh); ++ fidx->e_block = border; ++ fidx->e_leaf = oldblock; ++ ++ ext_debug(tree, "int.index at %d (block %lu): %lu -> %lu\n", ++ i, newblock, border, oldblock); ++ /* copy indexes */ ++ m = 0; ++ path[i].p_idx++; ++ ++ ext_debug(tree, "cur 0x%p, last 0x%p\n", path[i].p_idx, ++ EXT_MAX_INDEX(path[i].p_hdr)); ++ EXT_ASSERT(EXT_MAX_INDEX(path[i].p_hdr) == ++ EXT_LAST_INDEX(path[i].p_hdr)); ++ while (path[i].p_idx <= EXT_MAX_INDEX(path[i].p_hdr)) { ++ ext_debug(tree, "%d: move %d:%d in new index %lu\n", ++ i, path[i].p_idx->e_block, ++ path[i].p_idx->e_leaf, newblock); ++ memmove(++fidx, path[i].p_idx++, ++ sizeof(struct ext3_extent_idx)); ++ neh->e_num++; ++ EXT_ASSERT(neh->e_num <= neh->e_max); ++ m++; ++ } ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto cleanup; ++ brelse(bh); ++ bh = NULL; ++ ++ /* correct old index */ ++ if (m) { ++ err = ext3_ext_get_access(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ path[i].p_hdr->e_num -= m; ++ err = ext3_ext_dirty(handle, tree, path + i); ++ if (err) ++ goto cleanup; ++ } ++ ++ i--; ++ } ++ ++ /* insert new index 
*/ ++ if (!err) ++ err = ext3_ext_insert_index(handle, tree, path + at, ++ border, newblock); ++ ++cleanup: ++ if (bh) { ++ if (buffer_locked(bh)) ++ unlock_buffer(bh); ++ brelse(bh); ++ } ++ ++ if (err) { ++ /* free all allocated blocks in error case */ ++ for (i = 0; i < depth; i++) ++ if (!ablocks[i]) ++ continue; ++ ext3_free_blocks(handle, tree->inode, ablocks[i], 1); ++ } ++ kfree(ablocks); ++ ++ return err; ++} ++ ++/* ++ * routine implements tree growing procedure: ++ * - allocates new block ++ * - moves top-level data (index block or leaf) into the new block ++ * - initialize new top-level, creating index that points to the ++ * just created block ++ */ ++static int ext3_ext_grow_indepth(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp = path; ++ struct ext3_extent_header *neh; ++ struct ext3_extent_idx *fidx; ++ struct buffer_head *bh; ++ unsigned long newblock; ++ int err = 0; ++ ++ newblock = ext3_ext_new_block(handle, tree, path, newext, &err); ++ if (newblock == 0) ++ return err; ++ ++ bh = sb_getblk(tree->inode->i_sb, newblock); ++ if (!bh) { ++ err = -EIO; ++ ext3_std_error(tree->inode->i_sb, err); ++ return err; ++ } ++ lock_buffer(bh); ++ ++ if ((err = ext3_journal_get_create_access(handle, bh))) { ++ unlock_buffer(bh); ++ goto out; ++ } ++ ++ /* move top-level index/leaf into new block */ ++ memmove(bh->b_data, curp->p_hdr, tree->buffer_len); ++ ++ /* set size of new block */ ++ neh = EXT_BLOCK_HDR(bh); ++ /* old root could have indexes or leaves ++ * so calculate e_max right way */ ++ if (EXT_DEPTH(tree)) ++ neh->e_max = ext3_ext_space_block_idx(tree); ++ else ++ neh->e_max = ext3_ext_space_block(tree); ++ neh->e_magic = EXT3_EXT_MAGIC; ++ mark_buffer_uptodate(bh, 1); ++ unlock_buffer(bh); ++ ++ if ((err = ext3_journal_dirty_metadata(handle, bh))) ++ goto out; ++ ++ /* create index in new top-level index: num,max,pointer */ ++ if ((err = 
ext3_ext_get_access(handle, tree, curp))) ++ goto out; ++ ++ curp->p_hdr->e_magic = EXT3_EXT_MAGIC; ++ curp->p_hdr->e_max = ext3_ext_space_root_idx(tree); ++ curp->p_hdr->e_num = 1; ++ curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); ++ /* FIXME: it works, but actually path[0] can be index */ ++ curp->p_idx->e_block = EXT_FIRST_EXTENT(path[0].p_hdr)->e_block; ++ curp->p_idx->e_leaf = newblock; ++ ++ neh = EXT_ROOT_HDR(tree); ++ fidx = EXT_FIRST_INDEX(neh); ++ ext_debug(tree, "new root: num %d(%d), lblock %d, ptr %d\n", ++ neh->e_num, neh->e_max, fidx->e_block, fidx->e_leaf); ++ ++ neh->e_depth = path->p_depth + 1; ++ err = ext3_ext_dirty(handle, tree, curp); ++out: ++ brelse(bh); ++ ++ return err; ++} ++ ++/* ++ * routine finds empty index and adds new leaf. if no free index found ++ * then it requests in-depth growing ++ */ ++static int ext3_ext_create_new_leaf(handle_t *handle, ++ struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_ext_path *curp; ++ int depth, i, err = 0; ++ ++repeat: ++ i = depth = EXT_DEPTH(tree); ++ ++ /* walk up to the tree and look for free index entry */ ++ curp = path + depth; ++ while (i > 0 && !EXT_HAS_FREE_INDEX(curp)) { ++ i--; ++ curp--; ++ } ++ ++ /* we use already allocated block for index block ++ * so, subsequent data blocks should be contigoues */ ++ if (EXT_HAS_FREE_INDEX(curp)) { ++ /* if we found index with free entry, then use that ++ * entry: create all needed subtree and add new leaf */ ++ err = ext3_ext_split(handle, tree, path, newext, i); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->e_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); ++ } else { ++ /* tree is full, time to grow in depth */ ++ err = ext3_ext_grow_indepth(handle, tree, path, newext); ++ ++ /* refill path */ ++ ext3_ext_drop_refs(path); ++ path = ext3_ext_find_extent(tree, newext->e_block, path); ++ if (IS_ERR(path)) ++ err = PTR_ERR(path); 
++ ++ /* ++ * only first (depth 0 -> 1) produces free space ++ * in all other cases we have to split growed tree ++ */ ++ depth = EXT_DEPTH(tree); ++ if (path[depth].p_hdr->e_num == path[depth].p_hdr->e_max) { ++ /* now we need split */ ++ goto repeat; ++ } ++ } ++ ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++/* ++ * returns allocated block in subsequent extent or 0xffffffff ++ * NOTE: it consider block number from index entry as ++ * allocated block. thus, index entries have to be consistent ++ * with leafs ++ */ ++static unsigned long ++ext3_ext_next_allocated_block(struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ if (depth == 0 && path->p_ext == NULL) ++ return 0xffffffff; ++ ++ /* FIXME: what if index isn't full ?! */ ++ while (depth >= 0) { ++ if (depth == path->p_depth) { ++ /* leaf */ ++ if (path[depth].p_ext != ++ EXT_LAST_EXTENT(path[depth].p_hdr)) ++ return path[depth].p_ext[1].e_block; ++ } else { ++ /* index */ ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].e_block; ++ } ++ depth--; ++ } ++ ++ return 0xffffffff; ++} ++ ++/* ++ * returns first allocated block from next leaf or 0xffffffff ++ */ ++static unsigned ext3_ext_next_leaf_block(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth; ++ ++ EXT_ASSERT(path != NULL); ++ depth = path->p_depth; ++ ++ /* zero-tree has no leaf blocks at all */ ++ if (depth == 0) ++ return 0xffffffff; ++ ++ /* go to index block */ ++ depth--; ++ ++ while (depth >= 0) { ++ if (path[depth].p_idx != ++ EXT_LAST_INDEX(path[depth].p_hdr)) ++ return path[depth].p_idx[1].e_block; ++ depth--; ++ } ++ ++ return 0xffffffff; ++} ++ ++/* ++ * if leaf gets modified and modified extent is first in the leaf ++ * then we have to correct all indexes above ++ * TODO: do we need to correct tree in all cases? 
++ */ ++int ext3_ext_correct_indexes(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct ext3_extent_header *eh; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex; ++ unsigned long border; ++ int k, err = 0; ++ ++ eh = path[depth].p_hdr; ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(eh); ++ ++ if (depth == 0) { ++ /* there is no tree at all */ ++ return 0; ++ } ++ ++ if (ex != EXT_FIRST_EXTENT(eh)) { ++ /* we correct tree if first leaf got modified only */ ++ return 0; ++ } ++ ++ /* ++ * TODO: we need correction if border is smaller then current one ++ */ ++ k = depth - 1; ++ border = path[depth].p_ext->e_block; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ return err; ++ path[k].p_idx->e_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ return err; ++ ++ while (k--) { ++ /* change all left-side indexes */ ++ if (path[k+1].p_idx != EXT_FIRST_INDEX(path[k+1].p_hdr)) ++ break; ++ if ((err = ext3_ext_get_access(handle, tree, path + k))) ++ break; ++ path[k].p_idx->e_block = border; ++ if ((err = ext3_ext_dirty(handle, tree, path + k))) ++ break; ++ } ++ ++ return err; ++} ++ ++static int inline ++ext3_can_extents_be_merged(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->e_block + ex1->e_num != ex2->e_block) ++ return 0; ++ ++#ifdef AGRESSIVE_TEST ++ if (ex1->e_num >= 4) ++ return 0; ++#endif ++ ++ if (!tree->mergable) ++ return 1; ++ ++ return tree->mergable(ex1, ex2); ++} ++ ++/* ++ * this routine tries to merge requsted extent into the existing ++ * extent or inserts requested extent as new one into the tree, ++ * creating new leaf in no-space case ++ */ ++int ext3_ext_insert_extent(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *newext) ++{ ++ struct ext3_extent_header * eh; ++ struct ext3_extent *ex, *fex; ++ struct ext3_extent *nearex; /* nearest 
extent */ ++ struct ext3_ext_path *npath = NULL; ++ int depth, len, err, next; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(path[depth].p_hdr); ++ ++ /* try to insert block into found extent and return */ ++ if (ex && ext3_can_extents_be_merged(tree, ex, newext)) { ++ ext_debug(tree, "append %d block to %d:%d (from %d)\n", ++ newext->e_num, ex->e_block, ex->e_num, ++ ex->e_start); ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ return err; ++ ex->e_num += newext->e_num; ++ eh = path[depth].p_hdr; ++ nearex = ex; ++ goto merge; ++ } ++ ++repeat: ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ if (eh->e_num < eh->e_max) ++ goto has_space; ++ ++ /* probably next leaf has space for us? */ ++ fex = EXT_LAST_EXTENT(eh); ++ next = ext3_ext_next_leaf_block(tree, path); ++ if (newext->e_block > fex->e_block && next != 0xffffffff) { ++ ext_debug(tree, "next leaf block - %d\n", next); ++ EXT_ASSERT(!npath); ++ npath = ext3_ext_find_extent(tree, next, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ EXT_ASSERT(npath->p_depth == path->p_depth); ++ eh = npath[depth].p_hdr; ++ if (eh->e_num < eh->e_max) { ++ ext_debug(tree, "next leaf isnt full(%d)\n", ++ eh->e_num); ++ path = npath; ++ goto repeat; ++ } ++ ext_debug(tree, "next leaf hasno free space(%d,%d)\n", ++ eh->e_num, eh->e_max); ++ } ++ ++ /* ++ * there is no free space in found leaf ++ * we're gonna add new leaf in the tree ++ */ ++ err = ext3_ext_create_new_leaf(handle, tree, path, newext); ++ if (err) ++ goto cleanup; ++ depth = EXT_DEPTH(tree); ++ eh = path[depth].p_hdr; ++ ++has_space: ++ nearex = path[depth].p_ext; ++ ++ if ((err = ext3_ext_get_access(handle, tree, path + depth))) ++ goto cleanup; ++ ++ if (!nearex) { ++ /* there is no extent in this leaf, create first one */ ++ ext_debug(tree, "first extent in the leaf: %d:%d:%d\n", ++ newext->e_block, newext->e_start, ++ newext->e_num); ++ path[depth].p_ext = EXT_FIRST_EXTENT(eh); ++ } else if 
(newext->e_block > nearex->e_block) { ++ EXT_ASSERT(newext->e_block != nearex->e_block); ++ if (nearex != EXT_LAST_EXTENT(eh)) { ++ len = EXT_MAX_EXTENT(eh) - nearex; ++ len = (len - 1) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d after: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->e_block, newext->e_start, ++ newext->e_num, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 2, nearex + 1, len); ++ } ++ path[depth].p_ext = nearex + 1; ++ } else { ++ EXT_ASSERT(newext->e_block != nearex->e_block); ++ len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext3_extent); ++ len = len < 0 ? 0 : len; ++ ext_debug(tree, "insert %d:%d:%d before: nearest 0x%p, " ++ "move %d from 0x%p to 0x%p\n", ++ newext->e_block, newext->e_start, newext->e_num, ++ nearex, len, nearex + 1, nearex + 2); ++ memmove(nearex + 1, nearex, len); ++ path[depth].p_ext = nearex; ++ } ++ ++ eh->e_num++; ++ nearex = path[depth].p_ext; ++ nearex->e_block = newext->e_block; ++ nearex->e_start = newext->e_start; ++ nearex->e_num = newext->e_num; ++ ++merge: ++ /* try to merge extents to the right */ ++ while (nearex < EXT_LAST_EXTENT(eh)) { ++ if (!ext3_can_extents_be_merged(tree, nearex, nearex + 1)) ++ break; ++ /* merge with next extent! 
*/ ++ nearex->e_num += nearex[1].e_num; ++ if (nearex + 1 < EXT_LAST_EXTENT(eh)) { ++ len = (EXT_LAST_EXTENT(eh) - nearex - 1) ++ * sizeof(struct ext3_extent); ++ memmove(nearex + 1, nearex + 2, len); ++ } ++ eh->e_num--; ++ EXT_ASSERT(eh->e_num > 0); ++ } ++ ++ /* try to merge extents to the left */ ++ ++ /* time to correct all indexes above */ ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ if (err) ++ goto cleanup; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ ++cleanup: ++ if (npath) { ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ } ++ ext3_ext_tree_changed(tree); ++ return err; ++} ++ ++int ext3_ext_walk_space(struct ext3_extents_tree *tree, unsigned long block, ++ unsigned long num, ext_prepare_callback func) ++{ ++ struct ext3_ext_path *path = NULL; ++ struct ext3_extent *ex, cbex; ++ unsigned long next, start = 0, end = 0; ++ unsigned long last = block + num; ++ int depth, exists, err = 0; ++ ++ EXT_ASSERT(tree); ++ EXT_ASSERT(func); ++ EXT_ASSERT(tree->inode); ++ EXT_ASSERT(tree->root); ++ ++ while (block < last && block != 0xfffffffff) { ++ num = last - block; ++ /* find extent for this block */ ++ path = ext3_ext_find_extent(tree, block, path); ++ if (IS_ERR(path)) { ++ err = PTR_ERR(path); ++ path = NULL; ++ break; ++ } ++ ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(path[depth].p_hdr); ++ ex = path[depth].p_ext; ++ next = ext3_ext_next_allocated_block(path); ++ ++ exists = 0; ++ if (!ex) { ++ /* there is no extent yet, so try to allocate ++ * all requested space */ ++ start = block; ++ end = block + num; ++ } else if (ex->e_block > block) { ++ /* need to allocate space before found extent */ ++ start = block; ++ end = ex->e_block; ++ if (block + num < end) ++ end = block + num; ++ } else if (block >= ex->e_block + ex->e_num) { ++ /* need to allocate space after found extent */ ++ start = block; ++ end = block + num; ++ if (end >= next) ++ end = next; ++ } else if (block >= ex->e_block) { ++ /* ++ * some part of requested space 
is covered ++ * by found extent ++ */ ++ start = block; ++ end = ex->e_block + ex->e_num; ++ if (block + num < end) ++ end = block + num; ++ exists = 1; ++ } else { ++ BUG(); ++ } ++ EXT_ASSERT(end > start); ++ ++ if (!exists) { ++ cbex.e_block = start; ++ cbex.e_num = end - start; ++ cbex.e_start = 0; ++ } else ++ cbex = *ex; ++ ++ EXT_ASSERT(path[depth].p_hdr); ++ err = func(tree, path, &cbex, exists); ++ ext3_ext_drop_refs(path); ++ ++ if (err < 0) ++ break; ++ if (err == EXT_REPEAT) ++ continue; ++ else if (err == EXT_BREAK) { ++ err = 0; ++ break; ++ } ++ ++ if (EXT_DEPTH(tree) != depth) { ++ /* depth was changed. we have to realloc path */ ++ kfree(path); ++ path = NULL; ++ } ++ ++ block = cbex.e_block + cbex.e_num; ++ } ++ ++ if (path) { ++ ext3_ext_drop_refs(path); ++ kfree(path); ++ } ++ ++ return err; ++} ++ ++void ext3_ext_invalidate_cache(struct ext3_extents_tree *tree) ++{ ++ if (tree->cex) ++ tree->cex->e_num = 0; ++} ++ ++static inline void ++ext3_ext_put_in_cache(struct ext3_extents_tree *tree, struct ext3_extent *ex) ++{ ++ if (tree->cex) { ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->e_num); ++ tree->cex->e_block = ex->e_block; ++ tree->cex->e_start = ex->e_start; ++ tree->cex->e_num = ex->e_num; ++ } ++} ++ ++/* ++ * this routine calculate boundaries of the gap requested block fits into ++ * and cache this gap ++ */ ++static inline void ++ext3_ext_put_gap_in_cache(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ int depth = EXT_DEPTH(tree); ++ struct ext3_extent *ex, gex; ++ ++ if (!tree->cex) ++ return; ++ ++ ex = path[depth].p_ext; ++ if (ex == NULL) { ++ /* there is no extent yet, so gap is [0;-] */ ++ gex.e_block = 0; ++ gex.e_num = 0xffffffff; ++ ext_debug(tree, "cache gap(whole file):"); ++ } else if (block < ex->e_block) { ++ gex.e_block = block; ++ gex.e_num = ex->e_block - block; ++ ext_debug(tree, "cache gap(before): %lu [%lu:%lu]", ++ (unsigned long) block, ++ (unsigned long) ex->e_block, ++ 
(unsigned long) ex->e_num); ++ } else if (block >= ex->e_block + ex->e_num) { ++ gex.e_block = ex->e_block + ex->e_num; ++ gex.e_num = ext3_ext_next_allocated_block(path); ++ ext_debug(tree, "cache gap(after): [%lu:%lu] %lu", ++ (unsigned long) ex->e_block, ++ (unsigned long) ex->e_num, ++ (unsigned long) block); ++ EXT_ASSERT(gex.e_num > gex.e_block); ++ gex.e_num = gex.e_num - gex.e_block; ++ } else { ++ BUG(); ++ } ++ ++ ext_debug(tree, " -> %lu:%lu\n", (unsigned long) gex.e_block, ++ (unsigned long) gex.e_num); ++ gex.e_start = 0xffffffff; ++ ext3_ext_put_in_cache(tree, &gex); ++} ++ ++static inline int ++ext3_ext_in_cache(struct ext3_extents_tree *tree, unsigned long block, ++ struct ext3_extent *ex) ++{ ++ struct ext3_extent *cex = tree->cex; ++ ++ /* is there cache storage at all? */ ++ if (!cex) ++ return 0; ++ ++ /* has cache valid data? */ ++ if (cex->e_num == 0) ++ return 0; ++ ++ if (block >= cex->e_block && block < cex->e_block + cex->e_num) { ++ ex->e_block = cex->e_block; ++ ex->e_start = cex->e_start; ++ ex->e_num = cex->e_num; ++ ext_debug(tree, "%lu cached by %lu:%lu:%lu\n", ++ (unsigned long) block, ++ (unsigned long) ex->e_block, ++ (unsigned long) ex->e_num, ++ (unsigned long) ex->e_start); ++ return 1; ++ } ++ ++ /* not in cache */ ++ return 0; ++} ++ ++/* ++ * routine removes index from the index block ++ * it's used in truncate case only. 
thus all requests are for ++ * last index in the block only ++ */ ++int ext3_ext_rm_idx(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ struct buffer_head *bh; ++ int err; ++ ++ /* free index block */ ++ path--; ++ EXT_ASSERT(path->p_hdr->e_num); ++ if ((err = ext3_ext_get_access(handle, tree, path))) ++ return err; ++ path->p_hdr->e_num--; ++ if ((err = ext3_ext_dirty(handle, tree, path))) ++ return err; ++ ext_debug(tree, "index is empty, remove it, free block %d\n", ++ path->p_idx->e_leaf); ++ bh = sb_get_hash_table(tree->inode->i_sb, path->p_idx->e_leaf); ++ ext3_forget(handle, 1, tree->inode, bh, path->p_idx->e_leaf); ++ ext3_free_blocks(handle, tree->inode, path->p_idx->e_leaf, 1); ++ return err; ++} ++ ++int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path) ++{ ++ int depth = EXT_DEPTH(tree); ++ int needed; ++ ++ if (path) { ++ /* probably there is space in leaf? */ ++ if (path[depth].p_hdr->e_num < path[depth].p_hdr->e_max) ++ return 1; ++ } ++ ++ /* ++ * the worste case we're expecting is creation of the ++ * new root (growing in depth) with index splitting ++ * for splitting we have to consider depth + 1 because ++ * previous growing could increase it ++ */ ++ depth = depth + 1; ++ ++ /* ++ * growing in depth: ++ * block allocation + new root + old root ++ */ ++ needed = EXT3_ALLOC_NEEDED + 2; ++ ++ /* index split. 
we may need: ++ * allocate intermediate indexes and new leaf ++ * change two blocks at each level, but root ++ * modify root block (inode) ++ */ ++ needed += (depth * EXT3_ALLOC_NEEDED) + (2 * depth) + 1; ++ ++ return needed; ++} ++ ++static int ++ext3_ext_split_for_rm(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, tex; ++ struct ext3_ext_path *npath; ++ int depth, creds, err; ++ ++ depth = EXT_DEPTH(tree); ++ ex = path[depth].p_ext; ++ EXT_ASSERT(ex); ++ EXT_ASSERT(end < ex->e_block + ex->e_num - 1); ++ EXT_ASSERT(ex->e_block < start); ++ ++ /* calculate tail extent */ ++ tex.e_block = end + 1; ++ EXT_ASSERT(tex.e_block < ex->e_block + ex->e_num); ++ tex.e_num = ex->e_block + ex->e_num - tex.e_block; ++ ++ creds = ext3_ext_calc_credits_for_insert(tree, path); ++ handle = ext3_ext_journal_restart(handle, creds); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ /* calculate head extent. use primary extent */ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ return err; ++ ex->e_num = start - ex->e_block; ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ return err; ++ ++ /* FIXME: some callback to free underlying resource ++ * and correct e_start? 
*/ ++ ext_debug(tree, "split extent: head %u:%u, tail %u:%u\n", ++ ex->e_block, ex->e_num, tex.e_block, tex.e_num); ++ ++ npath = ext3_ext_find_extent(tree, ex->e_block, NULL); ++ if (IS_ERR(npath)) ++ return PTR_ERR(npath); ++ depth = EXT_DEPTH(tree); ++ EXT_ASSERT(npath[depth].p_ext->e_block == ex->e_block); ++ EXT_ASSERT(npath[depth].p_ext->e_num == ex->e_num); ++ ++ err = ext3_ext_insert_extent(handle, tree, npath, &tex); ++ ext3_ext_drop_refs(npath); ++ kfree(npath); ++ ++ return err; ++ ++} ++ ++static int ++ext3_ext_rm_leaf(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, unsigned long start, ++ unsigned long end) ++{ ++ struct ext3_extent *ex, *fu = NULL, *lu, *le; ++ int err = 0, correct_index = 0; ++ int depth = EXT_DEPTH(tree), credits; ++ struct ext3_extent_header *eh; ++ unsigned a, b, block, num; ++ ++ ext_debug(tree, "remove [%lu:%lu] in leaf\n", start, end); ++ if (!path[depth].p_hdr) ++ path[depth].p_hdr = EXT_BLOCK_HDR(path[depth].p_bh); ++ eh = path[depth].p_hdr; ++ EXT_ASSERT(eh); ++ EXT_ASSERT(eh->e_num <= eh->e_max); ++ EXT_ASSERT(eh->e_magic == EXT3_EXT_MAGIC); ++ ++ /* find where to start removing */ ++ le = ex = EXT_LAST_EXTENT(eh); ++ while (ex != EXT_FIRST_EXTENT(eh)) { ++ if (ex->e_block <= end) ++ break; ++ ex--; ++ } ++ ++ if (start > ex->e_block && end < ex->e_block + ex->e_num - 1) { ++ /* removal of internal part of the extent requested ++ * tail and head must be placed in different extent ++ * so, we have to insert one more extent */ ++ path[depth].p_ext = ex; ++ return ext3_ext_split_for_rm(handle, tree, path, start, end); ++ } ++ ++ lu = ex; ++ while (ex >= EXT_FIRST_EXTENT(eh) && ++ ex->e_block + ex->e_num > start) { ++ ext_debug(tree, "remove ext %u:%u\n", ex->e_block, ex->e_num); ++ path[depth].p_ext = ex; ++ ++ a = ex->e_block > start ? ex->e_block : start; ++ b = ex->e_block + ex->e_num - 1 < end ? 
++ ex->e_block + ex->e_num - 1 : end; ++ ++ ext_debug(tree, " border %u:%u\n", a, b); ++ ++ if (a != ex->e_block && b != ex->e_block + ex->e_num - 1) { ++ block = 0; ++ num = 0; ++ BUG(); ++ } else if (a != ex->e_block) { ++ /* remove tail of the extent */ ++ block = ex->e_block; ++ num = a - block; ++ } else if (b != ex->e_block + ex->e_num - 1) { ++ /* remove head of the extent */ ++ block = a; ++ num = b - a; ++ } else { ++ /* remove whole extent: excelent! */ ++ block = ex->e_block; ++ num = 0; ++ EXT_ASSERT(a == ex->e_block && ++ b == ex->e_block + ex->e_num - 1); ++ } ++ ++ if (ex == EXT_FIRST_EXTENT(eh)) ++ correct_index = 1; ++ ++ credits = 1; ++ if (correct_index) ++ credits += (EXT_DEPTH(tree) * EXT3_ALLOC_NEEDED) + 1; ++ if (tree->remove_extent_credits) ++ credits += tree->remove_extent_credits(tree, ex, a, b); ++ ++ handle = ext3_ext_journal_restart(handle, credits); ++ if (IS_ERR(handle)) { ++ err = PTR_ERR(handle); ++ goto out; ++ } ++ ++ err = ext3_ext_get_access(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ if (tree->remove_extent) ++ err = tree->remove_extent(tree, ex, a, b); ++ if (err) ++ goto out; ++ ++ if (num == 0) { ++ /* this extent is removed entirely mark slot unused */ ++ ex->e_start = 0; ++ eh->e_num--; ++ fu = ex; ++ } ++ ++ ex->e_block = block; ++ ex->e_num = num; ++ ++ err = ext3_ext_dirty(handle, tree, path + depth); ++ if (err) ++ goto out; ++ ++ ext_debug(tree, "new extent: %u:%u:%u\n", ++ ex->e_block, ex->e_num, ex->e_start); ++ ex--; ++ } ++ ++ if (fu) { ++ /* reuse unused slots */ ++ while (lu < le) { ++ if (lu->e_start) { ++ *fu = *lu; ++ lu->e_start = 0; ++ fu++; ++ } ++ lu++; ++ } ++ } ++ ++ if (correct_index && eh->e_num) ++ err = ext3_ext_correct_indexes(handle, tree, path); ++ ++ /* if this leaf is free, then we should ++ * remove it from index block above */ ++ if (err == 0 && eh->e_num == 0 && path[depth].p_bh != NULL) ++ err = ext3_ext_rm_idx(handle, tree, path + depth); ++ ++out: ++ return err; ++} ++ ++ 
++static struct ext3_extent_idx * ++ext3_ext_last_covered(struct ext3_extent_header *hdr, unsigned long block) ++{ ++ struct ext3_extent_idx *ix; ++ ++ ix = EXT_LAST_INDEX(hdr); ++ while (ix != EXT_FIRST_INDEX(hdr)) { ++ if (ix->e_block <= block) ++ break; ++ ix--; ++ } ++ return ix; ++} ++ ++/* ++ * returns 1 if current index have to be freed (even partial) ++ */ ++static int inline ++ext3_ext_more_to_rm(struct ext3_ext_path *path) ++{ ++ EXT_ASSERT(path->p_idx); ++ ++ if (path->p_idx < EXT_FIRST_INDEX(path->p_hdr)) ++ return 0; ++ ++ /* ++ * if truncate on deeper level happened it it wasn't partial ++ * so we have to consider current index for truncation ++ */ ++ if (path->p_hdr->e_num == path->p_block) ++ return 0; ++ return 1; ++} ++ ++int ext3_ext_remove_space(struct ext3_extents_tree *tree, ++ unsigned long start, unsigned long end) ++{ ++ struct inode *inode = tree->inode; ++ struct super_block *sb = inode->i_sb; ++ int depth = EXT_DEPTH(tree); ++ struct ext3_ext_path *path; ++ handle_t *handle; ++ int i = 0, err = 0; ++ ++ ext_debug(tree, "space to be removed: %lu:%lu\n", start, end); ++ ++ /* probably first extent we're gonna free will be last in block */ ++ handle = ext3_journal_start(inode, depth + 1); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ ++ ext3_ext_invalidate_cache(tree); ++ ++ /* ++ * we start scanning from right side freeing all the blocks ++ * after i_size and walking into the deep ++ */ ++ path = kmalloc(sizeof(struct ext3_ext_path) * (depth + 1), GFP_KERNEL); ++ if (IS_ERR(path)) { ++ ext3_error(sb, "ext3_ext_remove_space", ++ "Can't allocate path array"); ++ ext3_journal_stop(handle, inode); ++ return -ENOMEM; ++ } ++ memset(path, 0, sizeof(struct ext3_ext_path) * (depth + 1)); ++ path[i].p_hdr = EXT_ROOT_HDR(tree); ++ ++ while (i >= 0 && err == 0) { ++ if (i == depth) { ++ /* this is leaf block */ ++ err = ext3_ext_rm_leaf(handle, tree, path, start, end); ++ /* root level have p_bh == NULL, brelse() eats this */ ++ 
brelse(path[i].p_bh); ++ i--; ++ continue; ++ } ++ ++ /* this is index block */ ++ if (!path[i].p_hdr) { ++ ext_debug(tree, "initialize header\n"); ++ path[i].p_hdr = EXT_BLOCK_HDR(path[i].p_bh); ++ } ++ ++ EXT_ASSERT(path[i].p_hdr->e_num <= path[i].p_hdr->e_max); ++ EXT_ASSERT(path[i].p_hdr->e_magic == EXT3_EXT_MAGIC); ++ ++ if (!path[i].p_idx) { ++ /* this level hasn't touched yet */ ++ path[i].p_idx = ++ ext3_ext_last_covered(path[i].p_hdr, end); ++ path[i].p_block = path[i].p_hdr->e_num + 1; ++ ext_debug(tree, "init index ptr: hdr 0x%p, num %d\n", ++ path[i].p_hdr, path[i].p_hdr->e_num); ++ } else { ++ /* we've already was here, see at next index */ ++ path[i].p_idx--; ++ } ++ ++ ext_debug(tree, "level %d - index, first 0x%p, cur 0x%p\n", ++ i, EXT_FIRST_INDEX(path[i].p_hdr), ++ path[i].p_idx); ++ if (ext3_ext_more_to_rm(path + i)) { ++ /* go to the next level */ ++ ext_debug(tree, "move to level %d (block %d)\n", ++ i + 1, path[i].p_idx->e_leaf); ++ memset(path + i + 1, 0, sizeof(*path)); ++ path[i+1].p_bh = sb_bread(sb, path[i].p_idx->e_leaf); ++ if (!path[i+1].p_bh) { ++ /* should we reset i_size? 
*/ ++ err = -EIO; ++ break; ++ } ++ /* put actual number of indexes to know is this ++ * number got changed at the next iteration */ ++ path[i].p_block = path[i].p_hdr->e_num; ++ i++; ++ } else { ++ /* we finish processing this index, go up */ ++ if (path[i].p_hdr->e_num == 0 && i > 0) { ++ /* index is empty, remove it ++ * handle must be already prepared by the ++ * truncate_leaf() */ ++ err = ext3_ext_rm_idx(handle, tree, path + i); ++ } ++ /* root level have p_bh == NULL, brelse() eats this */ ++ brelse(path[i].p_bh); ++ i--; ++ ext_debug(tree, "return to level %d\n", i); ++ } ++ } ++ ++ /* TODO: flexible tree reduction should be here */ ++ if (path->p_hdr->e_num == 0) { ++ /* ++ * truncate to zero freed all the tree ++ * so, we need to correct e_depth ++ */ ++ err = ext3_ext_get_access(handle, tree, path); ++ if (err == 0) { ++ EXT_ROOT_HDR(tree)->e_depth = 0; ++ err = ext3_ext_dirty(handle, tree, path); ++ } ++ } ++ ext3_ext_tree_changed(tree); ++ ++ kfree(path); ++ ext3_journal_stop(handle, inode); ++ ++ return err; ++} ++ ++/* ++ * called at mount time ++ */ ++void ext3_ext_init(struct super_block *sb) ++{ ++ /* ++ * possible initialization would be here ++ */ ++ ++ if (test_opt(sb, EXTENTS)) { ++ printk("EXT3-fs: file extents enabled"); ++#ifdef AGRESSIVE_TEST ++ printk(", agressive tests"); ++#endif ++#ifdef CHECK_BINSEARCH ++ printk(", check binsearch"); ++#endif ++ printk("\n"); ++ } ++} ++ ++/* ++ * called at umount time ++ */ ++void ext3_ext_release(struct super_block *sb) ++{ ++} ++ ++/************************************************************************ ++ * VFS related routines ++ ************************************************************************/ ++ ++static int ext3_get_inode_write_access(handle_t *handle, void *buffer) ++{ ++ /* we use in-core data, not bh */ ++ return 0; ++} ++ ++static int ext3_mark_buffer_dirty(handle_t *handle, void *buffer) ++{ ++ struct inode *inode = buffer; ++ return ext3_mark_inode_dirty(handle, inode); ++} ++ 
++static int ext3_ext_mergable(struct ext3_extent *ex1, ++ struct ext3_extent *ex2) ++{ ++ if (ex1->e_start + ex1->e_num == ex2->e_start) ++ return 1; ++ return 0; ++} ++ ++static int ++ext3_remove_blocks_credits(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed; ++ ++ /* at present, extent can't cross block group */; ++ needed = 3; /* bitmap + group desc + sb */ ++ ++#ifdef CONFIG_QUOTA ++ needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS; ++#endif ++ return needed; ++} ++ ++static int ++ext3_remove_blocks(struct ext3_extents_tree *tree, ++ struct ext3_extent *ex, ++ unsigned long from, unsigned long to) ++{ ++ int needed = ext3_remove_blocks_credits(tree, ex, from, to); ++ handle_t *handle = ext3_journal_start(tree->inode, needed); ++ struct buffer_head *bh; ++ int i; ++ ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ if (from >= ex->e_block && to == ex->e_block + ex->e_num - 1) { ++ /* tail removal */ ++ unsigned long num, start; ++ num = ex->e_block + ex->e_num - from; ++ start = ex->e_start + ex->e_num - num; ++ ext_debug(tree, "free last %lu blocks starting %lu\n", ++ num, start); ++ for (i = 0; i < num; i++) { ++ bh = sb_get_hash_table(tree->inode->i_sb, start + i); ++ ext3_forget(handle, 0, tree->inode, bh, start + i); ++ } ++ ext3_free_blocks(handle, tree->inode, start, num); ++ } else if (from == ex->e_block && to <= ex->e_block + ex->e_num - 1) { ++ printk("strange request: removal %lu-%lu from %u:%u\n", ++ from, to, ex->e_block, ex->e_num); ++ } else { ++ printk("strange request: removal(2) %lu-%lu from %u:%u\n", ++ from, to, ex->e_block, ex->e_num); ++ } ++ ext3_journal_stop(handle, tree->inode); ++ return 0; ++} ++ ++int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path, ++ unsigned long block) ++{ ++ struct ext3_inode_info *ei = EXT3_I(inode); ++ unsigned long bg_start; ++ unsigned long colour; ++ int depth; ++ ++ if (path) { ++ struct ext3_extent *ex; ++ depth = 
path->p_depth; ++ ++ /* try to predict block placement */ ++ if ((ex = path[depth].p_ext)) ++ return ex->e_start + (block - ex->e_block); ++ ++ /* it looks index is empty ++ * try to find starting from index itself */ ++ if (path[depth].p_bh) ++ return path[depth].p_bh->b_blocknr; ++ } ++ ++ /* OK. use inode's group */ ++ bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) + ++ le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block); ++ colour = (current->pid % 16) * ++ (EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16); ++ return bg_start + colour + block; ++} ++ ++static int ext3_new_block_cb(handle_t *handle, struct ext3_extents_tree *tree, ++ struct ext3_ext_path *path, ++ struct ext3_extent *ex, int *err) ++{ ++ struct inode *inode = tree->inode; ++ int newblock, goal; ++ ++ EXT_ASSERT(path); ++ EXT_ASSERT(ex); ++ EXT_ASSERT(ex->e_start); ++ EXT_ASSERT(ex->e_num); ++ ++ /* reuse block from the extent to order data/metadata */ ++ newblock = ex->e_start++; ++ ex->e_num--; ++ if (ex->e_num == 0) { ++ ex->e_num = 1; ++ /* allocate new block for the extent */ ++ goal = ext3_ext_find_goal(inode, path, ex->e_block); ++ ex->e_start = ext3_new_block(handle, inode, goal, 0, 0, err); ++ if (ex->e_start == 0) { ++ /* error occured: restore old extent */ ++ ex->e_start = newblock; ++ return 0; ++ } ++ } ++ return newblock; ++} ++ ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, ++ struct inode *inode) ++{ ++ tree->inode = inode; ++ tree->root = (void *) EXT3_I(inode)->i_data; ++ tree->get_write_access = ext3_get_inode_write_access; ++ tree->mark_buffer_dirty = ext3_mark_buffer_dirty; ++ tree->mergable = ext3_ext_mergable; ++ tree->new_block = ext3_new_block_cb; ++ tree->remove_extent = ext3_remove_blocks; ++ tree->remove_extent_credits = ext3_remove_blocks_credits; ++ tree->buffer = (void *) inode; ++ tree->buffer_len = sizeof(EXT3_I(inode)->i_data); ++ tree->cex = (struct ext3_extent *) &EXT3_I(inode)->i_cached_extent; ++} ++ ++#if 
EXT3_MULTIBLOCK_ALLOCATOR
++/*
++ * ext3_ext_walk_space() callback used by ext3_ext_allocate_nblocks().
++ * Ranges already backed by an extent (exist != 0) are skipped.  For a
++ * hole it starts a journal handle, asks ext3_new_blocks() for up to
++ * newex->e_num blocks and inserts the resulting extent into the tree.
++ *
++ * truncate_sem is held on entry and on return, but it is dropped while
++ * the handle is started; if the tree generation changed in between,
++ * EXT_REPEAT is returned because the path may no longer be valid.
++ *
++ * Returns EXT_CONTINUE on success, EXT_REPEAT, PTR_ERR() of the handle,
++ * or the error set by the allocation / insert / mark-dirty calls.
++ */
++static int
++ext3_ext_new_extent_cb(struct ext3_extents_tree *tree,
++			struct ext3_ext_path *path,
++			struct ext3_extent *newex, int exist)
++{
++	struct inode *inode = tree->inode;
++	struct buffer_head *bh;
++	int count, err, goal;
++	unsigned long pblock;
++	unsigned long tgen;
++	loff_t new_i_size;
++	handle_t *handle;
++	int i;
++
++	if (exist)
++		return EXT_CONTINUE;
++
++	/* remember the generation so a concurrent change can be detected */
++	tgen = EXT_GENERATION(tree);
++	count = ext3_ext_calc_credits_for_insert(tree, path);
++	up_write(&EXT3_I(inode)->truncate_sem);
++
++	handle = ext3_journal_start(inode, count + EXT3_ALLOC_NEEDED + 1);
++	if (IS_ERR(handle)) {
++		down_write(&EXT3_I(inode)->truncate_sem);
++		return PTR_ERR(handle);
++	}
++
++	if (tgen != EXT_GENERATION(tree)) {
++		/* the tree has changed, so path can be invalid at the moment */
++		ext3_journal_stop(handle, inode);
++		down_write(&EXT3_I(inode)->truncate_sem);
++		return EXT_REPEAT;
++	}
++
++	down_write(&EXT3_I(inode)->truncate_sem);
++	goal = ext3_ext_find_goal(inode, path, newex->e_block);
++	count = newex->e_num;
++	pblock = ext3_new_blocks(handle, inode, &count, goal, &err);
++	if (!pblock)
++		goto out;
++	EXT_ASSERT(count <= newex->e_num);
++
++	/* insert new extent */
++	newex->e_start = pblock;
++	newex->e_num = count;
++	err = ext3_ext_insert_extent(handle, tree, path, newex);
++	if (err)
++		goto out;
++
++	/* blocks have been allocated for data, so it is time to drop the
++	 * dirty state of the corresponding buffer_heads to prevent corruption */
++	for (i = 0; i < newex->e_num; i++) {
++		bh = sb_get_hash_table(inode->i_sb, newex->e_start + i);
++		if (bh) {
++			mark_buffer_clean(bh);
++			wait_on_buffer(bh);
++			clear_bit(BH_Req, &bh->b_state);
++			__brelse(bh);
++		}
++	}
++
++	/* correct on-disk inode size */
++	if (newex->e_num > 0) {
++		new_i_size = (loff_t) newex->e_block + newex->e_num;
++		new_i_size = new_i_size << inode->i_blkbits;
++		if (new_i_size > EXT3_I(inode)->i_disksize) {
++			EXT3_I(inode)->i_disksize = new_i_size;
++			err = ext3_mark_inode_dirty(handle, inode);
++		}
++	}
++
++out:
++	ext3_journal_stop(handle, inode);
++	return err;
++}
++
++
++/*
++ * Hand the range (block, num) of the inode to ext3_ext_walk_space()
++ * under truncate_sem, with ext3_ext_new_extent_cb() as the callback that
++ * allocates the holes.  The extent cache is invalidated afterwards.
++ * Returns whatever ext3_ext_walk_space() returns.
++ */
++int ext3_ext_allocate_nblocks(struct inode *inode, unsigned long block,
++				unsigned long num)
++{
++	struct ext3_extents_tree tree;
++	int err;
++
++	ext3_init_tree_desc(&tree, inode);
++	ext_debug(&tree, "blocks %lu-%lu requested for inode %u\n",
++			block, block + num,(unsigned) inode->i_ino);
++	down_write(&EXT3_I(inode)->truncate_sem);
++	err = ext3_ext_walk_space(&tree, block, num, ext3_ext_new_extent_cb);
++	ext3_ext_invalidate_cache(&tree);
++	up_write(&EXT3_I(inode)->truncate_sem);
++
++	return err;
++}
++#endif
++
++/*
++ * Map logical block iblock of an extent-based inode into bh_result.
++ *
++ * The cached extent is consulted first, then the tree.  If the block is
++ * not mapped and create is zero, bh_result is left unmapped (and, on the
++ * tree lookup path, the gap is put into the cache).  If create is set, one
++ * block is allocated, inserted as a one-block extent and BH_New is set.
++ * The whole lookup runs under truncate_sem.
++ *
++ * Returns 0, or the error from the lookup, allocation or insert.
++ */
++int ext3_ext_get_block(handle_t *handle, struct inode *inode,
++			long iblock, struct buffer_head *bh_result, int create)
++{
++	struct ext3_ext_path *path = NULL;
++	struct ext3_extent newex;
++	struct ext3_extent *ex;
++	int goal, newblock, err = 0, depth;
++	struct ext3_extents_tree tree;
++
++	clear_bit(BH_New, &bh_result->b_state);
++	ext3_init_tree_desc(&tree, inode);
++	ext_debug(&tree, "block %d requested for inode %u\n",
++			(int) iblock, (unsigned) inode->i_ino);
++	down_write(&EXT3_I(inode)->truncate_sem);
++
++	/* check in cache */
++	if (ext3_ext_in_cache(&tree, iblock, &newex)) {
++		if (newex.e_start == 0xffffffff) {
++			/* cached gap: only go on (to allocate) if create is set */
++			if (!create)
++				goto out2;
++		} else if (newex.e_start) {
++			/* block is already allocated */
++			newblock = iblock - newex.e_block + newex.e_start;
++			goto out;
++		}
++	}
++
++	/* find extent for this block */
++	path = ext3_ext_find_extent(&tree, iblock, NULL);
++	if (IS_ERR(path)) {
++		err = PTR_ERR(path);
++		path = NULL;
++		goto out2;
++	}
++
++	depth = EXT_DEPTH(&tree);
++
++	/*
++	 * consistent leaf must not be empty
++	 * this situation is possible, though, _during_ tree modification
++	 * this is why assert can't be put in ext3_ext_find_extent()
++	 */
++	EXT_ASSERT(path[depth].p_ext != NULL || depth == 0);
++
++	if ((ex = path[depth].p_ext)) {
++		/* if the found extent covers the block, simply return it */
++		if (iblock >= ex->e_block && iblock < ex->e_block + ex->e_num) {
++			newblock = iblock - ex->e_block + ex->e_start;
++			ext_debug(&tree, "%d fit into %d:%d -> %d\n",
++					(int) iblock, ex->e_block, ex->e_num,
++					newblock);
++			ext3_ext_put_in_cache(&tree, ex);
++			goto out;
++		}
++	}
++
++	/*
++	 * requested block isn't allocated yet
++	 * we must not try to create the block if the create flag is zero
++	 */
++	if (!create) {
++		/* put the gap just found into cache to speed up subsequent reqs */
++		ext3_ext_put_gap_in_cache(&tree, path, iblock);
++		goto out2;
++	}
++
++	/* allocate new block */
++	goal = ext3_ext_find_goal(inode, path, iblock);
++	newblock = ext3_new_block(handle, inode, goal, 0, 0, &err);
++	if (!newblock)
++		goto out2;
++	ext_debug(&tree, "allocate new block: goal %d, found %d\n",
++			goal, newblock);
++
++	/* try to insert new extent into found leaf and return */
++	newex.e_block = iblock;
++	newex.e_start = newblock;
++	newex.e_num = 1;
++	err = ext3_ext_insert_extent(handle, &tree, path, &newex);
++	/* NOTE(review): the block obtained from ext3_new_block() above is
++	 * not released when the insert fails - confirm and free it here */
++	if (err)
++		goto out2;
++
++	if (inode->i_size > EXT3_I(inode)->i_disksize)
++		EXT3_I(inode)->i_disksize = inode->i_size;
++
++	/* previous routine could use block we allocated */
++	newblock = newex.e_start;
++	set_bit(BH_New, &bh_result->b_state);
++
++	ext3_ext_put_in_cache(&tree, &newex);
++out:
++	/* path is still NULL here when the answer came from the cache */
++	ext3_ext_show_leaf(&tree, path);
++	set_bit(BH_Mapped, &bh_result->b_state);
++	bh_result->b_dev = inode->i_sb->s_dev;
++	bh_result->b_blocknr = newblock;
++out2:
++	if (path) {
++		ext3_ext_drop_refs(path);
++		kfree(path);
++	}
++	up_write(&EXT3_I(inode)->truncate_sem);
++
++	return err;
++}
++
++/*
++ * Truncate an extent-based inode to inode->i_size.
++ *
++ * Calls ext3_block_truncate_page() for i_size, adds the inode to the
++ * orphan list, records i_size as i_disksize and asks
++ * ext3_ext_remove_space() to drop everything from the first block past
++ * i_size.  Nothing is reported to the caller: if the journal handle
++ * cannot be started the function just returns.
++ */
++void ext3_ext_truncate(struct inode * inode)
++{
++	struct address_space *mapping = inode->i_mapping;
++	struct super_block *sb = inode->i_sb;
++	struct ext3_extents_tree tree;
++	unsigned long last_block;
++	handle_t *handle;
++	int err = 0;
++
++	ext3_init_tree_desc(&tree, inode);
++
++	/*
++	 * probably first extent we're gonna free will be last in block
++	 */
++	err = ext3_writepage_trans_blocks(inode) + 3;
++	handle = ext3_journal_start(inode, err);
++	if (IS_ERR(handle))
++		return;
++
++	ext3_block_truncate_page(handle, mapping, inode->i_size);
++
++	down_write(&EXT3_I(inode)->truncate_sem);
++	ext3_ext_invalidate_cache(&tree);
++
++	/*
++	 * TODO: optimization is possible here
++	 * probably we need no scanning at all,
++	 * because page truncation is enough
++	 */
++	if (ext3_orphan_add(handle, inode))
++		goto out_stop;
++
++	/* we have to know where to truncate from in crash case */
++	EXT3_I(inode)->i_disksize = inode->i_size;
++	ext3_mark_inode_dirty(handle, inode);
++
++	/* round i_size up to a block boundary: first block to remove */
++	last_block = (inode->i_size + sb->s_blocksize - 1)
++			>> EXT3_BLOCK_SIZE_BITS(sb);
++	err = ext3_ext_remove_space(&tree, last_block, 0xffffffff);
++
++	/* In a multi-transaction truncate, we only make the final
++	 * transaction synchronous */
++	if (IS_SYNC(inode))
++		handle->h_sync = 1;
++
++out_stop:
++	/*
++	 * If this was a simple ftruncate(), and the file will remain alive
++	 * then we need to clear up the orphan record which we created above.
++	 * However, if this was a real unlink then we were called by
++	 * ext3_delete_inode(), and we allow that function to clean up the
++	 * orphan info for us.
++	 */
++	if (inode->i_nlink)
++		ext3_orphan_del(handle, inode);
++
++	up_write(&EXT3_I(inode)->truncate_sem);
++	ext3_journal_stop(handle, inode);
++}
++
++/*
++ * this routine calculates the max number of blocks we could modify
++ * in order to allocate num new blocks for an inode: the credits for one
++ * insert times num, plus a fixed quota allowance under CONFIG_QUOTA
++ */
++int ext3_ext_writepage_trans_blocks(struct inode *inode, int num)
++{
++	struct ext3_extents_tree tree;
++	int needed;
++
++	ext3_init_tree_desc(&tree, inode);
++
++	needed = ext3_ext_calc_credits_for_insert(&tree, NULL);
++
++	/* caller wants to allocate num blocks */
++	needed *= num;
++
++#ifdef CONFIG_QUOTA
++	/*
++	 * FIXME: real calculation should be here
++	 * it depends on blockmap format of quota file
++	 */
++	needed += 2 * EXT3_SINGLEDATA_TRANS_BLOCKS;
++#endif
++
++	return needed;
++}
++
++/*
++ * Set up the extent tree of an inode via ext3_extent_tree_init().
++ * The result of ext3_extent_tree_init() is not propagated.
++ */
++void ext3_extents_initialize_blockmap(handle_t *handle, struct inode *inode)
++{
++	struct ext3_extents_tree tree;
++
++	ext3_init_tree_desc(&tree, inode);
++	ext3_extent_tree_init(handle, &tree);
++}
++
++/*
++ * ext3_ext_walk_space() callback for EXT3_IOC_GET_EXTENTS.
++ * Copies every existing extent to the user buffer described by the
++ * struct ext3_extent_buf in tree->private.  buf->err counts the extents
++ * copied so far, or is set to -EFAULT if a copy fails.  The walk is
++ * stopped (EXT_BREAK) on failure or when no further extent fits.
++ */
++static int
++ext3_ext_store_extent_cb(struct ext3_extents_tree *tree,
++			struct ext3_ext_path *path,
++			struct ext3_extent *newex, int exist)
++{
++	struct ext3_extent_buf *buf = (struct ext3_extent_buf *) tree->private;
++
++	if (!exist)
++		return EXT_CONTINUE;
++	if (buf->err < 0)
++		return EXT_BREAK;
++	if (buf->cur - buf->buffer + sizeof(*newex) > buf->buflen)
++		return EXT_BREAK;
++
++	if (!copy_to_user(buf->cur, newex, sizeof(*newex))) {
++		buf->err++;
++		buf->cur += sizeof(*newex);
++	} else {
++		buf->err = -EFAULT;
++		return EXT_BREAK;
++	}
++	return EXT_CONTINUE;
++}
++
++/*
++ * ext3_ext_walk_space() callback for EXT3_IOC_GET_TREE_STATS.
++ * Counts every existing extent, and counts a leaf whenever the extent
++ * reached is the first one under its header.
++ */
++static int
++ext3_ext_collect_stats_cb(struct ext3_extents_tree *tree,
++			struct ext3_ext_path *path,
++			struct ext3_extent *ex, int exist)
++{
++	struct ext3_extent_tree_stats *buf =
++		(struct ext3_extent_tree_stats *) tree->private;
++	int depth;
++
++	if (!exist)
++		return EXT_CONTINUE;
++
++	depth = EXT_DEPTH(tree);
++	buf->extents_num++;
++	if (path[depth].p_ext == EXT_FIRST_EXTENT(path[depth].p_hdr))
++		buf->leaf_num++;
++	return EXT_CONTINUE;
++}
++
++/*
++ * ioctl entry point for extent-based inodes (-EINVAL for any other inode).
++ *
++ * EXT3_IOC_GET_EXTENTS:    arg points to a struct ext3_extent_buf; extents
++ *                          from buf.start on are copied to buf.buffer.
++ *                          Returns the number copied or -EFAULT, unless
++ *                          the walk itself returned non-zero.
++ * EXT3_IOC_GET_TREE_STATS: fills a struct ext3_extent_tree_stats at arg.
++ *                          Returns 0, the walk's result, or -EFAULT.
++ * EXT3_IOC_GET_TREE_DEPTH: returns the tree depth.
++ *
++ * Any other cmd returns 0.
++ */
++int ext3_ext_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
++			unsigned long arg)
++{
++	int err = 0;
++
++	if (!(EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL))
++		return -EINVAL;
++
++	if (cmd == EXT3_IOC_GET_EXTENTS) {
++		struct ext3_extent_buf buf;
++		struct ext3_extents_tree tree;
++
++		if (copy_from_user(&buf, (void *) arg, sizeof(buf)))
++			return -EFAULT;
++
++		ext3_init_tree_desc(&tree, inode);
++		buf.cur = buf.buffer;
++		buf.err = 0;
++		tree.private = &buf;
++		down_write(&EXT3_I(inode)->truncate_sem);
++		err = ext3_ext_walk_space(&tree, buf.start, 0xffffffff,
++						ext3_ext_store_extent_cb);
++		up_write(&EXT3_I(inode)->truncate_sem);
++		if (err == 0)
++			err = buf.err;
++	} else if (cmd == EXT3_IOC_GET_TREE_STATS) {
++		struct ext3_extent_tree_stats buf;
++		struct ext3_extents_tree tree;
++
++		ext3_init_tree_desc(&tree, inode);
++		down_write(&EXT3_I(inode)->truncate_sem);
++		buf.depth = EXT_DEPTH(&tree);
++		buf.extents_num = 0;
++		buf.leaf_num = 0;
++		tree.private = &buf;
++		err = ext3_ext_walk_space(&tree, 0, 0xffffffff,
++						ext3_ext_collect_stats_cb);
++		up_write(&EXT3_I(inode)->truncate_sem);
++		if (!err && copy_to_user((void *) arg, &buf, sizeof(buf)))
++			err = -EFAULT;	/* copy_to_user() returns bytes left */
++	} else if (cmd == EXT3_IOC_GET_TREE_DEPTH) {
++		struct ext3_extents_tree tree;
++		ext3_init_tree_desc(&tree, inode);
++		down_write(&EXT3_I(inode)->truncate_sem);
++		err = EXT_DEPTH(&tree);
++		up_write(&EXT3_I(inode)->truncate_sem);
++	}
++
++	return err;
++}
++
++EXPORT_SYMBOL(ext3_init_tree_desc);
++EXPORT_SYMBOL(ext3_mark_inode_dirty);
++EXPORT_SYMBOL(ext3_ext_invalidate_cache);
++EXPORT_SYMBOL(ext3_ext_insert_extent);
++EXPORT_SYMBOL(ext3_ext_walk_space);
++EXPORT_SYMBOL(ext3_ext_find_goal);
++EXPORT_SYMBOL(ext3_ext_calc_credits_for_insert);
++
+Index: linux-2.4.24-mb34/fs/ext3/ialloc.c
+===================================================================
+--- linux-2.4.24-mb34.orig/fs/ext3/ialloc.c 2004-05-05 13:47:40.000000000 -0700
++++ linux-2.4.24-mb34/fs/ext3/ialloc.c 2004-05-05 13:51:27.000000000 -0700 +@@ -592,10 +592,14 @@ + iloc.bh = NULL; + goto fail; + } +- err = ext3_mark_iloc_dirty(handle, inode, &iloc); +- if (err) goto fail; + ++ if (test_opt(sb, EXTENTS)) { ++ EXT3_I(inode)->i_flags |= EXT3_EXTENTS_FL; ++ ext3_extents_initialize_blockmap(handle, inode); ++ } + ++ err = ext3_mark_iloc_dirty(handle, inode, &iloc); ++ if (err) goto fail; + + unlock_super (sb); + if(DQUOT_ALLOC_INODE(inode)) { +Index: linux-2.4.24-mb34/fs/ext3/inode.c +=================================================================== +--- linux-2.4.24-mb34.orig/fs/ext3/inode.c 2004-05-05 13:47:41.000000000 -0700 ++++ linux-2.4.24-mb34/fs/ext3/inode.c 2004-05-05 13:49:40.000000000 -0700 +@@ -848,6 +848,15 @@ + goto reread; + } + ++static inline int ++ext3_get_block_wrap(handle_t *handle, struct inode *inode, long block, ++ struct buffer_head *bh, int create) ++{ ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_get_block(handle, inode, block, bh, create); ++ return ext3_get_block_handle(handle, inode, block, bh, create); ++} ++ + /* + * The BKL is not held on entry here. + */ +@@ -861,7 +870,7 @@ + handle = ext3_journal_current_handle(); + J_ASSERT(handle != 0); + } +- ret = ext3_get_block_handle(handle, inode, iblock, bh_result, create); ++ ret = ext3_get_block_wrap(handle, inode, iblock, bh_result, create); + return ret; + } + +@@ -879,7 +888,7 @@ + dummy.b_state = 0; + dummy.b_blocknr = -1000; + buffer_trace_init(&dummy.b_history); +- *errp = ext3_get_block_handle(handle, inode, block, &dummy, create); ++ *errp = ext3_get_block_wrap(handle, inode, block, &dummy, create); + if (!*errp && buffer_mapped(&dummy)) { + struct buffer_head *bh; + bh = sb_getblk(inode->i_sb, dummy.b_blocknr); +@@ -1403,7 +1412,7 @@ + * This required during truncate. We need to physically zero the tail end + * of that block so it doesn't yield old data if the file is later grown. 
+ */ +-static int ext3_block_truncate_page(handle_t *handle, ++int ext3_block_truncate_page(handle_t *handle, + struct address_space *mapping, loff_t from) + { + unsigned long index = from >> PAGE_CACHE_SHIFT; +@@ -1889,6 +1898,9 @@ + + ext3_discard_prealloc(inode); + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_truncate(inode); ++ + handle = start_transaction(inode); + if (IS_ERR(handle)) + return; /* AKPM: return what? */ +@@ -2537,6 +2549,9 @@ + int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; + int ret; + ++ if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) ++ return ext3_ext_writepage_trans_blocks(inode, bpp); ++ + if (ext3_should_journal_data(inode)) + ret = 3 * (bpp + indirects) + 2; + else +@@ -2973,7 +2988,7 @@ + + /* alloc blocks one by one */ + for (i = 0; i < nblocks; i++) { +- ret = ext3_get_block_handle(handle, inode, blocks[i], ++ ret = ext3_get_block_wrap(handle, inode, blocks[i], + &bh_tmp, 1); + if (ret) + break; +@@ -3049,7 +3064,7 @@ + if (blocks[i] != 0) + continue; + +- rc = ext3_get_block_handle(handle, inode, iblock, &bh, 1); ++ rc = ext3_get_block_wrap(handle, inode, iblock, &bh, 1); + if (rc) { + printk(KERN_INFO "ext3_map_inode_page: error %d " + "allocating block %ld\n", rc, iblock); +Index: linux-2.4.24-mb34/fs/ext3/Makefile +=================================================================== +--- linux-2.4.24-mb34.orig/fs/ext3/Makefile 2004-05-05 13:47:40.000000000 -0700 ++++ linux-2.4.24-mb34/fs/ext3/Makefile 2004-05-05 13:49:40.000000000 -0700 +@@ -13,7 +13,9 @@ + + obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o \ +- xattr_trusted.o ++ xattr_trusted.o extents.o ++export-objs += extents.o ++ + obj-m := $(O_TARGET) + + export-objs += xattr.o +Index: linux-2.4.24-mb34/fs/ext3/super.c +=================================================================== +--- linux-2.4.24-mb34.orig/fs/ext3/super.c 2004-05-05 13:47:40.000000000 -0700 
++++ linux-2.4.24-mb34/fs/ext3/super.c 2004-05-05 13:49:40.000000000 -0700 +@@ -530,6 +530,7 @@ + int i; + + J_ASSERT(sbi->s_delete_inodes == 0); ++ ext3_ext_release(sb); + ext3_xattr_put_super(sb); + journal_destroy(sbi->s_journal); + if (!(sb->s_flags & MS_RDONLY)) { +@@ -702,6 +703,10 @@ + return 0; + } + } ++ else if (!strcmp (this_char, "extents")) ++ set_opt (*mount_options, EXTENTS); ++ else if (!strcmp (this_char, "extdebug")) ++ set_opt (*mount_options, EXTDEBUG); + else if (!strcmp (this_char, "grpid") || + !strcmp (this_char, "bsdgroups")) + set_opt (*mount_options, GRPID); +@@ -1393,6 +1398,8 @@ + test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": + "writeback"); + ++ ext3_ext_init(sb); ++ + return sb; + + failed_mount3: +Index: linux-2.4.24-mb34/fs/ext3/ioctl.c +=================================================================== +--- linux-2.4.24-mb34.orig/fs/ext3/ioctl.c 2004-05-05 13:47:38.000000000 -0700 ++++ linux-2.4.24-mb34/fs/ext3/ioctl.c 2004-05-05 13:49:40.000000000 -0700 +@@ -174,6 +174,10 @@ + return ret; + } + #endif ++ case EXT3_IOC_GET_EXTENTS: ++ case EXT3_IOC_GET_TREE_STATS: ++ case EXT3_IOC_GET_TREE_DEPTH: ++ return ext3_ext_ioctl(inode, filp, cmd, arg); + default: + return -ENOTTY; + } +Index: linux-2.4.24-mb34/include/linux/ext3_fs.h +=================================================================== +--- linux-2.4.24-mb34.orig/include/linux/ext3_fs.h 2004-05-05 13:47:40.000000000 -0700 ++++ linux-2.4.24-mb34/include/linux/ext3_fs.h 2004-05-05 13:49:40.000000000 -0700 +@@ -184,6 +184,7 @@ + #define EXT3_IMAGIC_FL 0x00002000 /* AFS directory */ + #define EXT3_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ + #define EXT3_RESERVED_FL 0x80000000 /* reserved for ext3 lib */ ++#define EXT3_EXTENTS_FL 0x00080000 /* Inode uses extents */ + + #define EXT3_FL_USER_VISIBLE 0x00005FFF /* User visible flags */ + #define EXT3_FL_USER_MODIFIABLE 0x000000FF /* User modifiable flags */ +@@ -208,6 +209,9 @@ + #ifdef 
CONFIG_JBD_DEBUG + #define EXT3_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) + #endif ++#define EXT3_IOC_GET_EXTENTS _IOR('f', 5, long) ++#define EXT3_IOC_GET_TREE_DEPTH _IOR('f', 6, long) ++#define EXT3_IOC_GET_TREE_STATS _IOR('f', 7, long) + + /* + * Structure of an inode on the disk +@@ -327,6 +331,8 @@ + #define EXT3_MOUNT_IOPEN 0x8000 /* Allow access via iopen */ + #define EXT3_MOUNT_IOPEN_NOPRIV 0x10000 /* Make iopen world-readable */ + #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ ++#define EXT3_MOUNT_EXTENTS 0x100000/* Extents support */ ++#define EXT3_MOUNT_EXTDEBUG 0x200000/* Extents debug */ + + /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ + #ifndef _LINUX_EXT2_FS_H +@@ -688,6 +694,7 @@ + extern unsigned long ext3_count_free (struct buffer_head *, unsigned); + + /* inode.c */ ++extern int ext3_block_truncate_page(handle_t *, struct address_space *, loff_t); + extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int); + extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); + extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); +@@ -769,6 +776,14 @@ + extern struct inode_operations ext3_symlink_inode_operations; + extern struct inode_operations ext3_fast_symlink_inode_operations; + ++/* extents.c */ ++extern int ext3_ext_writepage_trans_blocks(struct inode *, int); ++extern int ext3_ext_get_block(handle_t *, struct inode *, long, ++ struct buffer_head *, int); ++extern void ext3_ext_truncate(struct inode *); ++extern void ext3_ext_init(struct super_block *); ++extern void ext3_ext_release(struct super_block *); ++extern void ext3_extents_initialize_blockmap(handle_t *, struct inode *); + + #endif /* __KERNEL__ */ + +Index: linux-2.4.24-mb34/include/linux/ext3_extents.h +=================================================================== +--- linux-2.4.24-mb34.orig/include/linux/ext3_extents.h 1969-12-31 16:00:00.000000000 
-0800 ++++ linux-2.4.24-mb34/include/linux/ext3_extents.h 2004-05-05 14:27:50.000000000 -0700 +@@ -0,0 +1,219 @@ ++/* ++ * Copyright (C) 2003 Alex Tomas <alex@clusterfs.com> ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public Licens ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- ++ */ ++ ++#ifndef _LINUX_EXT3_EXTENTS ++#define _LINUX_EXT3_EXTENTS ++ ++/* ++ * with AGRESSIVE_TEST defined capacity of index/leaf blocks ++ * become very little, so index split, in-depth growing and ++ * other hard changes happens much more often ++ * this is for debug purposes only ++ */ ++#define AGRESSIVE_TEST_ ++ ++/* ++ * if CHECK_BINSEARCH defined, then results of binary search ++ * will be checked by linear search ++ */ ++#define CHECK_BINSEARCH_ ++ ++/* ++ * if EXT_DEBUG is defined you can use 'extdebug' mount option ++ * to get lots of info what's going on ++ */ ++#define EXT_DEBUG ++#ifdef EXT_DEBUG ++#define ext_debug(tree,fmt,a...) \ ++do { \ ++ if (test_opt((tree)->inode->i_sb, EXTDEBUG)) \ ++ printk(fmt, ##a); \ ++} while (0); ++#else ++#define ext_debug(tree,fmt,a...) ++#endif ++ ++/* ++ * if EXT_STATS is defined then stats numbers are collected ++ * these number will be displayed at umount time ++ */ ++#define EXT_STATS_ ++ ++ ++#define EXT3_ALLOC_NEEDED 3 /* block bitmap + group desc. 
+ sb */ ++ ++/* ++ * ext3_inode has i_block array (total 60 bytes) ++ * first 4 bytes are used to store: ++ * - tree depth (0 mean there is no tree yet. all extents in the inode) ++ * - number of alive extents in the inode ++ */ ++ ++/* ++ * this is extent on-disk structure ++ * it's used at the bottom of the tree ++ */ ++struct ext3_extent { ++ __u32 e_block; /* first logical block extent covers */ ++ __u32 e_start; /* first physical block extents lives */ ++ __u32 e_num; /* number of blocks covered by extent */ ++}; ++ ++/* ++ * this is index on-disk structure ++ * it's used at all the levels, but the bottom ++ */ ++struct ext3_extent_idx { ++ __u32 e_block; /* index covers logical blocks from 'block' */ ++ __u32 e_leaf; /* pointer to the physical block of the next * ++ * level. leaf or next index could bet here */ ++}; ++ ++/* ++ * each block (leaves and indexes), even inode-stored has header ++ */ ++struct ext3_extent_header { ++ __u16 e_magic; /* probably will support different formats */ ++ __u16 e_num; /* number of valid entries */ ++ __u16 e_max; /* capacity of store in entries */ ++ __u16 e_depth; /* has tree real underlaying blocks? 
*/ ++ __u32 e_generation; /* generation of the tree */ ++}; ++ ++#define EXT3_EXT_MAGIC 0xf301 ++ ++/* ++ * array of ext3_ext_path contains path to some extent ++ * creation/lookup routines use it for traversal/splitting/etc ++ * truncate uses it to simulate recursive walking ++ */ ++struct ext3_ext_path { ++ __u32 p_block; ++ __u16 p_depth; ++ struct ext3_extent *p_ext; ++ struct ext3_extent_idx *p_idx; ++ struct ext3_extent_header *p_hdr; ++ struct buffer_head *p_bh; ++}; ++ ++/* ++ * structure for external API ++ */ ++ ++ ++/* ++ * ext3_extents_tree is used to pass initial information ++ * to top-level extents API ++ */ ++struct ext3_extents_tree { ++ struct inode *inode; /* inode which tree belongs to */ ++ void *root; /* ptr to data top of tree resides at */ ++ void *buffer; /* will be passed as arg to ^^ routines */ ++ int buffer_len; ++ void *private; ++ struct ext3_extent *cex;/* last found extent */ ++ int (*get_write_access)(handle_t *h, void *buffer); ++ int (*mark_buffer_dirty)(handle_t *h, void *buffer); ++ int (*mergable)(struct ext3_extent *ex1, struct ext3_extent *ex2); ++ int (*remove_extent_credits)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*remove_extent)(struct ext3_extents_tree *, ++ struct ext3_extent *, unsigned long, ++ unsigned long); ++ int (*new_block)(handle_t *, struct ext3_extents_tree *, ++ struct ext3_ext_path *, struct ext3_extent *, ++ int *); ++}; ++ ++/* ++ * to be called by ext3_ext_walk_space() ++ * negative retcode - error ++ * positive retcode - signal for ext3_ext_walk_space(), see below ++ * callback must return valid extent (passed or newly created) ++ */ ++typedef int (*ext_prepare_callback)(struct ext3_extents_tree *, ++ struct ext3_ext_path *, ++ struct ext3_extent *, int); ++ ++#define EXT_CONTINUE 0 ++#define EXT_BREAK 1 ++#define EXT_REPEAT 2 ++ ++ ++#define EXT_FIRST_EXTENT(__hdr__) \ ++ ((struct ext3_extent *) (((char *) (__hdr__)) + \ ++ sizeof(struct 
ext3_extent_header))) ++#define EXT_FIRST_INDEX(__hdr__) \ ++ ((struct ext3_extent_idx *) (((char *) (__hdr__)) + \ ++ sizeof(struct ext3_extent_header))) ++#define EXT_HAS_FREE_INDEX(__path__) \ ++ ((__path__)->p_hdr->e_num < (__path__)->p_hdr->e_max) ++#define EXT_LAST_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_LAST_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_num - 1) ++#define EXT_MAX_EXTENT(__hdr__) \ ++ (EXT_FIRST_EXTENT((__hdr__)) + (__hdr__)->e_max - 1) ++#define EXT_MAX_INDEX(__hdr__) \ ++ (EXT_FIRST_INDEX((__hdr__)) + (__hdr__)->e_max - 1) ++ ++#define EXT_ROOT_HDR(tree) \ ++ ((struct ext3_extent_header *) (tree)->root) ++#define EXT_BLOCK_HDR(bh) \ ++ ((struct ext3_extent_header *) (bh)->b_data) ++#define EXT_DEPTH(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->e_depth) ++#define EXT_GENERATION(_t_) \ ++ (((struct ext3_extent_header *)((_t_)->root))->e_generation) ++ ++ ++#define EXT_ASSERT(__x__) if (!(__x__)) BUG(); ++ ++ ++/* ++ * this structure is used to gather extents from the tree via ioctl ++ */ ++struct ext3_extent_buf { ++ unsigned long start; ++ int buflen; ++ void *buffer; ++ void *cur; ++ int err; ++}; ++ ++/* ++ * this structure is used to collect stats info about the tree ++ */ ++struct ext3_extent_tree_stats { ++ int depth; ++ int extents_num; ++ int leaf_num; ++}; ++ ++extern int ext3_extent_tree_init(handle_t *, struct ext3_extents_tree *); ++extern int ext3_ext_calc_credits_for_insert(struct ext3_extents_tree *, struct ext3_ext_path *); ++extern int ext3_ext_insert_extent(handle_t *, struct ext3_extents_tree *, struct ext3_ext_path *, struct ext3_extent *); ++extern int ext3_ext_walk_space(struct ext3_extents_tree *, unsigned long, unsigned long, ext_prepare_callback); ++extern int ext3_ext_remove_space(struct ext3_extents_tree *, unsigned long, unsigned long); ++extern struct ext3_ext_path * ext3_ext_find_extent(struct ext3_extents_tree *, int, struct 
ext3_ext_path *); ++int ext3_ext_find_goal(struct inode *inode, struct ext3_ext_path *path, ++ unsigned long block); ++void ext3_init_tree_desc(struct ext3_extents_tree *tree, struct inode *inode); ++void ext3_ext_invalidate_cache(struct ext3_extents_tree *tree); ++ ++#endif /* _LINUX_EXT3_EXTENTS */ +Index: linux-2.4.24-mb34/include/linux/ext3_fs_i.h +=================================================================== +--- linux-2.4.24-mb34.orig/include/linux/ext3_fs_i.h 2004-05-05 13:47:40.000000000 -0700 ++++ linux-2.4.24-mb34/include/linux/ext3_fs_i.h 2004-05-05 13:53:43.000000000 -0700 +@@ -76,6 +76,8 @@ + * by other means, so we have truncate_sem. + */ + struct rw_semaphore truncate_sem; ++ ++ __u32 i_cached_extent[3]; + }; + + #endif /* _LINUX_EXT3_FS_I */ diff --git a/lustre/kernel_patches/patches/iopen-2.4.18-2.patch b/lustre/kernel_patches/patches/iopen-2.4.18-2.patch index 821573004888c19a41a959733a1839558098380c..4af67bc394a94fa43f93377738084aa77a3f73e0 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.18-2.patch +++ b/lustre/kernel_patches/patches/iopen-2.4.18-2.patch @@ -8,9 +8,11 @@ include/linux/ext3_fs.h | 2 8 files changed, 318 insertions(+), 1 deletion(-) ---- linux-2.4.18-p4smp/Documentation/filesystems/ext2.txt~iopen-2.4.18 2003-07-09 12:17:30.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/Documentation/filesystems/ext2.txt 2003-07-09 17:13:02.000000000 -0600 -@@ -35,6 +35,22 @@ resgid=n The group ID which may use th +Index: linux-aed/Documentation/filesystems/ext2.txt +=================================================================== +--- linux-aed.orig/Documentation/filesystems/ext2.txt Tue May 4 13:14:35 2004 ++++ linux-aed/Documentation/filesystems/ext2.txt Tue May 4 19:17:12 2004 +@@ -35,6 +35,22 @@ sb=n Use alternate superblock at this location. @@ -33,19 +35,23 @@ grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. 
---- linux-2.4.18-p4smp/fs/ext3/Makefile~iopen-2.4.18 2003-07-09 17:12:12.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/fs/ext3/Makefile 2003-07-09 17:13:15.000000000 -0600 -@@ -11,7 +11,7 @@ O_TARGET := ext3.o +Index: linux-aed/fs/ext3/Makefile +=================================================================== +--- linux-aed.orig/fs/ext3/Makefile Tue May 4 19:16:51 2004 ++++ linux-aed/fs/ext3/Makefile Tue May 4 19:17:12 2004 +@@ -11,7 +11,7 @@ - export-objs := super.o inode.o xattr.o ext3-exports.o + export-objs := ext3-exports.o -obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+obj-y := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o xattr.o hash.o ext3-exports.o obj-m := $(O_TARGET) ---- linux-2.4.18-p4smp/fs/ext3/inode.c~iopen-2.4.18 2003-07-09 17:11:19.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/fs/ext3/inode.c 2003-07-09 17:13:02.000000000 -0600 +Index: linux-aed/fs/ext3/inode.c +=================================================================== +--- linux-aed.orig/fs/ext3/inode.c Tue May 4 19:17:09 2004 ++++ linux-aed/fs/ext3/inode.c Tue May 4 19:17:12 2004 @@ -31,6 +31,7 @@ #include <linux/highuid.h> #include <linux/quotaops.h> @@ -54,7 +60,7 @@ /* * SEARCH_FROM_ZERO forces each block allocation to search from the start -@@ -2165,6 +2166,9 @@ void ext3_read_inode(struct inode * inod +@@ -2277,6 +2278,9 @@ struct buffer_head *bh; int block; @@ -64,9 +70,11 @@ if(ext3_get_inode_loc(inode, &iloc)) goto bad_inode; bh = iloc.bh; ---- /dev/null 2003-01-30 03:24:37.000000000 -0700 -+++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.c 2003-07-09 17:13:02.000000000 -0600 -@@ -0,0 +1,258 @@ +Index: linux-aed/fs/ext3/iopen.c +=================================================================== +--- linux-aed.orig/fs/ext3/iopen.c Tue May 4 13:14:35 2004 ++++ linux-aed/fs/ext3/iopen.c Tue May 4 19:17:12 2004 +@@ -0,0 +1,282 @@ 
+/* + * linux/fs/ext3/iopen.c + * @@ -203,13 +211,24 @@ + +/* This function is spliced into ext3_lookup and does the move of a + * disconnected dentry (if it exists) to a connected dentry. -+ * Caller must hold dcache_lock. + */ -+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) +{ + struct dentry *tmp, *goal = NULL; + struct list_head *lp; + ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ + /* preferrably return a connected dentry */ + list_for_each(lp, &inode->i_dentry) { + tmp = list_entry(lp, struct dentry, d_alias); @@ -223,27 +242,40 @@ + } + + if (!goal) -+ return NULL; ++ goto do_instantiate; + + /* Move the goal to the de hash queue - like d_move() */ + goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; + list_del_init(&goal->d_hash); + + list_del(&goal->d_child); -+ list_del(&de->d_child); ++ list_del(&dentry->d_child); + + /* Switch the parents and the names.. 
*/ -+ switch_names(goal, de); -+ do_switch(goal->d_parent, de->d_parent); -+ do_switch(goal->d_name.len, de->d_name.len); -+ do_switch(goal->d_name.hash, de->d_name.hash); ++ switch_names(goal, dentry); ++ do_switch(goal->d_parent, dentry->d_parent); ++ do_switch(goal->d_name.len, dentry->d_name.len); ++ do_switch(goal->d_name.hash, dentry->d_name.hash); + + /* And add them back to the (new) parent lists */ + list_add(&goal->d_child, &goal->d_parent->d_subdirs); -+ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); + __d_rehash(goal, 0); ++ spin_unlock(&dcache_lock); ++ iput(inode); + + return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; +} + +/* @@ -325,8 +357,10 @@ + + return 1; +} ---- /dev/null 2003-01-30 03:24:37.000000000 -0700 -+++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.h 2003-07-09 17:13:02.000000000 -0600 +Index: linux-aed/fs/ext3/iopen.h +=================================================================== +--- linux-aed.orig/fs/ext3/iopen.h Tue May 4 13:14:35 2004 ++++ linux-aed/fs/ext3/iopen.h Tue May 4 19:17:12 2004 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -341,10 +375,12 @@ + +extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); +extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *de, -+ struct inode *inode); ---- linux-2.4.18-p4smp/fs/ext3/namei.c~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/fs/ext3/namei.c 2003-07-09 17:13:02.000000000 -0600 ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-aed/fs/ext3/namei.c 
+=================================================================== +--- linux-aed.orig/fs/ext3/namei.c Tue May 4 19:17:05 2004 ++++ linux-aed/fs/ext3/namei.c Tue May 4 19:17:12 2004 @@ -34,6 +34,7 @@ #include <linux/locks.h> #include <linux/quotaops.h> @@ -353,12 +389,7 @@ /* * define how far ahead to read directories while searching them. -@@ -709,10 +710,14 @@ cleanup_and_exit: - struct inode * inode; - struct ext3_dir_entry_2 * de; - struct buffer_head * bh; -+ struct dentry *alternate = NULL; - +@@ -713,6 +714,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -368,39 +399,67 @@ bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -723,7 +729,28 @@ static struct dentry *ext3_lookup(struct +@@ -723,8 +727,8 @@ if (!inode) return ERR_PTR(-EACCES); } - d_add(dentry, inode); +- return NULL; + -+ /* verify this dentry is really new */ -+ assert(!dentry->d_inode); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ return alternate; ++ return iopen_connect_dentry(dentry, inode, 1); + } + + #define S_SHIFT 12 +@@ -1588,10 +1592,6 @@ + inode->i_nlink); + inode->i_version = ++event; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; +@@ -1711,6 +1711,23 @@ + goto out_stop; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ (void)iopen_connect_dentry(dentry, inode, 0); ++ return 0; ++ } + } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} + -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+ if (inode) /* d_instantiate */ -+ list_add(&dentry->d_alias, &inode->i_dentry); -+ dentry->d_inode = inode; -+ -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ - return NULL; - } + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -1736,7 +1753,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); ---- linux-2.4.18-p4smp/fs/ext3/super.c~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/fs/ext3/super.c 2003-07-09 17:13:02.000000000 -0600 -@@ -831,6 +831,18 @@ static int parse_options (char * options +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle, inode); + ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle, dir); + return err; +Index: linux-aed/fs/ext3/super.c +=================================================================== +--- linux-aed.orig/fs/ext3/super.c Tue May 4 19:17:01 2004 ++++ linux-aed/fs/ext3/super.c Tue May 4 19:17:12 2004 +@@ -834,6 +834,18 @@ || !strcmp (this_char, "quota") || !strcmp (this_char, "usrquota")) /* Don't do anything ;-) */ ; @@ -419,9 +478,11 @@ else if (!strcmp (this_char, "journal")) { /* @@@ FIXME */ /* Eventually we will want to be able to create ---- 
linux-2.4.18-p4smp/include/linux/ext3_fs.h~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/include/linux/ext3_fs.h 2003-07-09 17:13:02.000000000 -0600 -@@ -321,6 +321,8 @@ struct ext3_inode { +Index: linux-aed/include/linux/ext3_fs.h +=================================================================== +--- linux-aed.orig/include/linux/ext3_fs.h Tue May 4 19:17:08 2004 ++++ linux-aed/include/linux/ext3_fs.h Tue May 4 19:17:12 2004 +@@ -321,6 +321,8 @@ #define EXT3_MOUNT_WRITEBACK_DATA 0x0C00 /* No data ordering */ #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ @@ -430,5 +491,3 @@ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - -_ diff --git a/lustre/kernel_patches/patches/iopen-2.4.18.patch b/lustre/kernel_patches/patches/iopen-2.4.18.patch index 202ebc6ef134f4e5ce7e603758118ea587a34283..7c56b03988cf4f21ded18835b9b5590aa23fad6f 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.18.patch +++ b/lustre/kernel_patches/patches/iopen-2.4.18.patch @@ -8,9 +8,11 @@ include/linux/ext3_fs.h | 2 8 files changed, 318 insertions(+), 1 deletion(-) ---- linux-2.4.18-p4smp/Documentation/filesystems/ext2.txt~iopen-2.4.18 2003-07-09 12:17:30.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/Documentation/filesystems/ext2.txt 2003-07-09 17:13:02.000000000 -0600 -@@ -35,6 +35,22 @@ resgid=n The group ID which may use th +Index: linux-aed/Documentation/filesystems/ext2.txt +=================================================================== +--- linux-aed.orig/Documentation/filesystems/ext2.txt Tue May 4 13:14:35 2004 ++++ linux-aed/Documentation/filesystems/ext2.txt Tue May 4 19:17:12 2004 +@@ -35,6 +35,22 @@ sb=n Use alternate superblock at this location. @@ -33,19 +35,23 @@ grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. 
---- linux-2.4.18-p4smp/fs/ext3/Makefile~iopen-2.4.18 2003-07-09 17:12:12.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/fs/ext3/Makefile 2003-07-09 17:13:15.000000000 -0600 -@@ -11,7 +11,7 @@ O_TARGET := ext3.o +Index: linux-aed/fs/ext3/Makefile +=================================================================== +--- linux-aed.orig/fs/ext3/Makefile Tue May 4 19:16:51 2004 ++++ linux-aed/fs/ext3/Makefile Tue May 4 19:17:12 2004 +@@ -11,7 +11,7 @@ - export-objs := super.o inode.o xattr.o ext3-exports.o + export-objs := ext3-exports.o -obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -+obj-y := balloc.o iopen.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++obj-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ ioctl.o namei.o super.o symlink.o xattr.o ext3-exports.o obj-m := $(O_TARGET) ---- linux-2.4.18-p4smp/fs/ext3/inode.c~iopen-2.4.18 2003-07-09 17:11:19.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/fs/ext3/inode.c 2003-07-09 17:13:02.000000000 -0600 +Index: linux-aed/fs/ext3/inode.c +=================================================================== +--- linux-aed.orig/fs/ext3/inode.c Tue May 4 19:17:09 2004 ++++ linux-aed/fs/ext3/inode.c Tue May 4 19:17:12 2004 @@ -31,6 +31,7 @@ #include <linux/highuid.h> #include <linux/quotaops.h> @@ -54,7 +60,7 @@ /* * SEARCH_FROM_ZERO forces each block allocation to search from the start -@@ -2165,6 +2166,9 @@ void ext3_read_inode(struct inode * inod +@@ -2277,6 +2278,9 @@ struct buffer_head *bh; int block; @@ -64,9 +70,11 @@ if(ext3_get_inode_loc(inode, &iloc)) goto bad_inode; bh = iloc.bh; ---- /dev/null 2003-01-30 03:24:37.000000000 -0700 -+++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.c 2003-07-09 17:13:02.000000000 -0600 -@@ -0,0 +1,258 @@ +Index: linux-aed/fs/ext3/iopen.c +=================================================================== +--- linux-aed.orig/fs/ext3/iopen.c Tue May 4 13:14:35 2004 ++++ linux-aed/fs/ext3/iopen.c Tue May 4 19:17:12 2004 +@@ -0,0 +1,282 @@ +/* + * 
linux/fs/ext3/iopen.c + * @@ -203,13 +211,24 @@ + +/* This function is spliced into ext3_lookup and does the move of a + * disconnected dentry (if it exists) to a connected dentry. -+ * Caller must hold dcache_lock. + */ -+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) +{ + struct dentry *tmp, *goal = NULL; + struct list_head *lp; + ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ + /* preferrably return a connected dentry */ + list_for_each(lp, &inode->i_dentry) { + tmp = list_entry(lp, struct dentry, d_alias); @@ -223,27 +242,40 @@ + } + + if (!goal) -+ return NULL; ++ goto do_instantiate; + + /* Move the goal to the de hash queue - like d_move() */ + goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; + list_del_init(&goal->d_hash); + + list_del(&goal->d_child); -+ list_del(&de->d_child); ++ list_del(&dentry->d_child); + + /* Switch the parents and the names.. 
*/ -+ switch_names(goal, de); -+ do_switch(goal->d_parent, de->d_parent); -+ do_switch(goal->d_name.len, de->d_name.len); -+ do_switch(goal->d_name.hash, de->d_name.hash); ++ switch_names(goal, dentry); ++ do_switch(goal->d_parent, dentry->d_parent); ++ do_switch(goal->d_name.len, dentry->d_name.len); ++ do_switch(goal->d_name.hash, dentry->d_name.hash); + + /* And add them back to the (new) parent lists */ + list_add(&goal->d_child, &goal->d_parent->d_subdirs); -+ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); + __d_rehash(goal, 0); ++ spin_unlock(&dcache_lock); ++ iput(inode); + + return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; +} + +/* @@ -325,8 +357,10 @@ + + return 1; +} ---- /dev/null 2003-01-30 03:24:37.000000000 -0700 -+++ linux-2.4.18-p4smp-braam/fs/ext3/iopen.h 2003-07-09 17:13:02.000000000 -0600 +Index: linux-aed/fs/ext3/iopen.h +=================================================================== +--- linux-aed.orig/fs/ext3/iopen.h Tue May 4 13:14:35 2004 ++++ linux-aed/fs/ext3/iopen.h Tue May 4 19:17:12 2004 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -341,10 +375,12 @@ + +extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); +extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *de, -+ struct inode *inode); ---- linux-2.4.18-p4smp/fs/ext3/namei.c~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/fs/ext3/namei.c 2003-07-09 17:13:02.000000000 -0600 ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-aed/fs/ext3/namei.c 
+=================================================================== +--- linux-aed.orig/fs/ext3/namei.c Tue May 4 19:17:05 2004 ++++ linux-aed/fs/ext3/namei.c Tue May 4 19:17:12 2004 @@ -34,6 +34,7 @@ #include <linux/locks.h> #include <linux/quotaops.h> @@ -353,12 +389,7 @@ /* * define how far ahead to read directories while searching them. -@@ -703,10 +704,14 @@ cleanup_and_exit: - struct inode * inode; - struct ext3_dir_entry_2 * de; - struct buffer_head * bh; -+ struct dentry *alternate = NULL; - +@@ -713,6 +714,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -368,39 +399,67 @@ bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -723,7 +729,28 @@ static struct dentry *ext3_lookup(struct +@@ -723,8 +727,8 @@ if (!inode) return ERR_PTR(-EACCES); } - d_add(dentry, inode); +- return NULL; + -+ /* verify this dentry is really new */ -+ assert(!dentry->d_inode); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ return alternate; ++ return iopen_connect_dentry(dentry, inode, 1); + } + + #define S_SHIFT 12 +@@ -1588,10 +1592,6 @@ + inode->i_nlink); + inode->i_version = ++event; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + ext3_mark_inode_dirty(handle, inode); + dir->i_nlink--; +@@ -1711,6 +1711,23 @@ + goto out_stop; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ (void)iopen_connect_dentry(dentry, inode, 0); ++ return 0; ++ } + } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} + -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+ if (inode) /* d_instantiate */ -+ list_add(&dentry->d_alias, &inode->i_dentry); -+ dentry->d_inode = inode; -+ -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ - return NULL; - } + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -1736,7 +1753,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); ---- linux-2.4.18-p4smp/fs/ext3/super.c~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/fs/ext3/super.c 2003-07-09 17:13:02.000000000 -0600 -@@ -831,6 +831,18 @@ static int parse_options (char * options +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle, inode); + ext3_mark_inode_dirty(handle, inode); + ext3_journal_stop(handle, dir); + return err; +Index: linux-aed/fs/ext3/super.c +=================================================================== +--- linux-aed.orig/fs/ext3/super.c Tue May 4 19:17:01 2004 ++++ linux-aed/fs/ext3/super.c Tue May 4 19:17:12 2004 +@@ -834,6 +834,18 @@ || !strcmp (this_char, "quota") || !strcmp (this_char, "usrquota")) /* Don't do anything ;-) */ ; @@ -419,9 +478,11 @@ else if (!strcmp (this_char, "journal")) { /* @@@ FIXME */ /* Eventually we will want to be able to create ---- 
linux-2.4.18-p4smp/include/linux/ext3_fs.h~iopen-2.4.18 2003-07-09 13:32:38.000000000 -0600 -+++ linux-2.4.18-p4smp-braam/include/linux/ext3_fs.h 2003-07-09 17:13:02.000000000 -0600 -@@ -321,6 +321,8 @@ struct ext3_inode { +Index: linux-aed/include/linux/ext3_fs.h +=================================================================== +--- linux-aed.orig/include/linux/ext3_fs.h Tue May 4 19:17:08 2004 ++++ linux-aed/include/linux/ext3_fs.h Tue May 4 19:17:12 2004 +@@ -321,6 +321,8 @@ #define EXT3_MOUNT_UPDATE_JOURNAL 0x1000 /* Update the journal format */ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ #define EXT3_MOUNT_INDEX 0x4000 /* Enable directory index */ @@ -430,5 +491,3 @@ #define EXT3_MOUNT_ASYNCDEL 0x20000 /* Delayed deletion */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ - -_ diff --git a/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch index 9258544a56b2513c4c66cfe3400ee2b7d1e74692..07e49b863e1c93065a18177da539ff31a423d14a 100644 --- a/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch +++ b/lustre/kernel_patches/patches/iopen-2.4.21-sles8sp3.patch @@ -8,10 +8,10 @@ include/linux/ext3_fs.h | 2 8 files changed, 318 insertions(+), 2 deletions(-) -Index: linux-2.4.21/Documentation/filesystems/ext2.txt +Index: kernel-2.4.212l35/Documentation/filesystems/ext2.txt =================================================================== ---- linux-2.4.21.orig/Documentation/filesystems/ext2.txt 2001-07-11 18:44:45.000000000 -0400 -+++ linux-2.4.21/Documentation/filesystems/ext2.txt 2004-04-24 02:46:32.000000000 -0400 +--- kernel-2.4.212l35.orig/Documentation/filesystems/ext2.txt 2001-07-11 15:44:45.000000000 -0700 ++++ kernel-2.4.212l35/Documentation/filesystems/ext2.txt 2004-05-06 19:48:32.000000000 -0700 @@ -35,6 +35,22 @@ sb=n Use alternate superblock at this location. 
@@ -35,10 +35,10 @@ Index: linux-2.4.21/Documentation/filesystems/ext2.txt grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. -Index: linux-2.4.21/fs/ext3/Makefile +Index: kernel-2.4.212l35/fs/ext3/Makefile =================================================================== ---- linux-2.4.21.orig/fs/ext3/Makefile 2004-04-24 02:46:18.000000000 -0400 -+++ linux-2.4.21/fs/ext3/Makefile 2004-04-24 02:47:02.000000000 -0400 +--- kernel-2.4.212l35.orig/fs/ext3/Makefile 2004-05-06 19:46:22.000000000 -0700 ++++ kernel-2.4.212l35/fs/ext3/Makefile 2004-05-06 19:48:32.000000000 -0700 @@ -11,7 +11,7 @@ export-objs := ext3-exports.o @@ -48,10 +48,10 @@ Index: linux-2.4.21/fs/ext3/Makefile ioctl.o namei.o super.o symlink.o hash.o ext3-exports.o obj-m := $(O_TARGET) -Index: linux-2.4.21/fs/ext3/inode.c +Index: kernel-2.4.212l35/fs/ext3/inode.c =================================================================== ---- linux-2.4.21.orig/fs/ext3/inode.c 2004-04-24 02:46:19.000000000 -0400 -+++ linux-2.4.21/fs/ext3/inode.c 2004-04-24 02:46:32.000000000 -0400 +--- kernel-2.4.212l35.orig/fs/ext3/inode.c 2004-05-06 19:46:24.000000000 -0700 ++++ kernel-2.4.212l35/fs/ext3/inode.c 2004-05-06 19:48:32.000000000 -0700 @@ -34,6 +34,7 @@ #include <linux/highuid.h> #include <linux/quotaops.h> @@ -70,11 +70,11 @@ Index: linux-2.4.21/fs/ext3/inode.c if(ext3_get_inode_loc(inode, &iloc)) goto bad_inode; bh = iloc.bh; -Index: linux-2.4.21/fs/ext3/iopen.c +Index: kernel-2.4.212l35/fs/ext3/iopen.c =================================================================== ---- linux-2.4.21.orig/fs/ext3/iopen.c 2003-01-30 05:24:37.000000000 -0500 -+++ linux-2.4.21/fs/ext3/iopen.c 2004-04-24 02:46:32.000000000 -0400 -@@ -0,0 +1,258 @@ +--- kernel-2.4.212l35.orig/fs/ext3/iopen.c 2003-03-27 11:16:05.000000000 -0800 ++++ kernel-2.4.212l35/fs/ext3/iopen.c 2004-05-06 19:48:41.000000000 -0700 +@@ -0,0 +1,282 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -211,13 +211,24 @@ Index: 
linux-2.4.21/fs/ext3/iopen.c + +/* This function is spliced into ext3_lookup and does the move of a + * disconnected dentry (if it exists) to a connected dentry. -+ * Caller must hold dcache_lock. + */ -+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) +{ + struct dentry *tmp, *goal = NULL; + struct list_head *lp; + ++ /* verify this dentry is really new */ ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(list_empty(&dentry->d_hash)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); ++ ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; ++ + /* preferrably return a connected dentry */ + list_for_each(lp, &inode->i_dentry) { + tmp = list_entry(lp, struct dentry, d_alias); @@ -231,27 +242,40 @@ Index: linux-2.4.21/fs/ext3/iopen.c + } + + if (!goal) -+ return NULL; ++ goto do_instantiate; + + /* Move the goal to the de hash queue - like d_move() */ + goal->d_flags &= ~DCACHE_NFSD_DISCONNECTED; + list_del_init(&goal->d_hash); + + list_del(&goal->d_child); -+ list_del(&de->d_child); ++ list_del(&dentry->d_child); + + /* Switch the parents and the names.. 
*/ -+ switch_names(goal, de); -+ do_switch(goal->d_parent, de->d_parent); -+ do_switch(goal->d_name.len, de->d_name.len); -+ do_switch(goal->d_name.hash, de->d_name.hash); ++ switch_names(goal, dentry); ++ do_switch(goal->d_parent, dentry->d_parent); ++ do_switch(goal->d_name.len, dentry->d_name.len); ++ do_switch(goal->d_name.hash, dentry->d_name.hash); + + /* And add them back to the (new) parent lists */ + list_add(&goal->d_child, &goal->d_parent->d_subdirs); -+ list_add(&de->d_child, &de->d_parent->d_subdirs); ++ list_add(&dentry->d_child, &dentry->d_parent->d_subdirs); + __d_rehash(goal, 0); ++ spin_unlock(&dcache_lock); ++ iput(inode); + + return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; +} + +/* @@ -333,10 +357,10 @@ Index: linux-2.4.21/fs/ext3/iopen.c + + return 1; +} -Index: linux-2.4.21/fs/ext3/iopen.h +Index: kernel-2.4.212l35/fs/ext3/iopen.h =================================================================== ---- linux-2.4.21.orig/fs/ext3/iopen.h 2003-01-30 05:24:37.000000000 -0500 -+++ linux-2.4.21/fs/ext3/iopen.h 2004-04-24 02:46:32.000000000 -0400 +--- kernel-2.4.212l35.orig/fs/ext3/iopen.h 2003-03-27 11:16:05.000000000 -0800 ++++ kernel-2.4.212l35/fs/ext3/iopen.h 2004-05-06 19:48:41.000000000 -0700 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -351,12 +375,12 @@ Index: linux-2.4.21/fs/ext3/iopen.h + +extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); +extern int ext3_iopen_get_inode(struct inode *inode); -+extern struct dentry *iopen_connect_dentry(struct dentry *de, -+ struct inode *inode); -Index: linux-2.4.21/fs/ext3/namei.c ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: 
kernel-2.4.212l35/fs/ext3/namei.c =================================================================== ---- linux-2.4.21.orig/fs/ext3/namei.c 2004-04-24 02:46:19.000000000 -0400 -+++ linux-2.4.21/fs/ext3/namei.c 2004-04-24 02:46:32.000000000 -0400 +--- kernel-2.4.212l35.orig/fs/ext3/namei.c 2004-05-06 19:46:23.000000000 -0700 ++++ kernel-2.4.212l35/fs/ext3/namei.c 2004-05-06 19:51:48.000000000 -0700 @@ -36,7 +36,7 @@ #include <linux/string.h> #include <linux/locks.h> @@ -366,12 +390,7 @@ Index: linux-2.4.21/fs/ext3/namei.c /* * define how far ahead to read directories while searching them. -@@ -928,10 +928,14 @@ - struct inode * inode; - struct ext3_dir_entry_2 * de; - struct buffer_head * bh; -+ struct dentry *alternate = NULL; - +@@ -932,6 +932,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -381,40 +400,66 @@ Index: linux-2.4.21/fs/ext3/namei.c bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -943,7 +947,28 @@ +@@ -943,8 +946,8 @@ return ERR_PTR(-EACCES); } } - d_add(dentry, inode); +- return NULL; + -+ /* verify this dentry is really new */ -+ assert(!dentry->d_inode); -+ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ -+ assert(list_empty(&dentry->d_hash)); /* d_rehash */ -+ assert(list_empty(&dentry->d_subdirs)); -+ -+ spin_lock(&dcache_lock); -+ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { -+ spin_unlock(&dcache_lock); -+ iput(inode); -+ return alternate; ++ return iopen_connect_dentry(dentry, inode, 1); + } + + #define S_SHIFT 12 +@@ -1936,10 +1940,6 @@ + inode->i_nlink); + inode->i_version = ++event; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. 
*/ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + dir->i_nlink--; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; +@@ -2058,6 +2058,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ (void)iopen_connect_dentry(dentry, inode, 0); ++ return 0; ++ } + } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} + -+ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ -+ if (inode) /* d_instantiate */ -+ list_add(&dentry->d_alias, &inode->i_dentry); -+ dentry->d_inode = inode; -+ -+ __d_rehash(dentry, 0); /* d_rehash */ -+ spin_unlock(&dcache_lock); -+ - return NULL; - } + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2085,7 +2102,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); -Index: linux-2.4.21/fs/ext3/super.c +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle, inode); + ext3_journal_stop(handle, dir); + return err; + } +Index: kernel-2.4.212l35/fs/ext3/super.c =================================================================== ---- linux-2.4.21.orig/fs/ext3/super.c 2004-04-24 02:46:19.000000000 -0400 -+++ linux-2.4.21/fs/ext3/super.c 2004-04-24 02:46:32.000000000 -0400 +--- kernel-2.4.212l35.orig/fs/ext3/super.c 2004-05-06 19:46:23.000000000 -0700 ++++ kernel-2.4.212l35/fs/ext3/super.c 2004-05-06 19:48:32.000000000 -0700 @@ -869,6 +869,18 @@ || !strcmp (this_char, "quota") || !strcmp (this_char, "usrquota")) @@ -434,10 +479,10 @@ Index: linux-2.4.21/fs/ext3/super.c else if (!strcmp (this_char, "journal")) { /* @@@ FIXME */ /* Eventually we will want to be able to create -Index: 
linux-2.4.21/include/linux/ext3_fs.h +Index: kernel-2.4.212l35/include/linux/ext3_fs.h =================================================================== ---- linux-2.4.21.orig/include/linux/ext3_fs.h 2004-04-24 02:46:19.000000000 -0400 -+++ linux-2.4.21/include/linux/ext3_fs.h 2004-04-24 02:46:32.000000000 -0400 +--- kernel-2.4.212l35.orig/include/linux/ext3_fs.h 2004-05-06 19:46:24.000000000 -0700 ++++ kernel-2.4.212l35/include/linux/ext3_fs.h 2004-05-06 19:48:32.000000000 -0700 @@ -324,6 +324,8 @@ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ diff --git a/lustre/kernel_patches/patches/iopen-2.6-suse.patch b/lustre/kernel_patches/patches/iopen-2.6-suse.patch index ef5a25365986d54412226279bdc25f2c8905e0cb..2133355ad47ea243278091b9b0e301d33d826c2f 100644 --- a/lustre/kernel_patches/patches/iopen-2.6-suse.patch +++ b/lustre/kernel_patches/patches/iopen-2.6-suse.patch @@ -1,4 +1,3 @@ - Documentation/filesystems/ext2.txt | 16 ++ fs/ext3/inode.c | 3 fs/ext3/iopen.c | 239 +++++++++++++++++++++++++++++++++++++ fs/ext3/iopen.h | 15 ++ @@ -7,10 +6,23 @@ include/linux/ext3_fs.h | 2 7 files changed, 304 insertions(+), 1 deletion(-) -Index: linux-2.6.4-51.1/fs/ext3/inode.c +Index: linux-stage/fs/ext3/Makefile =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/inode.c 2004-04-06 00:31:14.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/inode.c 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/fs/ext3/Makefile 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/fs/ext3/Makefile 2004-05-07 16:00:17.000000000 -0400 +@@ -4,7 +4,7 @@ + + obj-$(CONFIG_EXT3_FS) += ext3.o + +-ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ++ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o iopen.o \ + ioctl.o namei.o super.o symlink.o hash.o + + ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o 
+Index: linux-stage/fs/ext3/inode.c +=================================================================== +--- linux-stage.orig/fs/ext3/inode.c 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/fs/ext3/inode.c 2004-05-07 17:21:59.000000000 -0400 @@ -37,6 +37,7 @@ #include <linux/mpage.h> #include <linux/uio.h> @@ -19,22 +31,21 @@ Index: linux-2.6.4-51.1/fs/ext3/inode.c #include "acl.h" /* -@@ -2472,6 +2473,8 @@ +@@ -2472,6 +2473,9 @@ ei->i_acl = EXT3_ACL_NOT_CACHED; ei->i_default_acl = EXT3_ACL_NOT_CACHED; #endif + if (ext3_iopen_get_inode(inode)) + return; ++ if (ext3_get_inode_loc(inode, &iloc, 0)) goto bad_inode; bh = iloc.bh; -Index: linux-2.6.4-51.1/fs/ext3/iopen.c +Index: linux-stage/fs/ext3/iopen.c =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/iopen.c 2004-04-06 00:31:24.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/iopen.c 2004-04-06 00:31:24.000000000 -0400 -@@ -0,0 +1,223 @@ -+ -+ +--- linux-stage.orig/fs/ext3/iopen.c 2004-05-07 16:00:17.000000000 -0400 ++++ linux-stage/fs/ext3/iopen.c 2004-05-07 17:22:37.000000000 -0400 +@@ -0,0 +1,272 @@ +/* + * linux/fs/ext3/iopen.c + * @@ -44,6 +55,25 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + * + * This file may be redistributed under the terms of the GNU General + * Public License. ++ * ++ * ++ * Invariants: ++ * - there is only ever a single DCACHE_NFSD_DISCONNECTED dentry alias ++ * for an inode at one time. ++ * - there are never both connected and DCACHE_NFSD_DISCONNECTED dentry ++ * aliases on an inode at the same time. ++ * ++ * If we have any connected dentry aliases for an inode, use one of those ++ * in iopen_lookup(). Otherwise, we instantiate a single NFSD_DISCONNECTED ++ * dentry for this inode, which thereafter will be found by the dcache ++ * when looking up this inode number in __iopen__, so we don't return here ++ * until it is gone. 
++ * ++ * If we get an inode via a regular name lookup, then we "rename" the ++ * NFSD_DISCONNECTED dentry to the proper name and parent. This ensures ++ * existing users of the disconnected dentry will continue to use the same ++ * dentry as the connected users, and there will never be both kinds of ++ * dentry aliases at one time. + */ + +#include <linux/sched.h> @@ -52,6 +82,8 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c +#include <linux/jbd.h> +#include <linux/ext3_fs.h> +#include <linux/smp_lock.h> ++#include <linux/dcache.h> ++#include <linux/security.h> +#include "iopen.h" + +#ifndef assert @@ -63,14 +95,15 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c +/* + * This implements looking up an inode by number. + */ -+static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) ++static struct dentry *iopen_lookup(struct inode * dir, struct dentry *dentry, ++ struct nameidata *nd) +{ -+ struct inode * inode; ++ struct inode *inode; + unsigned long ino; + struct list_head *lp; + struct dentry *alternate; + char buf[IOPEN_NAME_LEN]; -+ ++ + if (dentry->d_name.len >= IOPEN_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + @@ -99,6 +132,9 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + return ERR_PTR(-ENOENT); + } + ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ + /* preferrably return a connected dentry */ + spin_lock(&dcache_lock); + list_for_each(lp, &inode->i_dentry) { @@ -116,9 +152,14 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + return alternate; + } + dentry->d_flags |= DCACHE_DISCONNECTED; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++ list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++ ++ __d_rehash(dentry, 0); /* d_rehash */ + spin_unlock(&dcache_lock); + -+ d_add(dentry, inode); + return NULL; +} + @@ -126,7 +167,7 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + __typeof__ (x) __tmp = x; \ + x 
= y; y = __tmp; } while (0) + -+static inline void switch_names(struct dentry * dentry, struct dentry * target) ++static inline void switch_names(struct dentry *dentry, struct dentry *target) +{ + const unsigned char *old_name, *new_name; + @@ -141,20 +182,27 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + dentry->d_name.name = old_name; +} + -+ -+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode) ++/* This function is spliced into ext3_lookup and does the move of a ++ * disconnected dentry (if it exists) to a connected dentry. ++ */ ++struct dentry *iopen_connect_dentry(struct dentry *dentry, struct inode *inode, ++ int rehash) +{ + struct dentry *tmp, *goal = NULL; + struct list_head *lp; + -+ /* preferrably return a connected dentry */ -+ spin_lock(&dcache_lock); + /* verify this dentry is really new */ -+ assert(!de->d_inode); -+ assert(list_empty(&de->d_subdirs)); -+ assert(list_empty(&de->d_alias)); ++ assert(dentry->d_inode == NULL); ++ assert(list_empty(&dentry->d_alias)); /* d_instantiate */ ++ if (rehash) ++ assert(d_unhashed(dentry)); /* d_rehash */ ++ assert(list_empty(&dentry->d_subdirs)); + ++ spin_lock(&dcache_lock); ++ if (!inode) ++ goto do_rehash; + ++ /* preferrably return a connected dentry */ + list_for_each(lp, &inode->i_dentry) { + tmp = list_entry(lp, struct dentry, d_alias); + if (tmp->d_flags & DCACHE_DISCONNECTED) { @@ -165,16 +213,30 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + break; + } + } -+ spin_unlock(&dcache_lock); + + if (!goal) -+ return NULL; ++ goto do_instantiate; + -+ goal->d_flags &= ~DCACHE_DISCONNECTED; -+ d_rehash(de); -+ d_move(goal, de); ++ /* Move the goal to the de hash queue */ ++ goal->d_flags &= ~ DCACHE_DISCONNECTED; ++ security_d_instantiate(goal, inode); ++ __d_rehash(dentry, 0); ++ __d_move(goal, dentry); ++ spin_unlock(&dcache_lock); ++ iput(inode); + + return goal; ++ ++ /* d_add(), but don't drop dcache_lock before adding dentry to inode */ ++do_instantiate: ++ 
list_add(&dentry->d_alias, &inode->i_dentry); /* d_instantiate */ ++ dentry->d_inode = inode; ++do_rehash: ++ if (rehash) ++ __d_rehash(dentry, 0); /* d_rehash */ ++ spin_unlock(&dcache_lock); ++ ++ return NULL; +} + +/* @@ -205,9 +267,9 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + * This function is spliced into ext3_lookup and returns 1 the file + * name is __iopen__ and dentry has been filled in appropriately. + */ -+int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry) ++int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry) +{ -+ struct inode * inode; ++ struct inode *inode; + + if (dir->i_ino != EXT3_ROOT_INO || + !test_opt(dir->i_sb, IOPEN) || @@ -227,7 +289,7 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + * number is the one for /__iopen__, in which case the inode is filled + * in appropriately. Otherwise, this fuction returns 0. + */ -+int ext3_iopen_get_inode(struct inode * inode) ++int ext3_iopen_get_inode(struct inode *inode) +{ + if (inode->i_ino != EXT3_BAD_INO) + return 0; @@ -256,10 +318,10 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.c + + return 1; +} -Index: linux-2.6.4-51.1/fs/ext3/iopen.h +Index: linux-stage/fs/ext3/iopen.h =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/iopen.h 2004-04-06 00:31:24.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/iopen.h 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/fs/ext3/iopen.h 2004-05-07 16:00:17.000000000 -0400 ++++ linux-stage/fs/ext3/iopen.h 2004-05-07 16:00:17.000000000 -0400 @@ -0,0 +1,15 @@ +/* + * iopen.h @@ -272,14 +334,14 @@ Index: linux-2.6.4-51.1/fs/ext3/iopen.h + * Public License. 
+ */ + -+extern int ext3_check_for_iopen(struct inode * dir, struct dentry *dentry); -+extern int ext3_iopen_get_inode(struct inode * inode); -+ -+ -Index: linux-2.6.4-51.1/fs/ext3/namei.c ++extern int ext3_check_for_iopen(struct inode *dir, struct dentry *dentry); ++extern int ext3_iopen_get_inode(struct inode *inode); ++extern struct dentry *iopen_connect_dentry(struct dentry *dentry, ++ struct inode *inode, int rehash); +Index: linux-stage/fs/ext3/namei.c =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/namei.c 2004-04-06 00:31:11.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/namei.c 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/fs/ext3/namei.c 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/fs/ext3/namei.c 2004-05-07 16:00:17.000000000 -0400 @@ -37,6 +37,7 @@ #include <linux/buffer_head.h> #include <linux/smp_lock.h> @@ -288,47 +350,78 @@ Index: linux-2.6.4-51.1/fs/ext3/namei.c #include "acl.h" /* -@@ -970,15 +971,21 @@ - } - #endif - -+struct dentry *iopen_connect_dentry(struct dentry *de, struct inode *inode); -+ - static struct dentry *ext3_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) - { - struct inode * inode; - struct ext3_dir_entry_2 * de; - struct buffer_head * bh; -+ struct dentry *alternate = NULL; - +@@ -979,6 +980,9 @@ if (dentry->d_name.len > EXT3_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); -+ if (ext3_check_for_iopen(dir, dentry)) -+ return NULL; ++ if (ext3_check_for_iopen(dir, dentry)) ++ return NULL; + bh = ext3_find_entry(dentry, &de); inode = NULL; if (bh) { -@@ -989,8 +996,14 @@ +@@ -989,10 +993,8 @@ if (!inode) return ERR_PTR(-EACCES); } -+ if (inode && (alternate = iopen_connect_dentry(dentry, inode))) { -+ iput(inode); -+ return alternate; -+ } +- if (inode) +- return d_splice_alias(inode, dentry); +- d_add(dentry, inode); +- return NULL; + - if (inode) - return d_splice_alias(inode, dentry); ++ return iopen_connect_dentry(dentry, inode, 1); + } 
+ + +@@ -2019,10 +2021,6 @@ + inode->i_nlink); + inode->i_version++; + inode->i_nlink = 0; +- /* There's no need to set i_disksize: the fact that i_nlink is +- * zero will ensure that the right thing happens during any +- * recovery. */ +- inode->i_size = 0; + ext3_orphan_add(handle, inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + ext3_mark_inode_dirty(handle, inode); +@@ -2139,6 +2137,23 @@ + return err; + } + ++/* Like ext3_add_nondir() except for call to iopen_connect_dentry */ ++static int ext3_add_link(handle_t *handle, struct dentry *dentry, ++ struct inode *inode) ++{ ++ int err = ext3_add_entry(handle, dentry, inode); ++ if (!err) { ++ err = ext3_mark_inode_dirty(handle, inode); ++ if (err == 0) { ++ (void)iopen_connect_dentry(dentry, inode, 0); ++ return 0; ++ } ++ } ++ ext3_dec_count(handle, inode); ++ iput(inode); ++ return err; ++} + - d_add(dentry, inode); - return NULL; + static int ext3_link (struct dentry * old_dentry, + struct inode * dir, struct dentry *dentry) + { +@@ -2161,7 +2176,8 @@ + ext3_inc_count(handle, inode); + atomic_inc(&inode->i_count); + +- err = ext3_add_nondir(handle, dentry, inode); ++ err = ext3_add_link(handle, dentry, inode); ++ ext3_orphan_del(handle,inode); + ext3_journal_stop(handle); + return err; } -Index: linux-2.6.4-51.1/fs/ext3/super.c +Index: linux-stage/fs/ext3/super.c =================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/super.c 2004-04-06 00:31:14.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/super.c 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/fs/ext3/super.c 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/fs/ext3/super.c 2004-05-07 17:21:59.000000000 -0400 @@ -536,7 +536,7 @@ Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload, Opt_commit, Opt_journal_update, Opt_journal_inum, @@ -353,24 +446,24 @@ Index: linux-2.6.4-51.1/fs/ext3/super.c set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_iopen: -+ set_opt 
(sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ set_opt (sbi->s_mount_opt, IOPEN); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); + break; + case Opt_noiopen: + clear_opt (sbi->s_mount_opt, IOPEN); -+ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ clear_opt (sbi->s_mount_opt, IOPEN_NOPRIV); + break; + case Opt_iopen_nopriv: -+ set_opt (sbi->s_mount_opt, IOPEN); -+ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); ++ set_opt (sbi->s_mount_opt, IOPEN); ++ set_opt (sbi->s_mount_opt, IOPEN_NOPRIV); + break; case Opt_ignore: break; default: -Index: linux-2.6.4-51.1/include/linux/ext3_fs.h +Index: linux-stage/include/linux/ext3_fs.h =================================================================== ---- linux-2.6.4-51.1.orig/include/linux/ext3_fs.h 2004-04-06 00:31:11.000000000 -0400 -+++ linux-2.6.4-51.1/include/linux/ext3_fs.h 2004-04-06 00:31:24.000000000 -0400 +--- linux-stage.orig/include/linux/ext3_fs.h 2004-05-07 16:00:16.000000000 -0400 ++++ linux-stage/include/linux/ext3_fs.h 2004-05-07 16:00:17.000000000 -0400 @@ -325,6 +325,8 @@ #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ @@ -380,16 +473,3 @@ Index: linux-2.6.4-51.1/include/linux/ext3_fs.h /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H -Index: linux-2.6.4-51.1/fs/ext3/Makefile -=================================================================== ---- linux-2.6.4-51.1.orig/fs/ext3/Makefile 2004-04-06 00:27:21.000000000 -0400 -+++ linux-2.6.4-51.1/fs/ext3/Makefile 2004-04-06 00:31:42.000000000 -0400 -@@ -5,7 +5,7 @@ - obj-$(CONFIG_EXT3_FS) += ext3.o - - ext3-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ -- ioctl.o namei.o super.o symlink.o hash.o -+ ioctl.o namei.o super.o symlink.o hash.o iopen.o - - ext3-$(CONFIG_EXT3_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o - ext3-$(CONFIG_EXT3_FS_POSIX_ACL) += acl.o diff --git 
a/lustre/kernel_patches/patches/iopen-misc-2.6-suse.patch b/lustre/kernel_patches/patches/iopen-misc-2.6-suse.patch index a84120a7de9b5c2ccfaa30928fa59d36f3833391..2d70c7b70ddbafd6a5f26ae498b9949dab7b053c 100644 --- a/lustre/kernel_patches/patches/iopen-misc-2.6-suse.patch +++ b/lustre/kernel_patches/patches/iopen-misc-2.6-suse.patch @@ -1,7 +1,7 @@ -Index: linux-2.6.0/Documentation/filesystems/ext2.txt +Index: linux-2.6.4-51.0/Documentation/filesystems/ext2.txt =================================================================== ---- linux-2.6.0.orig/Documentation/filesystems/ext2.txt 2002-11-11 06:28:06.000000000 +0300 -+++ linux-2.6.0/Documentation/filesystems/ext2.txt 2004-01-07 17:12:07.000000000 +0300 +--- linux-2.6.4-51.0.orig/Documentation/filesystems/ext2.txt 2004-05-06 22:21:26.000000000 -0400 ++++ linux-2.6.4-51.0/Documentation/filesystems/ext2.txt 2004-05-06 22:24:42.000000000 -0400 @@ -35,6 +35,22 @@ sb=n Use alternate superblock at this location. @@ -25,3 +25,56 @@ Index: linux-2.6.0/Documentation/filesystems/ext2.txt grpquota,noquota,quota,usrquota Quota options are silently ignored by ext2. +Index: linux-2.6.4-51.0/fs/dcache.c +=================================================================== +--- linux-2.6.4-51.0.orig/fs/dcache.c 2004-05-06 22:24:42.000000000 -0400 ++++ linux-2.6.4-51.0/fs/dcache.c 2004-05-06 22:58:37.000000000 -0400 +@@ -1195,12 +1195,11 @@ + * dcache entries should not be moved in this way. + */ + +-void d_move(struct dentry * dentry, struct dentry * target) ++void __d_move(struct dentry * dentry, struct dentry * target) + { + if (!dentry->d_inode) + printk(KERN_WARNING "VFS: moving negative dcache entry\n"); + +- spin_lock(&dcache_lock); + write_seqlock(&rename_lock); + /* + * XXXX: do we really need to take target->d_lock? 
+@@ -1253,6 +1252,14 @@ + spin_unlock(&target->d_lock); + spin_unlock(&dentry->d_lock); + write_sequnlock(&rename_lock); ++} ++ ++EXPORT_SYMBOL(__d_move); ++ ++void d_move(struct dentry *dentry, struct dentry *target) ++{ ++ spin_lock(&dcache_lock); ++ __d_move(dentry, target); + spin_unlock(&dcache_lock); + } + +Index: linux-2.6.4-51.0/include/linux/dcache.h +=================================================================== +--- linux-2.6.4-51.0.orig/include/linux/dcache.h 2004-05-06 22:24:42.000000000 -0400 ++++ linux-2.6.4-51.0/include/linux/dcache.h 2004-05-06 23:03:43.000000000 -0400 +@@ -234,6 +234,7 @@ + * This adds the entry to the hash queues. + */ + extern void d_rehash(struct dentry *); ++extern void __d_rehash(struct dentry *, int lock); + + /** + * d_add - add dentry to hash queues +@@ -252,6 +253,7 @@ + + /* used for rename() and baskets */ + extern void d_move(struct dentry *, struct dentry *); ++extern void __d_move(struct dentry *, struct dentry *); + + /* appendix may either be NULL or be used for transname suffixes */ + extern struct dentry * d_lookup(struct dentry *, struct qstr *); diff --git a/lustre/kernel_patches/patches/lustre_version.patch b/lustre/kernel_patches/patches/lustre_version.patch index 77c553120aaf1e0a9f6e61b3b618e45203499cd8..6d2b7e692ab2dd6d044f091f5a6eb4dffcb3a5d5 100644 --- a/lustre/kernel_patches/patches/lustre_version.patch +++ b/lustre/kernel_patches/patches/lustre_version.patch @@ -1,5 +1,6 @@ - - +Version 36: don't dput dentry after error (b=2350), zero page->private (3119) +Version 35: pass intent to real_lookup after revalidate failure (b=3285) +Version 34: fix ext3 iopen assertion failure (b=2517, b=2399) include/linux/lustre_version.h | 1 + 1 files changed, 1 insertion(+) @@ -7,6 +8,6 @@ --- /dev/null Fri Aug 30 17:31:37 2002 +++ linux-2.4.18-18.8.0-l12-braam/include/linux/lustre_version.h Thu Feb 13 07:58:33 2003 @@ -0,0 +1 @@ -+#define LUSTRE_KERNEL_VERSION 35 ++#define LUSTRE_KERNEL_VERSION 36 _ diff --git 
a/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch index 91dc15bbb7aa27eff5f6d8373451c7dfceb20caa..b7185b96f3f35f522d759469ad12a9d5810a393f 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.18-18-chaos65.patch @@ -351,12 +351,12 @@ Index: linux-2.4.18-p4smp/fs/namei.c + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ d_invalidate(dentry); ++ dput(dentry); + nd->dentry = new; + } + if (!nd->dentry->d_inode) diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.19-pre1.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.19-pre1.patch index 4ccfa4d54bb624b3ceab07d783b304141ec86ea3..4dd96bc7429e6e5d317f1b674f4a92f92241ce66 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.19-pre1.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.19-pre1.patch @@ -333,12 +333,12 @@ Index: linux-2.4.19-pre1/fs/namei.c + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ d_invalidate(dentry); ++ dput(dentry); + nd->dentry = new; + } + if (!nd->dentry->d_inode) diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch index b6ab3b670950cada0878ef54c0c97786bd7ae611..d8e28ca942270bf7ef9b75300deced2006745aa4 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.19-suse.patch @@ -316,12 +316,12 @@ Index: linux-2.4.19.SuSE/fs/namei.c + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ d_invalidate(dentry); ++ 
dput(dentry); + nd->dentry = new; + } + if (!nd->dentry->d_inode) diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch index 424d90ee02c145a9f8313bb80ad00406bd06f7f7..2af2a0408b3e0bf4146ff961ffb663fa2fcd40a5 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.20-hp.patch @@ -371,12 +371,12 @@ Index: linux/fs/namei.c + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ d_invalidate(dentry); ++ dput(dentry); + nd->dentry = new; + } + if (!nd->dentry->d_inode) diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch index 37bf22721176dcc0c80ac931a6689545a1371aa6..87eedc1f349406b6c660c7202b3bd4f82facdf5b 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.20-rh.patch @@ -377,12 +377,12 @@ Index: linux-2.4.20/fs/namei.c + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ d_invalidate(dentry); ++ dput(dentry); + nd->dentry = new; + } + if (!nd->dentry->d_inode) diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch index dd293f15352ff3f1dd67b29cd14d244da8a0c305..737f366d8e04ee911b0e980c9061b6e84ce978ed 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.20-vanilla.patch @@ -308,12 +308,12 @@ Index: linux-2.4.24/fs/namei.c + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ 
d_invalidate(dentry); ++ dput(dentry); + nd->dentry = new; + } + if (!nd->dentry->d_inode) diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch index 002651412b5b75be0223b5c78f7b9878a1c6c450..b33176798c90fa02de80365150eb2fa8c432a135 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.21-chaos.patch @@ -314,12 +314,12 @@ Index: linux-ia64/fs/namei.c + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ d_invalidate(dentry); ++ dput(dentry); + nd->dentry = new; + } + if (!nd->dentry->d_inode) diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.21-sles8sp3.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.21-sles8sp3.patch index 2ff2de8d3c43dd24818a307f9293d45a2e8e3d7b..7fdb5610480bad4e6399be0d57704d67d32f4cdc 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.21-sles8sp3.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.21-sles8sp3.patch @@ -314,12 +314,12 @@ Index: linux-2.4.21/fs/namei.c + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ d_invalidate(dentry); ++ dput(dentry); + nd->dentry = new; + } + if (!nd->dentry->d_inode) diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch index 71b46e5ba5d33081092ff25a147bf9753e322b74..85f8cf4d6bbceab20ad2e64ce786085cdb209728 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.21-suse2.patch @@ -314,12 +314,12 @@ Index: linux-2.4.21-x86_64/fs/namei.c + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ 
dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ d_invalidate(dentry); ++ dput(dentry); + nd->dentry = new; + } + if (!nd->dentry->d_inode) diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch index 7758b2c0b1edcf76467227160a66b65e296f107d..b51ff06c5218a3c1b9c328589f21139beae38e73 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.4.22-rh.patch @@ -302,12 +302,12 @@ + break; + new = real_lookup(dentry->d_parent, + &dentry->d_name, 0, it); -+ d_invalidate(dentry); -+ dput(dentry); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } ++ d_invalidate(dentry); ++ dput(dentry); + nd->dentry = new; + } + } else diff --git a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch index f40f808f7963f54f30a572824795da09fce9ca2f..c678b4ee6a787259a04f05f6bd09e04442f61fee 100644 --- a/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch +++ b/lustre/kernel_patches/patches/vfs_intent-2.6-suse.patch @@ -92,7 +92,7 @@ Index: linux-2.6.4-51.0/fs/namei.c } } return result; -@@ -563,6 +580,31 @@ +@@ -563,6 +580,33 @@ return PTR_ERR(dentry); } @@ -109,6 +109,8 @@ Index: linux-2.6.4-51.0/fs/namei.c + if ((err = permission(dentry->d_parent->d_inode, MAY_EXEC,nd))) + return err; + new = real_lookup(dentry->d_parent, &dentry->d_name, nd); ++ if (IS_ERR(new)) ++ return PTR_ERR(new); + d_invalidate(dentry); + dput(dentry); + nd->dentry = dentry = new; diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index d17e850ad34df69eb22213a30ac19d33a88cbb0c..83ad3c22c9e3c7a13bfb3d6f0309bf5915c09ec5 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -46,7 +46,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) int rq_portal, rp_portal, connect_op; char *name = obddev->obd_type->typ_name; char *mgmt_name = NULL; - int rc 
= 0; + int rc; struct obd_device *mgmt_obd; mgmtcli_register_for_events_t register_f; ENTRY; @@ -112,7 +112,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) cli->cl_max_pages_per_rpc = PTLRPC_MAX_BRW_PAGES; cli->cl_max_rpcs_in_flight = OSC_MAX_RIF_DEFAULT; - ldlm_get_ref(); + rc = ldlm_get_ref(); if (rc) { CERROR("ldlm_get_ref failed: %d\n", rc); GOTO(err, rc); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 37cca17404b9dfdd1c21c4ec5b5531bfbf970806..cfd1c8c89917fcb59ad7fbd133c634c24dd26080 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -1294,7 +1294,6 @@ static int ldlm_setup(void) rc = kernel_thread(ldlm_bl_thread_main, &bltd, 0); if (rc < 0) { CERROR("cannot start LDLM thread #%d: rc %d\n", i, rc); - LBUG(); GOTO(out_thread, rc); } wait_for_completion(&blp->blp_comp); @@ -1302,17 +1301,13 @@ static int ldlm_setup(void) rc = ptlrpc_start_n_threads(NULL, ldlm_state->ldlm_cancel_service, LDLM_NUM_THREADS, "ldlm_cn"); - if (rc) { - LBUG(); + if (rc) GOTO(out_thread, rc); - } rc = ptlrpc_start_n_threads(NULL, ldlm_state->ldlm_cb_service, LDLM_NUM_THREADS, "ldlm_cb"); - if (rc) { - LBUG(); + if (rc) GOTO(out_thread, rc); - } INIT_LIST_HEAD(&expired_lock_thread.elt_expired_locks); spin_lock_init(&expired_lock_thread.elt_lock); diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index 4d157c223c4cf75f8a44f49741187df9591c5638..51d8e185d8bec5ad05e269e55ce4f43365f7a17c 100644 --- a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -105,7 +105,7 @@ static int llu_local_open(struct llu_inode_info *lli, struct lookup_intent *it) /* already opened? 
*/ if (lli->lli_open_count++) RETURN(0); - + LASSERT(!lli->lli_file_data); OBD_ALLOC(fd, sizeof(*fd)); @@ -266,20 +266,19 @@ int llu_mdc_close(struct obd_export *mdc_exp, struct inode *inode) &fd->fd_cwlockh); } - valid = OBD_MD_FLID; + obdo.o_id = lli->lli_st_ino; + obdo.o_valid = OBD_MD_FLID; + valid = OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLSIZE |OBD_MD_FLBLOCKS | + OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME; if (test_bit(LLI_F_HAVE_OST_SIZE_LOCK, &lli->lli_flags)) valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; - memset(&obdo, 0, sizeof(obdo)); - obdo.o_id = lli->lli_st_ino; - obdo.o_mode = lli->lli_st_mode; - obdo.o_size = lli->lli_st_size; - obdo.o_blocks = lli->lli_st_blocks; + obdo_from_inode(&obdo, inode, valid); + if (0 /* ll_is_inode_dirty(inode) */) { obdo.o_flags = MDS_BFLAG_UNCOMMITTED_WRITES; - valid |= OBD_MD_FLFLAGS; + obdo.o_valid |= OBD_MD_FLFLAGS; } - obdo.o_valid = valid; rc = mdc_close(mdc_exp, &obdo, och, &req); if (rc == EAGAIN) { /* We are the last writer, so the MDS has instructed us to get @@ -287,7 +286,7 @@ int llu_mdc_close(struct obd_export *mdc_exp, struct inode *inode) //ll_queue_done_writing(inode); rc = 0; } else if (rc) { - CERROR("inode %lu close failed: rc = %d\n", lli->lli_st_ino, rc); + CERROR("inode %lu close failed: rc %d\n", lli->lli_st_ino, rc); } else { rc = llu_objects_destroy(req, inode); if (rc) diff --git a/lustre/liblustre/genlib.sh b/lustre/liblustre/genlib.sh index f371650331be3d91847503182645e8cf753ace69..c31ea2f75ac4314ca461ff3e6fec9e3635817c31 100755 --- a/lustre/liblustre/genlib.sh +++ b/lustre/liblustre/genlib.sh @@ -83,6 +83,6 @@ $RANLIB $CWD/liblustre.a # create shared lib lustre rm -f $CWD/liblustre.so $LD -shared -o $CWD/liblustre.so -init __liblustre_setup_ -fini __liblustre_cleanup_ \ - $ALL_OBJS -lpthread + $ALL_OBJS -lcap -lpthread #rm -rf $sysio_tmp diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index 
fbd199bb2b6a97b8623b687822185f16edd664e8..f5b2ba5e21dacfc270ef6f1ea8ff7bbeeba43462 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -25,6 +25,10 @@ #include <string.h> #include <assert.h> #include <signal.h> +#include <fcntl.h> +#include <netdb.h> +#include <syscall.h> +#include <sys/utsname.h> #include <sys/types.h> #include <sys/queue.h> @@ -98,28 +102,147 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str) return str; } -void init_current(char *comm) +/* + * random number generator stuff + */ +static int _rand_dev_fd = -1; + +static int get_ipv4_addr() +{ + struct utsname myname; + struct hostent *hptr; + int ip; + + if (uname(&myname) < 0) + return 0; + + hptr = gethostbyname(myname.nodename); + if (hptr == NULL || + hptr->h_addrtype != AF_INET || + *hptr->h_addr_list == NULL) { + printf("LibLustre: Warning: fail to get local IPv4 address\n"); + return 0; + } + + ip = ntohl(*((int *) *hptr->h_addr_list)); + + return ip; +} + +static void init_random() +{ + int seed; + struct timeval tv; + + _rand_dev_fd = syscall(SYS_open, "/dev/urandom", O_RDONLY); + if (_rand_dev_fd >= 0) { + if (syscall(SYS_read, _rand_dev_fd, &seed, sizeof(int)) == + sizeof(int)) { + srand(seed); + return; + } + syscall(SYS_close, _rand_dev_fd); + _rand_dev_fd = -1; + } + + gettimeofday(&tv, NULL); + srand(tv.tv_sec + tv.tv_usec + getpid() + __swab32(get_ipv4_addr())); +} + +void get_random_bytes(void *buf, int size) +{ + char *p = buf; + + if (size < 1) + return; + + if (_rand_dev_fd >= 0) { + if (syscall(SYS_read, _rand_dev_fd, buf, size) == size) + return; + syscall(SYS_close, _rand_dev_fd); + _rand_dev_fd = -1; + } + + while (size--) + *p++ = rand(); +} + +int in_group_p(gid_t gid) +{ + int i; + + if (gid == current->fsgid) + return 1; + + for (i = 0; i < current->ngroups; i++) { + if (gid == current->groups[i]) + return 1; + } + + return 0; +} + +static void init_capability(int *res) +{ + cap_t syscap; + cap_flag_value_t capval; + int i; + + *res 
= 0; + + syscap = cap_get_proc(); + if (!syscap) { + printf("Liblustre: Warning: failed to get system capability, " + "set to minimal\n"); + return; + } + + for (i = 0; i < sizeof(cap_value_t) * 8; i++) { + if (!cap_get_flag(syscap, i, CAP_EFFECTIVE, &capval)) { + if (capval == CAP_SET) { + *res |= 1 << i; + } + } + } +} + +static int init_current(char *comm) { current = malloc(sizeof(*current)); - current->fs = malloc(sizeof(*current->fs)); + if (!current) { + CERROR("Not enough memory\n"); + return -ENOMEM; + } + current->fs = ¤t->__fs; current->fs->umask = umask(0777); umask(current->fs->umask); + strncpy(current->comm, comm, sizeof(current->comm)); current->pid = getpid(); - current->fsuid = 0; - current->fsgid = 0; - current->cap_effective = -1; + current->fsuid = geteuid(); + current->fsgid = getegid(); memset(¤t->pending, 0, sizeof(current->pending)); + + current->max_groups = sysconf(_SC_NGROUPS_MAX); + current->groups = malloc(sizeof(gid_t) * current->max_groups); + if (!current->groups) { + CERROR("Not enough memory\n"); + return -ENOMEM; + } + current->ngroups = getgroups(current->max_groups, current->groups); + if (current->ngroups < 0) { + perror("Error getgroups"); + return -EINVAL; + } + + init_capability(¤t->cap_effective); + + return 0; } -/* FIXME */ void generate_random_uuid(unsigned char uuid_out[16]) { - int *arr = (int*)uuid_out; - int i; - - for (i = 0; i < sizeof(uuid_out)/sizeof(int); i++) - arr[i] = rand(); + get_random_bytes(uuid_out, sizeof(uuid_out)); } ptl_nid_t tcpnal_mynid; @@ -191,6 +314,10 @@ int lib_ioctl(int dev_id, unsigned int opc, void * ptr) int lllib_init(char *dumpfile) { + pid_t pid; + uint32_t ip; + struct in_addr in; + if (!g_zconf) { /* this parse only get my nid from config file * before initialize portals @@ -198,13 +325,21 @@ int lllib_init(char *dumpfile) if (parse_dump(dumpfile, lib_ioctl_nalcmd)) return -1; } else { - /* XXX need setup mynid before tcpnal initialize */ - tcpnal_mynid = ((uint64_t)getpid() << 32) | 
time(0); - printf("LibLustre: TCPNAL NID: %016llx\n", tcpnal_mynid); + /* need to setup mynid before tcpnal initialization */ + /* a meaningful nid could help debugging */ + ip = get_ipv4_addr(); + if (ip == 0) + get_random_bytes(&ip, sizeof(ip)); + pid = getpid() & 0xffffffff; + tcpnal_mynid = ((uint64_t)ip << 32) | pid; + + in.s_addr = htonl(ip); + printf("LibLustre: TCPNAL NID: %016llx (%s:%u)\n", + tcpnal_mynid, inet_ntoa(in), pid); } - init_current("dummy"); - if (init_obdclass() || + if (init_current("dummy") || + init_obdclass() || init_lib_portals() || ptlrpc_init() || mdc_init() || @@ -331,11 +466,6 @@ out: RETURN(rc); } -static void sighandler_USR1(int signum) -{ - /* do nothing */ -} - /* parse host:/mdsname/profile string */ int ll_parse_mount_target(const char *target, char **mdsnid, char **mdsname, char **profile) @@ -390,16 +520,8 @@ void __liblustre_setup_(void) char *lustre_driver = "llite"; char *root_path = "/"; unsigned mntflgs = 0; - int err; - /* consider tha case of starting multiple liblustre instances - * at a same time on single node. 
- */ - srand(time(NULL) + getpid()); - - signal(SIGUSR1, sighandler_USR1); - lustre_path = getenv(ENV_LUSTRE_MNTPNT); if (!lustre_path) { lustre_path = "/mnt/lustre"; @@ -455,6 +577,8 @@ void __liblustre_setup_(void) portal_debug = 0; portal_subsystem_debug = 0; #endif + init_random(); + err = lllib_init(dumpfile); if (err) { perror("init llite driver"); diff --git a/lustre/liblustre/namei.c b/lustre/liblustre/namei.c index 0403ad517e4b8fe00f66a0fb5c7ec875790c3cb5..6e596d274bc2bad6189d1ac1abe60ca76eceef0e 100644 --- a/lustre/liblustre/namei.c +++ b/lustre/liblustre/namei.c @@ -319,7 +319,11 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, /* NB 1 request reference will be taken away by ll_intent_lock() * when I return - * Note: libsysio require the inode must be generated here + */ + /* FIXME: for CREAT, libsysio require the inode must be generated here + * currently here we don't know the whether the create is successful + * or failed on mds. thus blinded return -EPERM in llu_iget(). need + * a fix later. */ if ((it->it_op & IT_CREAT) || !it_disposition(it, DISP_LOOKUP_NEG)) { struct lustre_md md; @@ -331,11 +335,11 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, RETURN(rc); inode = llu_iget(parent->i_fs, &md); - if (!inode) { + if (!inode || IS_ERR(inode)) { /* free the lsm if we allocated one above */ if (md.lsm != NULL) obd_free_memmd(sbi->ll_osc_exp, &md.lsm); - RETURN(-ENOMEM); + RETURN(inode ? 
PTR_ERR(inode) : -ENOMEM); } else if (md.lsm != NULL && llu_i2info(inode)->lli_smd != md.lsm) { obd_free_memmd(sbi->ll_osc_exp, &md.lsm); diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index 47ac4432781826669b09f1065338aad2151e0831..9fe16e59f133421369c90093368ac8e17584ad7f 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -448,21 +448,12 @@ void put_sysio_cookie(struct llu_sysio_cookie *cookie) struct lov_stripe_md *lsm = llu_i2info(cookie->lsc_inode)->lli_smd; struct obd_export *exp = llu_i2obdexp(cookie->lsc_inode); struct ll_async_page *llap = cookie->lsc_llap; -#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE - struct page *pages = cookie->lsc_pages; -#endif int i; for (i = 0; i< cookie->lsc_maxpages; i++) { if (llap[i].llap_cookie) obd_teardown_async_page(exp, lsm, NULL, llap[i].llap_cookie); -#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE - if (pages[i]._managed) { - free(pages[i].addr); - pages[i]._managed = 0; - } -#endif } I_RELE(cookie->lsc_inode); @@ -471,85 +462,6 @@ void put_sysio_cookie(struct llu_sysio_cookie *cookie) OBD_FREE(cookie, LLU_SYSIO_COOKIE_SIZE(cookie->lsc_maxpages)); } -#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE -/* Note: these code should be removed finally, don't need - * more cleanup - */ -static -int prepare_unaligned_write(struct llu_sysio_cookie *cookie) -{ - struct inode *inode = cookie->lsc_inode; - struct llu_inode_info *lli = llu_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct obdo oa; - struct page *pages = cookie->lsc_pages; - int i, pgidx[2] = {0, cookie->lsc_npages-1}; - int rc; - ENTRY; - - for (i = 0; i < 2; i++) { - struct page *oldpage = &pages[pgidx[i]]; - struct page newpage; - struct brw_page pg; - char *newbuf; - - if (i == 0 && pgidx[0] == pgidx[1]) - continue; - - LASSERT(oldpage->_offset + oldpage->_count <= PAGE_CACHE_SIZE); - - if (oldpage->_count == PAGE_CACHE_SIZE) - continue; - - if (oldpage->index << PAGE_CACHE_SHIFT >= - lli->lli_st_size) - continue; - - newbuf = 
malloc(PAGE_CACHE_SIZE); - if (!newbuf) - return -ENOMEM; - - newpage.index = oldpage->index; - newpage.addr = newbuf; - - pg.pg = &newpage; - pg.off = ((obd_off)newpage.index << PAGE_CACHE_SHIFT); - if (pg.off + PAGE_CACHE_SIZE > lli->lli_st_size) - pg.count = lli->lli_st_size % PAGE_CACHE_SIZE; - else - pg.count = PAGE_CACHE_SIZE; - pg.flag = 0; - - oa.o_id = lsm->lsm_object_id; - oa.o_mode = lli->lli_st_mode; - oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | OBD_MD_FLTYPE; - - /* issue read */ - rc = obd_brw(OBD_BRW_READ, llu_i2obdexp(inode), &oa, lsm, 1, &pg, NULL); - if (rc) { - free(newbuf); - RETURN(rc); - } - - /* copy page content, and reset page params */ - memcpy(newbuf + oldpage->_offset, - (char*)oldpage->addr + oldpage->_offset, - oldpage->_count); - - oldpage->addr = newbuf; - if ((((obd_off)oldpage->index << PAGE_CACHE_SHIFT) + - oldpage->_offset + oldpage->_count) > lli->lli_st_size) - oldpage->_count += oldpage->_offset; - else - oldpage->_count = PAGE_CACHE_SIZE; - oldpage->_offset = 0; - oldpage->_managed = 1; - } - - RETURN(0); -} -#endif - static int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd, char *buf, loff_t pos, size_t count) @@ -600,14 +512,6 @@ int llu_prep_async_io(struct llu_sysio_cookie *cookie, int cmd, cookie->lsc_npages = npages; -#ifdef LIBLUSTRE_HANDLE_UNALIGNED_PAGE - if (cmd == OBD_BRW_WRITE) { - rc = prepare_unaligned_write(cookie); - if (rc) - RETURN(rc); - } -#endif - for (i = 0; i < npages; i++) { llap[i].llap_magic = LLAP_MAGIC; rc = obd_prep_async_page(exp, lsm, NULL, &pages[i], @@ -741,7 +645,7 @@ llu_file_write(struct inode *inode, const struct iovec *iovec, if (err != ELDLM_OK) GOTO(err_out, err = -ENOLCK); - CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n", + CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %llu\n", lli->lli_st_ino, count, pos); cookie = llu_rw(OBD_BRW_WRITE, inode, buf, count, pos); diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 
86048e636f317a991ec005a0380ca13d985a04e8..4bedaccf32b23dd6e84798a1fd2825bcc5cb0c43 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -1290,8 +1290,11 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md) if ((md->body->valid & (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) != - (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) - CERROR("invalide fields!\n"); + (OBD_MD_FLGENER | OBD_MD_FLID | OBD_MD_FLTYPE)) { + /* FIXME this is workaround for for open(O_CREAT), + * see lookup_it_finish(). */ + return ERR_PTR(-EPERM); + } /* try to find existing inode */ fid.id = md->body->ino; @@ -1490,7 +1493,7 @@ llu_fsswop_mount(const char *source, LASSERT(sbi->ll_rootino != 0); root = llu_iget(fs, &md); - if (root == NULL) { + if (!root || IS_ERR(root)) { CERROR("fail to generate root inode\n"); GOTO(out_request, err = -EBADF); } diff --git a/lustre/liblustre/tests/Makefile.am b/lustre/liblustre/tests/Makefile.am index 81e70588288f6dbf47fb47df146f0bee8419c562..ff73edff0991bb11af7f318d79e520f882820398 100644 --- a/lustre/liblustre/tests/Makefile.am +++ b/lustre/liblustre/tests/Makefile.am @@ -4,7 +4,7 @@ AM_CPPFLAGS = -I$(SYSIO)/include -I/opt/lam/include $(LLCPPFLAGS) -I$(top_srcdir AM_CFLAGS = $(LLCFLAGS) LIBS = $(LIBEFENCE) $(LIBREADLINE) -LLIB_EXEC= ../liblustre.a -lpthread +LLIB_EXEC= ../liblustre.a -lcap -lpthread if LIBLUSTRE noinst_LIBRARIES = libtestcommon.a @@ -21,7 +21,7 @@ libtestcommon_a_SOURCES = test_common.c test_common.h echo_test_SOURCES = echo_test.c ../../utils/parser.c ../../utils/obd.c ../../utils/lustre_cfg.c echo_test_CFLAGS = $(LL_CFLAGS) -echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lpthread +echo_test_LDADD = ../liblsupport.a $(LIBREADLINE) -lcap -lpthread echo_test_DEPENDENCIES=$(top_builddir)/liblustre/liblsupport.a sanity_SOURCES = sanity.c diff --git a/lustre/liblustre/tests/echo_test.c b/lustre/liblustre/tests/echo_test.c index 
f2230ab19afe5f4060855bbf12a1a477dcb67c7e..19fd83a9d1dd1923077bae91a769e502ad1de294 100644 --- a/lustre/liblustre/tests/echo_test.c +++ b/lustre/liblustre/tests/echo_test.c @@ -24,6 +24,17 @@ struct obd_import; unsigned int portal_subsystem_debug = ~0 - (S_PORTALS | S_QSWNAL | S_SOCKNAL | S_GMNAL | S_IBNAL); +void get_random_bytes(void *ptr, int size) +{ + char *p = ptr; + + if (size < 1) + return; + + while(size--) + *p++ = rand(); +} + void *inter_module_get(char *arg) { if (!strcmp(arg, "tcpnal_ni")) @@ -81,6 +92,11 @@ libcfs_nal_cmd(struct portals_cfg *pcfg) return 0; } +int in_group_p(gid_t gid) +{ + return 0; +} + int init_current(int argc, char **argv) { current = malloc(sizeof(*current)); diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index a719ca1d46d934a856e13b459578d80cfa522e9a..544d2cf98e93eabc0325b8b672ce13b9d943c5e7 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -49,11 +49,27 @@ static void ll_release(struct dentry *de) EXIT; } +/* should NOT be called with the dcache lock, see fs/dcache.c */ +static int ll_ddelete(struct dentry *de) +{ + ENTRY; + LASSERT(de); + CDEBUG(D_DENTRY, "%s dentry %*s (%p, parent %p, inode %p) %s%s\n", + (de->d_flags & DCACHE_LUSTRE_INVALID ? "keeping" : "deleting"), + de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode, + d_unhashed(de) ? "" : "hashed,", + list_empty(&de->d_subdirs) ? 
"" : "subdirs"); + RETURN(0); +} + void ll_set_dd(struct dentry *de) { ENTRY; LASSERT(de != NULL); + CDEBUG(D_DENTRY, "ldd on dentry %*s (%p) parent %p inode %p refc %d\n", + de->d_name.len, de->d_name.name, de, de->d_parent, de->d_inode, + atomic_read(&de->d_count)); lock_kernel(); if (de->d_fsdata == NULL) { OBD_ALLOC(de->d_fsdata, sizeof(struct ll_dentry_data)); @@ -93,39 +109,47 @@ void ll_intent_release(struct lookup_intent *it) void ll_unhash_aliases(struct inode *inode) { - struct list_head *tmp, *head; + struct list_head *tmp, *head; struct ll_sb_info *sbi; ENTRY; - sbi = ll_i2sbi(inode); - - CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n", - inode->i_ino, inode->i_generation, inode); - if (inode == NULL) { CERROR("unexpected NULL inode, tell phil\n"); return; } + + CDEBUG(D_INODE, "marking dentries for ino %lu/%u(%p) invalid\n", + inode->i_ino, inode->i_generation, inode); + + sbi = ll_i2sbi(inode); head = &inode->i_dentry; restart: - spin_lock(&dcache_lock); - tmp = head; - while ((tmp = tmp->next) != head) { - struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); - if (!atomic_read(&dentry->d_count)) { - dget_locked(dentry); - __d_drop(dentry); - spin_unlock(&dcache_lock); - dput(dentry); - goto restart; - } else { + spin_lock(&dcache_lock); + tmp = head; + while ((tmp = tmp->next) != head) { + struct dentry *dentry = list_entry(tmp, struct dentry, d_alias); + if (atomic_read(&dentry->d_count) == 0) { + CDEBUG(D_DENTRY, "deleting dentry %*s (%p) parent %p " + "inode %p\n", dentry->d_name.len, + dentry->d_name.name, dentry, dentry->d_parent, + dentry->d_inode); + dget_locked(dentry); + __d_drop(dentry); + spin_unlock(&dcache_lock); + dput(dentry); + goto restart; + } else if (!(dentry->d_flags & DCACHE_LUSTRE_INVALID)) { + CDEBUG(D_DENTRY, "unhashing dentry %*s (%p) parent %p " + "inode %p refc %d\n", dentry->d_name.len, + dentry->d_name.name, dentry, dentry->d_parent, + dentry->d_inode, atomic_read(&dentry->d_count)); 
hlist_del_init(&dentry->d_hash); dentry->d_flags |= DCACHE_LUSTRE_INVALID; hlist_add_head(&dentry->d_hash, &sbi->ll_orphan_dentry_list); } - } - spin_unlock(&dcache_lock); + } + spin_unlock(&dcache_lock); EXIT; } @@ -244,7 +268,7 @@ int ll_revalidate_it(struct dentry *de, int flags, struct lookup_intent *it) it = &lookup_it; GOTO(out, rc = 0); } - + if (req) ptlrpc_req_finished(req); req = NULL; @@ -286,8 +310,13 @@ int ll_revalidate_it(struct dentry *de, int flags, struct lookup_intent *it) ptlrpc_req_finished(req); if (rc == 0) { ll_unhash_aliases(de->d_inode); - de->d_flags |= DCACHE_LUSTRE_INVALID; + /* done in ll_unhash_aliases() + dentry->d_flags |= DCACHE_LUSTRE_INVALID; */ } else { + CDEBUG(D_DENTRY, "revalidated dentry %*s (%p) parent %p " + "inode %p refc %d\n", de->d_name.len, + de->d_name.name, de, de->d_parent, de->d_inode, + atomic_read(&de->d_count)); ll_lookup_finish_locks(it, de); de->d_flags &= ~DCACHE_LUSTRE_INVALID; } @@ -400,6 +429,7 @@ struct dentry_operations ll_d_ops = { .d_revalidate_it = ll_revalidate_it, #endif .d_release = ll_release, + .d_delete = ll_ddelete, #if 0 .d_pin = ll_pin, .d_unpin = ll_unpin, diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 05f657386c71da1a0aff6c210f5dfd76effc971a..961a00ebb22d3d6ff2411b0c56b1b9b390a0a043 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -635,12 +635,14 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, int ll_dir_open(struct inode *inode, struct file *file) { - return ll_file_open(inode, file); + ENTRY; + RETURN(ll_file_open(inode, file)); } int ll_dir_release(struct inode *inode, struct file *file) { - return ll_file_release(inode, file); + ENTRY; + RETURN(ll_file_release(inode, file)); } struct file_operations ll_dir_operations = { diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 61bb36da44a5cbeed597ab0986825daa50feb7a5..d06de4a4d6b053b90e702fe18c0368d2f8bf74b6 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -39,7 +39,7 @@ int 
ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode, struct ptlrpc_request *req = NULL; struct obd_client_handle *och = &fd->fd_mds_och; struct obdo obdo; - int rc, valid; + int rc; ENTRY; /* clear group lock, if present */ @@ -50,18 +50,16 @@ int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode, &fd->fd_cwlockh); } - valid = OBD_MD_FLID; - - memset(&obdo, 0, sizeof(obdo)); obdo.o_id = inode->i_ino; - obdo.o_mode = inode->i_mode; - obdo.o_size = inode->i_size; - obdo.o_blocks = inode->i_blocks; + obdo.o_valid = OBD_MD_FLID; + obdo_from_inode(&obdo, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE | + OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | + OBD_MD_FLATIME | OBD_MD_FLMTIME | + OBD_MD_FLCTIME); if (0 /* ll_is_inode_dirty(inode) */) { obdo.o_flags = MDS_BFLAG_UNCOMMITTED_WRITES; - valid |= OBD_MD_FLFLAGS; + obdo.o_valid |= OBD_MD_FLFLAGS; } - obdo.o_valid = valid; rc = mdc_close(mdc_exp, &obdo, och, &req); if (rc == EAGAIN) { /* We are the last writer, so the MDS has instructed us to get @@ -188,7 +186,8 @@ int ll_local_open(struct file *file, struct lookup_intent *it) int ll_file_open(struct inode *inode, struct file *file) { struct ll_inode_info *lli = ll_i2info(inode); - struct lookup_intent *it; + struct lookup_intent *it, oit = { .it_op = IT_OPEN, + .it_flags = file->f_flags }; struct lov_stripe_md *lsm; struct ptlrpc_request *req; int rc = 0; @@ -203,9 +202,7 @@ int ll_file_open(struct inode *inode, struct file *file) it = file->f_it; - if (!it->d.lustre.it_disposition) { - struct lookup_intent oit = { .it_op = IT_OPEN, - .it_flags = file->f_flags }; + if (!it || !it->d.lustre.it_disposition) { it = &oit; rc = ll_intent_file_open(file, NULL, 0, it); if (rc) diff --git a/lustre/llite/llite_close.c b/lustre/llite/llite_close.c index 3e1c1955ecfdbf01a8dc1870ac3af5d3e592ee62..c2137812a098a1708a149db75ee27105eada5e37 100644 --- a/lustre/llite/llite_close.c +++ b/lustre/llite/llite_close.c @@ -134,7 +134,7 @@ static void ll_close_done_writing(struct inode *inode) 
rc = ll_extent_lock(NULL, inode, lli->lli_smd, LCK_PW, &policy, &lockh, ast_flags); - if (rc != ELDLM_OK) { + if (rc != 0) { CERROR("lock acquisition failed (%d): unable to send " "DONE_WRITING for inode %lu/%u\n", rc, inode->i_ino, inode->i_generation); diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 4c68ca7726c830089a37a50190cf7736733c334f..53796c40fddccb5faec3da3ea1f3c22e1e878a04 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -239,6 +239,8 @@ void lustre_common_put_super(struct super_block *sb) spin_lock(&dcache_lock); hlist_for_each_safe(tmp, next, &sbi->ll_orphan_dentry_list) { struct dentry *dentry = hlist_entry(tmp, struct dentry, d_hash); + CWARN("orphan dentry %*s (%p) at unmount\n", + dentry->d_name.len, dentry->d_name.name, dentry); shrink_dcache_parent(dentry); } spin_unlock(&dcache_lock); @@ -1132,6 +1134,7 @@ void ll_read_inode2(struct inode *inode, void *opaque) LTIME_S(inode->i_mtime) = 0; LTIME_S(inode->i_atime) = 0; LTIME_S(inode->i_ctime) = 0; + inode->i_rdev = 0; ll_update_inode(inode, md->body, md->lsm); /* OIDEBUG(inode); */ diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index 9ca3ec691b4cef6eef65a32e6ff2cb406bb8884d..d9eb99bcff5c6af838fdd6d98c4022ab63614444 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -76,6 +76,10 @@ static int ll_test_inode(struct inode *inode, void *opaque) md->body->ino, md->body->generation); } +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + if (inode->i_ino != md->body->ino) + return 0; +#endif if (inode->i_generation != md->body->generation) return 0; @@ -267,6 +271,9 @@ struct dentry *ll_find_alias(struct inode *inode, struct dentry *de) atomic_inc(&dentry->d_count); iput(inode); dentry->d_flags &= ~DCACHE_LUSTRE_INVALID; + CDEBUG(D_DENTRY, "alias dentry %*s (%p) parent %p inode %p " + "refc %d\n", de->d_name.len, de->d_name.name, de, + de->d_parent, de->d_inode, atomic_read(&de->d_count)); return dentry; } diff --git a/lustre/llite/rw.c 
b/lustre/llite/rw.c index 9236c542177544dd5a891404d6ae19fd094b153a..8e9aaf24c39e2614a8be2746bcd583363f06d993 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -65,21 +65,22 @@ static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, ENTRY; pg.pg = page; - pg.off = ((obd_off)page->index) << PAGE_SHIFT; + pg.disk_offset = pg.page_offset = ((obd_off)page->index) << PAGE_SHIFT; - if (cmd == OBD_BRW_WRITE && (pg.off + PAGE_SIZE > inode->i_size)) + if (cmd == OBD_BRW_WRITE && + (pg.disk_offset + PAGE_SIZE > inode->i_size)) pg.count = inode->i_size % PAGE_SIZE; else pg.count = PAGE_SIZE; CDEBUG(D_PAGE, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n", cmd & OBD_BRW_WRITE ? "write" : "read", pg.count, inode->i_ino, - pg.off, pg.off); + pg.disk_offset, pg.disk_offset); if (pg.count == 0) { CERROR("ZERO COUNT: ino %lu: size %p:%Lu(%p:%Lu) idx %lu off " - LPU64"\n", - inode->i_ino, inode, inode->i_size, page->mapping->host, - page->mapping->host->i_size, page->index, pg.off); + LPU64"\n", inode->i_ino, inode, inode->i_size, + page->mapping->host, page->mapping->host->i_size, + page->index, pg.disk_offset); } pg.flag = flags; @@ -159,7 +160,7 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from, /* Check to see if we should return -EIO right away */ pga.pg = page; - pga.off = offset; + pga.disk_offset = pga.page_offset = offset; pga.count = PAGE_SIZE; pga.flag = 0; diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c index 8a3099f1fffc8104bf41dfc83aaa1fcecaddda8a..5ed4bfd3c7c87b7b561bd4016bf966b3d53c3881 100644 --- a/lustre/llite/rw24.c +++ b/lustre/llite/rw24.c @@ -116,11 +116,6 @@ static int ll_direct_IO_24(int rw, if (!lsm || !lsm->lsm_object_id) RETURN(-EBADF); - /* FIXME: io smaller than PAGE_SIZE is broken on ia64 */ - if ((iobuf->offset & (PAGE_SIZE - 1)) || - (iobuf->length & (PAGE_SIZE - 1))) - RETURN(-EINVAL); - set = ptlrpc_prep_set(); if (set == NULL) RETURN(-ENOMEM); @@ -132,15 +127,17 @@ static int ll_direct_IO_24(int rw, } 
flags = 0 /* | OBD_BRW_DIRECTIO */; - offset = ((obd_off)blocknr << inode->i_blkbits); + offset = ((obd_off)blocknr * blocksize); length = iobuf->length; + pga[0].page_offset = iobuf->offset; + LASSERT(iobuf->offset < PAGE_SIZE); for (i = 0, length = iobuf->length; length > 0; length -= pga[i].count, offset += pga[i].count, i++) { /*i last!*/ pga[i].pg = iobuf->maplist[i]; - pga[i].off = offset; + pga[i].disk_offset = offset; /* To the end of the page, or the length, whatever is less */ - pga[i].count = min_t(int, PAGE_SIZE - (offset & ~PAGE_MASK), + pga[i].count = min_t(int, PAGE_SIZE - pga[i].page_offset, length); pga[i].flag = flags; if (rw == READ) @@ -167,6 +164,14 @@ static int ll_direct_IO_24(int rw, CERROR("error from callback: rc = %d\n", rc); } ptlrpc_set_destroy(set); + if (rc == 0 && rw == WRITE) { + void lov_increase_kms(struct obd_export *, + struct lov_stripe_md *, obd_off size); + obd_off size = offset + length; + lov_increase_kms(ll_i2obdexp(inode), lsm, size); + if (size > inode->i_size) + inode->i_size = size; + } if (rc == 0) { rc = iobuf->length; obdo_to_inode(inode, &oa, OBD_MD_FLBLOCKS); diff --git a/lustre/llite/special.c b/lustre/llite/special.c index 084381e75bcf1d61b45ddcf9d21d468b2454026f..c932752bbd78f023e75a5efef71035308df1bf14 100644 --- a/lustre/llite/special.c +++ b/lustre/llite/special.c @@ -77,9 +77,9 @@ static void save_fops(struct file *filp, struct inode *inode, else if (S_ISFIFO(inode->i_mode)) filp->f_op = &ll_special_fifo_file_fops; - CWARN("saved %p, replaced with %p\n", *pfop, filp->f_op); + CDEBUG(D_INFO,"saved %p, replaced with %p\n", *pfop,filp->f_op); if ((*pfop)->owner) - CWARN("%p has owner %p\n", *pfop,(*pfop)->owner); + CDEBUG(D_INFO,"%p has owner %p\n",*pfop,(*pfop)->owner); } } @@ -309,7 +309,7 @@ static int ll_special_open(struct inode *inode, struct file *filp) err = ll_local_open(filp, it); if (rc != 0) { - CERROR("error opening special file: rc %d", rc); + CERROR("error opening special file: rc %d\n", rc); 
ll_mdc_close(ll_i2sbi(inode)->ll_mdc_exp, inode, filp); } else if (err) { if (pfop && *pfop) { @@ -348,12 +348,12 @@ struct inode_operations ll_special_inode_operations = { }; struct file_operations ll_special_chr_inode_fops = { - .owner = THIS_MODULE, + //FIXME .owner = THIS_MODULE, .open = ll_special_open, }; struct file_operations ll_special_blk_inode_fops = { - .owner = THIS_MODULE, + //FIXME .owner = THIS_MODULE, .read = ll_special_read, .write = ll_special_write, .ioctl = ll_special_ioctl, @@ -365,17 +365,17 @@ struct file_operations ll_special_blk_inode_fops = { }; struct file_operations ll_special_fifo_inode_fops = { - .owner = THIS_MODULE, + //FIXME .owner = THIS_MODULE, .open = ll_special_open, }; struct file_operations ll_special_sock_inode_fops = { - .owner = THIS_MODULE, + //FIXME .owner = THIS_MODULE, .open = ll_special_open }; struct file_operations ll_special_chr_file_fops = { - .owner = THIS_MODULE, + //FIXME .owner = THIS_MODULE, .llseek = ll_special_file_seek, .read = ll_special_file_read, .write = ll_special_file_write, @@ -387,7 +387,7 @@ struct file_operations ll_special_chr_file_fops = { }; struct file_operations ll_special_fifo_file_fops = { - .owner = THIS_MODULE, + //FIXME .owner = THIS_MODULE, .llseek = ll_special_file_seek, .read = ll_special_file_read, .write = ll_special_file_write, diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index dfecb74ec32cb1ad303613a72a6c2d4b32d6e2e2..f16940a5f24ff38c15e527ce6e62525ed4334059 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -396,6 +396,14 @@ static int lov_setup(struct obd_device *obd, obd_count len, void *buf) RETURN(-EINVAL); } + if (desc->ld_default_stripe_size < PTLRPC_MAX_BRW_SIZE) { + CWARN("Increasing default_stripe_size "LPU64" to %u\n", + desc->ld_default_stripe_size, PTLRPC_MAX_BRW_SIZE); + CWARN("Please update config and run --write-conf on MDS\n"); + + desc->ld_default_stripe_size = PTLRPC_MAX_BRW_SIZE; + } + /* Because of 64-bit divide/mod operations only 
work with a 32-bit * divisor in a 32-bit kernel, we cannot support a stripe width * of 4GB or larger on 32-bit CPUs. @@ -1460,13 +1468,13 @@ static int lov_brw_check(struct lov_obd *lov, struct obdo *oa, /* The caller just wants to know if there's a chance that this * I/O can succeed */ for (i = 0; i < oa_bufs; i++) { - int stripe = lov_stripe_number(lsm, pga[i].off); + int stripe = lov_stripe_number(lsm, pga[i].disk_offset); int ost = lsm->lsm_oinfo[stripe].loi_ost_idx; obd_off start, end; - if (!lov_stripe_intersects(lsm, i, pga[i].off, - pga[i].off + pga[i].count, &start, - &end)) + if (!lov_stripe_intersects(lsm, i, pga[i].disk_offset, + pga[i].disk_offset + pga[i].count, + &start, &end)) continue; if (lov->tgts[ost].active == 0) { @@ -1532,7 +1540,7 @@ static int lov_brw(int cmd, struct obd_export *exp, struct obdo *src_oa, } for (i = 0; i < oa_bufs; i++) { - where[i] = lov_stripe_number(lsm, pga[i].off); + where[i] = lov_stripe_number(lsm, pga[i].disk_offset); stripeinfo[where[i]].bufct++; } @@ -1551,7 +1559,8 @@ static int lov_brw(int cmd, struct obd_export *exp, struct obdo *src_oa, shift = stripeinfo[which].index + stripeinfo[which].subcount; LASSERT(shift < oa_bufs); ioarr[shift] = pga[i]; - lov_stripe_offset(lsm, pga[i].off, which, &ioarr[shift].off); + lov_stripe_offset(lsm, pga[i].disk_offset, which, + &ioarr[shift].disk_offset); stripeinfo[which].subcount++; } @@ -1684,7 +1693,7 @@ static int lov_brw_async(int cmd, struct obd_export *exp, struct obdo *oa, GOTO(out_obdos, rc = -ENOMEM); for (i = 0; i < oa_bufs; i++) { - where[i] = lov_stripe_number(lsm, pga[i].off); + where[i] = lov_stripe_number(lsm, pga[i].disk_offset); stripeinfo[where[i]].bufct++; } @@ -1708,7 +1717,8 @@ static int lov_brw_async(int cmd, struct obd_export *exp, struct obdo *oa, shift = stripeinfo[which].index + stripeinfo[which].subcount; LASSERT(shift < oa_bufs); ioarr[shift] = pga[i]; - lov_stripe_offset(lsm, pga[i].off, which, &ioarr[shift].off); + lov_stripe_offset(lsm, 
pga[i].disk_offset, which, + &ioarr[shift].disk_offset); stripeinfo[which].subcount++; } diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 1870988afc4062c3d4c6afa877cb3160c42b4202..9c02ecd5a8ba87a145054e1cf1b54a5626d487ad 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -50,6 +50,10 @@ #include <linux/iobuf.h> #endif +#ifdef EXT3_MULTIBLOCK_ALLOCATOR +#include <linux/ext3_extents.h> +#endif + static kmem_cache_t *fcb_cache; static atomic_t fcb_cache_count = ATOMIC_INIT(0); @@ -661,12 +665,231 @@ static int fsfilt_ext3_sync(struct super_block *sb) return ext3_force_commit(sb); } +#ifdef EXT3_MULTIBLOCK_ALLOCATOR +struct bpointers { + unsigned long *blocks; + int *created; + unsigned long start; + int num; + int init_num; +}; + +static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, + struct ext3_ext_path *path, + struct ext3_extent *newex, int exist) +{ + struct inode *inode = tree->inode; + struct bpointers *bp = tree->private; + int count, err, goal; + unsigned long pblock; + unsigned long tgen; + loff_t new_i_size; + handle_t *handle; + int i; + + i = EXT_DEPTH(tree); + EXT_ASSERT(i == path->p_depth); + EXT_ASSERT(path[i].p_hdr); + + if (exist) { + err = EXT_CONTINUE; + goto map; + } + + tgen = EXT_GENERATION(tree); + count = ext3_ext_calc_credits_for_insert(tree, path); + up_write(&EXT3_I(inode)->truncate_sem); + + handle = ext3_journal_start(inode, count + EXT3_ALLOC_NEEDED + 1); + if (IS_ERR(handle)) { + down_write(&EXT3_I(inode)->truncate_sem); + return PTR_ERR(handle); + } + + if (tgen != EXT_GENERATION(tree)) { + /* the tree has changed. 
so path can be invalid at moment */ + ext3_journal_stop(handle, inode); + down_write(&EXT3_I(inode)->truncate_sem); + return EXT_REPEAT; + } + + down_write(&EXT3_I(inode)->truncate_sem); + goal = ext3_ext_find_goal(inode, path, newex->e_block); + count = newex->e_num; + pblock = ext3_new_blocks(handle, inode, &count, goal, &err); + if (!pblock) + goto out; + EXT_ASSERT(count <= newex->e_num); + + /* insert new extent */ + newex->e_start = pblock; + newex->e_num = count; + err = ext3_ext_insert_extent(handle, tree, path, newex); + if (err) + goto out; + + /* correct on-disk inode size */ + if (newex->e_num > 0) { + new_i_size = (loff_t) newex->e_block + newex->e_num; + new_i_size = new_i_size << inode->i_blkbits; + if (new_i_size > EXT3_I(inode)->i_disksize) { + EXT3_I(inode)->i_disksize = new_i_size; + err = ext3_mark_inode_dirty(handle, inode); + } + } + +out: + ext3_journal_stop(handle, inode); +map: + if (err >= 0) { + /* map blocks */ + if (bp->num == 0) { + CERROR("hmm. why do we find this extent?\n"); + CERROR("initial space: %lu:%u\n", + bp->start, bp->init_num); + CERROR("current extent: %u/%u/%u %d\n", + newex->e_block, newex->e_num, + newex->e_start, exist); + } + i = 0; + if (newex->e_block < bp->start) + i = bp->start - newex->e_block; + if (i >= newex->e_num) + CERROR("nothing to do?! i = %d, e_num = %u\n", + i, newex->e_num); + for (; i < newex->e_num && bp->num; i++) { + *(bp->created) = (exist == 0 ? 
1 : 0); + bp->created++; + *(bp->blocks) = newex->e_start + i; + bp->blocks++; + bp->num--; + } + } + return err; +} + +int fsfilt_map_nblocks(struct inode *inode, unsigned long block, + unsigned long num, unsigned long *blocks, + int *created, int create) +{ + struct ext3_extents_tree tree; + struct bpointers bp; + int err; + + CDEBUG(D_OTHER, "blocks %lu-%lu requested for inode %u\n", + block, block + num, (unsigned) inode->i_ino); + + ext3_init_tree_desc(&tree, inode); + tree.private = &bp; + bp.blocks = blocks; + bp.created = created; + bp.start = block; + bp.init_num = bp.num = num; + + down_write(&EXT3_I(inode)->truncate_sem); + err = ext3_ext_walk_space(&tree, block, num, ext3_ext_new_extent_cb); + ext3_ext_invalidate_cache(&tree); + up_write(&EXT3_I(inode)->truncate_sem); + + return err; +} + +int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int *created, int create) +{ + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + int rc = 0, i = 0; + struct page *fp = NULL; + int clen = 0; + + CDEBUG(D_OTHER, "inode %lu: map %d pages from %lu\n", + inode->i_ino, pages, (*page)->index); + + /* pages are sorted already. so, we just have to find + * contig. 
space and process them properly */ + while (i < pages) { + if (fp == NULL) { + /* start new extent */ + fp = *page++; + clen = 1; + i++; + continue; + } else if (fp->index + clen == (*page)->index) { + /* continue the extent */ + page++; + clen++; + i++; + continue; + } + + /* process found extent */ + rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page, + clen * blocks_per_page, blocks, + created, create); + if (rc) + GOTO(cleanup, rc); + + /* look for next extent */ + fp = NULL; + blocks += blocks_per_page * clen; + created += blocks_per_page * clen; + } + + if (fp) + rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page, + clen * blocks_per_page, blocks, + created, create); +cleanup: + return rc; +} +#endif + extern int ext3_map_inode_page(struct inode *inode, struct page *page, unsigned long *blocks, int *created, int create); -int fsfilt_ext3_map_inode_page(struct inode *inode, struct page *page, - unsigned long *blocks, int *created, int create) +int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int *created, int create) +{ + int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; + unsigned long *b; + int rc = 0, i, *cr; + + for (i = 0, cr = created, b = blocks; i < pages; i++, page++) { + rc = ext3_map_inode_page(inode, *page, b, cr, create); + if (rc) { + CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n", + inode->i_ino, *b, *cr, create, rc); + break; + } + + b += blocks_per_page; + cr += blocks_per_page; + } + return rc; +} + +int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int *created, int create, + struct semaphore *optional_sem) { - return ext3_map_inode_page(inode, page, blocks, created, create); + int rc; +#ifdef EXT3_MULTIBLOCK_ALLOCATOR + if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) { + rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages, + blocks, created, create); + return rc; + } +#endif + if 
(optional_sem != NULL) + down(optional_sem); + rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks, + created, create); + if (optional_sem != NULL) + up(optional_sem); + + return rc; } extern int ext3_prep_san_write(struct inode *inode, long *blocks, @@ -910,7 +1133,7 @@ static struct fsfilt_operations fsfilt_ext3_ops = { .fs_add_journal_cb = fsfilt_ext3_add_journal_cb, .fs_statfs = fsfilt_ext3_statfs, .fs_sync = fsfilt_ext3_sync, - .fs_map_inode_page = fsfilt_ext3_map_inode_page, + .fs_map_inode_pages = fsfilt_ext3_map_inode_pages, .fs_prep_san_write = fsfilt_ext3_prep_san_write, .fs_write_record = fsfilt_ext3_write_record, .fs_read_record = fsfilt_ext3_read_record, diff --git a/lustre/lvfs/fsfilt_smfs.c b/lustre/lvfs/fsfilt_smfs.c index 75bb6961ef4acf56d3044930de42961c9d832c5f..151e624d4c2c41a6110f28788138bc9b6ae26526 100644 --- a/lustre/lvfs/fsfilt_smfs.c +++ b/lustre/lvfs/fsfilt_smfs.c @@ -434,8 +434,10 @@ static int fsfilt_smfs_sync(struct super_block *sb) RETURN(rc); } -int fsfilt_smfs_map_inode_page(struct inode *inode, struct page *page, - unsigned long *blocks, int *created, int create) +int fsfilt_smfs_map_inode_pages(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int *created, int create, + struct semaphore *sem) { struct fsfilt_operations *cache_fsfilt = I2FOPS(inode); struct inode *cache_inode = NULL; @@ -449,12 +451,12 @@ int fsfilt_smfs_map_inode_page(struct inode *inode, struct page *page, if (!cache_inode) RETURN(rc); - if (!cache_fsfilt->fs_map_inode_page) + if (!cache_fsfilt->fs_map_inode_pages) RETURN(-ENOSYS); down(&cache_inode->i_sem); - rc = cache_fsfilt->fs_map_inode_page(cache_inode, page, - blocks, created, create); + rc = cache_fsfilt->fs_map_inode_pages(cache_inode, page, pages, blocks, + created, create, NULL); up(&cache_inode->i_sem); RETURN(rc); @@ -664,7 +666,7 @@ static struct fsfilt_operations fsfilt_smfs_ops = { .fs_add_journal_cb = fsfilt_smfs_add_journal_cb, .fs_statfs = 
fsfilt_smfs_statfs, .fs_sync = fsfilt_smfs_sync, - .fs_map_inode_page = fsfilt_smfs_map_inode_page, + .fs_map_inode_pages = fsfilt_smfs_map_inode_pages, .fs_prep_san_write = fsfilt_smfs_prep_san_write, .fs_write_record = fsfilt_smfs_write_record, .fs_read_record = fsfilt_smfs_read_record, diff --git a/lustre/lvfs/llog_lvfs.c b/lustre/lvfs/llog_lvfs.c index 4b4d90e7fffc6e6321956c14d5a32c9d06f7a197..80e60e5085d41304ea39b1dc737a3efb281e084d 100644 --- a/lustre/lvfs/llog_lvfs.c +++ b/lustre/lvfs/llog_lvfs.c @@ -58,7 +58,7 @@ static int llog_lvfs_pad(struct llog_ctxt *ctxt, struct l_file *file, tail.lrt_index = rec.lrh_index = cpu_to_le32(index); rec.lrh_type = 0; - rc = llog_fsfilt_write_record(ctxt, file, &rec, sizeof(rec), + rc = llog_fsfilt_write_record(ctxt, file, &rec, sizeof(rec), &file->f_pos, 0); if (rc) { CERROR("error writing padding record: rc %d\n", rc); @@ -100,14 +100,14 @@ static int llog_lvfs_write_blob(struct llog_ctxt *ctxt, struct l_file *file, /* the buf case */ rec->lrh_len = cpu_to_le32(sizeof(*rec) + buflen + sizeof(end)); - rc = llog_fsfilt_write_record(ctxt, file, rec, sizeof(*rec), + rc = llog_fsfilt_write_record(ctxt, file, rec, sizeof(*rec), &file->f_pos, 0); if (rc) { CERROR("error writing log hdr: rc %d\n", rc); goto out; } - rc = llog_fsfilt_write_record(ctxt, file, buf, buflen, + rc = llog_fsfilt_write_record(ctxt, file, buf, buflen, &file->f_pos, 0); if (rc) { CERROR("error writing log buffer: rc %d\n", rc); @@ -116,7 +116,7 @@ static int llog_lvfs_write_blob(struct llog_ctxt *ctxt, struct l_file *file, end.lrt_len = rec->lrh_len; end.lrt_index = rec->lrh_index; - rc = llog_fsfilt_write_record(ctxt, file, &end, sizeof(end), + rc = llog_fsfilt_write_record(ctxt, file, &end, sizeof(end), &file->f_pos, 0); if (rc) { CERROR("error writing log tail: rc %d\n", rc); @@ -175,7 +175,7 @@ static int llog_lvfs_read_header(struct llog_handle *handle) /* appends if idx == -1, otherwise overwrites record idx. 
*/ static int llog_lvfs_write_rec(struct llog_handle *loghandle, struct llog_rec_hdr *rec, - struct llog_cookie *reccookie, + struct llog_cookie *reccookie, int cookiecount, void *buf, int idx) { @@ -387,7 +387,7 @@ static int llog_lvfs_next_block(struct llog_handle *loghandle, int *curr_idx, RETURN(-EIO); } -static int llog_lvfs_prev_block(struct llog_handle *loghandle, +static int llog_lvfs_prev_block(struct llog_handle *loghandle, int prev_idx, void *buf, int len) { struct llog_ctxt *ctxt = loghandle->lgh_ctxt; @@ -527,14 +527,14 @@ static struct file *llog_object_create(struct llog_ctxt *ctxt) handle = llog_fsfilt_start(ctxt, parent->d_inode, FSFILT_OP_RENAME, NULL); if (IS_ERR(handle)) GOTO(out_dput, rc = PTR_ERR(handle)); - + lock_kernel(); rc = vfs_rename(parent->d_inode, filp->f_dentry, parent->d_inode, new_child); unlock_kernel(); if (rc) CERROR("error renaming new object %lu:%u: rc %d\n", - filp->f_dentry->d_inode->i_ino, + filp->f_dentry->d_inode->i_ino, filp->f_dentry->d_inode->i_generation, rc); err = llog_fsfilt_commit(ctxt, parent->d_inode, handle, 0); @@ -546,10 +546,10 @@ out_close: up(&parent->d_inode->i_sem); if (rc) { filp_close(filp, 0); - filp = (struct file *)rc; + filp = (struct file *)rc; } - RETURN(filp); + RETURN(filp); } static int llog_add_link_object(struct llog_ctxt *ctxt, struct llog_logid logid, @@ -575,11 +575,11 @@ static int llog_add_link_object(struct llog_ctxt *ctxt, struct llog_logid logid, logid.lgl_oid, logid.lgl_ogen); LBUG(); } - handle = llog_fsfilt_start(ctxt, ctxt->loc_objects_dir->d_inode, + handle = llog_fsfilt_start(ctxt, ctxt->loc_objects_dir->d_inode, FSFILT_OP_LINK, NULL); if (IS_ERR(handle)) GOTO(out_dput, rc = PTR_ERR(handle)); - + lock_kernel(); rc = vfs_link(dentry, ctxt->loc_objects_dir->d_inode, new_child); unlock_kernel(); @@ -684,7 +684,7 @@ static int llog_lvfs_destroy(struct llog_handle *loghandle) void *handle; int rc = -EINVAL, err, namelen; ENTRY; - + if (ctxt->loc_lvfs_ctxt) push_ctxt(&saved, 
ctxt->loc_lvfs_ctxt, NULL); @@ -735,7 +735,7 @@ out_err: GOTO(out, rc); } - + if (!strcmp(fdentry->d_parent->d_name.name, "OBJECTS")) { LASSERT(parent_inode == ctxt->loc_objects_dir->d_inode); @@ -756,7 +756,7 @@ out: /* reads the catalog list */ int llog_get_cat_list(struct lvfs_run_ctxt *ctxt, - struct fsfilt_operations *fsops, char *name, + struct fsfilt_operations *fsops, char *name, int count, struct llog_catid *idarray) { struct lvfs_run_ctxt saved; @@ -800,8 +800,8 @@ int llog_get_cat_list(struct lvfs_run_ctxt *ctxt, EXPORT_SYMBOL(llog_get_cat_list); /* writes the cat list */ -int llog_put_cat_list(struct lvfs_run_ctxt *ctxt, - struct fsfilt_operations *fsops, char *name, +int llog_put_cat_list(struct lvfs_run_ctxt *ctxt, + struct fsfilt_operations *fsops, char *name, int count, struct llog_catid *idarray) { struct lvfs_run_ctxt saved; @@ -892,29 +892,29 @@ static int llog_lvfs_destroy(struct llog_handle *handle) } int llog_get_cat_list(struct lvfs_run_ctxt *ctxt, - struct fsfilt_operations *fsops, char *name, + struct fsfilt_operations *fsops, char *name, int count, struct llog_catid *idarray) { LBUG(); return 0; } -int llog_put_cat_list(struct lvfs_run_ctxt *ctxt, - struct fsfilt_operations *fsops, char *name, +int llog_put_cat_list(struct lvfs_run_ctxt *ctxt, + struct fsfilt_operations *fsops, char *name, int count, struct llog_catid *idarray) { LBUG(); return 0; } -int llog_lvfs_prev_block(struct llog_handle *loghandle, +int llog_lvfs_prev_block(struct llog_handle *loghandle, int prev_idx, void *buf, int len) { LBUG(); return 0; } -int llog_lvfs_next_block(struct llog_handle *h, int *curr_idx, +int llog_lvfs_next_block(struct llog_handle *loghandle, int *curr_idx, int next_idx, __u64 *offset, void *buf, int len) { LBUG(); diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h index 2b459b4f0b4da9f7bb7828bdbfc22e76b6a486b2..568df2ca18db7dcc79eea40a9fae3ea45003e394 100644 --- a/lustre/mdc/mdc_internal.h +++ b/lustre/mdc/mdc_internal.h @@ -21,6 
+21,8 @@ void mdc_link_pack(struct ptlrpc_request *req, int offset, void mdc_rename_pack(struct ptlrpc_request *req, int offset, struct mdc_op_data *data, const char *old, int oldlen, const char *new, int newlen); +void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa, + int valid, struct obd_client_handle *och); struct mdc_open_data { struct obd_client_handle *mod_och; diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c index 794bcf95143c7aaf873f0588aefb29a6fe6f82d7..e2dc251fb9e502cf7bae789c692cacd13b9a74ce 100644 --- a/lustre/mdc/mdc_lib.c +++ b/lustre/mdc/mdc_lib.c @@ -292,3 +292,37 @@ void mdc_getattr_pack(struct ptlrpc_request *req, int valid, int offset, } } +void mdc_close_pack(struct ptlrpc_request *req, int offset, struct obdo *oa, + int valid, struct obd_client_handle *och) +{ + struct mds_body *body; + + body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body)); + + mdc_pack_fid(&body->fid1, oa->o_id, 0, oa->o_mode); + memcpy(&body->handle, &och->och_fh, sizeof(body->handle)); + if (oa->o_valid & OBD_MD_FLATIME) { + body->atime = oa->o_atime; + body->valid |= OBD_MD_FLATIME; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + body->mtime = oa->o_mtime; + body->valid |= OBD_MD_FLMTIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + body->ctime = oa->o_ctime; + body->valid |= OBD_MD_FLCTIME; + } + if (oa->o_valid & OBD_MD_FLSIZE) { + body->size = oa->o_size; + body->valid |= OBD_MD_FLSIZE; + } + if (oa->o_valid & OBD_MD_FLBLOCKS) { + body->blocks = oa->o_blocks; + body->valid |= OBD_MD_FLBLOCKS; + } + if (oa->o_valid & OBD_MD_FLFLAGS) { + body->flags = oa->o_flags; + body->valid |= OBD_MD_FLFLAGS; + } +} diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index fbc448de7a0b8989f104909d07f0fc74d1b3d42d..342dabd06c1367dcad44bc4972c277be23e01875 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -405,18 +405,21 @@ static void mdc_commit_close(struct ptlrpc_request *req) static int mdc_close_interpret(struct 
ptlrpc_request *req, void *data, int rc) { union ptlrpc_async_args *aa = data; - struct mdc_rpc_lock *rpc_lock = aa->pointer_arg[0]; + struct mdc_rpc_lock *rpc_lock; struct obd_device *obd = aa->pointer_arg[1]; + unsigned long flags; + + spin_lock_irqsave(&req->rq_lock, flags); + rpc_lock = aa->pointer_arg[0]; + aa->pointer_arg[0] = NULL; + spin_unlock_irqrestore (&req->rq_lock, flags); if (rpc_lock == NULL) { CERROR("called with NULL rpc_lock\n"); } else { mdc_put_rpc_lock(rpc_lock, NULL); - LASSERTF(req->rq_async_args.pointer_arg[0] == - obd->u.cli.cl_rpc_lock, "%p != %p\n", - req->rq_async_args.pointer_arg[0], - obd->u.cli.cl_rpc_lock); - aa->pointer_arg[0] = NULL; + LASSERTF(rpc_lock == obd->u.cli.cl_rpc_lock, "%p != %p\n", + rpc_lock, obd->u.cli.cl_rpc_lock); } wake_up(&req->rq_reply_waitq); RETURN(rc); @@ -430,9 +433,8 @@ static int mdc_close_check_reply(struct ptlrpc_request *req) unsigned long flags; spin_lock_irqsave(&req->rq_lock, flags); - if (PTLRPC_REQUEST_COMPLETE(req)) { + if (req->rq_async_args.pointer_arg[0] == NULL) rc = 1; - } spin_unlock_irqrestore (&req->rq_lock, flags); return rc; } @@ -442,13 +444,12 @@ static int go_back_to_sleep(void *unused) return 0; } -int mdc_close(struct obd_export *exp, struct obdo *obdo, +int mdc_close(struct obd_export *exp, struct obdo *oa, struct obd_client_handle *och, struct ptlrpc_request **request) { - struct mds_body *body; struct obd_device *obd = class_exp2obd(exp); - int reqsize = sizeof(*body); - int rc, repsize[3] = {sizeof(*body), + int reqsize = sizeof(struct mds_body); + int rc, repsize[3] = {sizeof(struct mds_body), obd->u.cli.cl_max_mds_easize, obd->u.cli.cl_max_mds_cookiesize}; struct ptlrpc_request *req; @@ -473,13 +474,7 @@ int mdc_close(struct obd_export *exp, struct obdo *obdo, CDEBUG(D_HA, "couldn't find open req; expecting close error\n"); } - body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*body)); - mdc_pack_fid(&body->fid1, obdo->o_id, 0, obdo->o_mode); - memcpy(&body->handle, 
&och->och_fh, sizeof(body->handle)); - body->size = obdo->o_size; - body->blocks = obdo->o_blocks; - body->flags = obdo->o_flags; - body->valid = obdo->o_valid; + mdc_close_pack(req, 0, oa, oa->o_valid, och); req->rq_replen = lustre_msg_size(3, repsize); req->rq_commit_cb = mdc_commit_close; @@ -501,7 +496,8 @@ int mdc_close(struct obd_export *exp, struct obdo *obdo, if (req->rq_repmsg == NULL) { CDEBUG(D_HA, "request failed to send: %p, %d\n", req, req->rq_status); - rc = req->rq_status ? req->rq_status : -EIO; + if (rc == 0) + rc = req->rq_status ? req->rq_status : -EIO; } else if (rc == 0) { rc = req->rq_repmsg->status; if (req->rq_repmsg->type == PTL_RPC_MSG_ERR) { diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index b7e7aa64559b711f2fad45cdfd9daae199ec2ae0..262f4d8b6570f1f56b2f8b8f2a42295146225595 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -101,7 +101,8 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, file->f_dentry->d_inode->i_size); rc = fsfilt_readpage(req->rq_export->exp_obd, file, - page_address(pages[i]), tmpsize, &offset); + kmap(pages[i]), tmpsize, &offset); + kunmap(pages[i]); if (rc != tmpsize) GOTO(cleanup_buf, rc = -EIO); @@ -342,6 +343,8 @@ static int mds_destroy_export(struct obd_export *export) list_del(&mfd->mfd_list); spin_unlock(&med->med_open_lock); + /* If you change this message, be sure to update + * replay_single:test_46 */ CERROR("force closing client file handle for %*s (%s:%lu)\n", dentry->d_name.len, dentry->d_name.name, ll_bdevname(dentry->d_inode->i_sb, btmp), @@ -1087,6 +1090,21 @@ int mds_handle(struct ptlrpc_request *req) obd = req->rq_export->exp_obd; mds = &obd->u.mds; + /* sanity check: if the xid matches, the request must + * be marked as a resent or replayed */ + if (req->rq_xid == med->med_mcd->mcd_last_xid) + LASSERTF(lustre_msg_get_flags(req->rq_reqmsg) & + (MSG_RESENT | MSG_REPLAY), + "rq_xid "LPU64" matches last_xid, " + "expected RESENT flag\n", + req->rq_xid); + 
/* else: note the opposite is not always true; a + * RESENT req after a failover will usually not match + * the last_xid, since it was likely never + * committed. A REPLAYed request will almost never + * match the last xid, however it could for a + * committed, but still retained, open. */ + /* Check for aborted recovery. */ spin_lock_bh(&obd->obd_processing_task_lock); abort_recovery = obd->obd_abort_recovery; @@ -1671,6 +1689,12 @@ static void fixup_handle_for_resent_req(struct ptlrpc_request *req, } l_unlock(&obd->obd_namespace->ns_lock); + /* If the xid matches, then we know this is a resent request, + * and allow it. (It's probably an OPEN, for which we don't + * send a lock */ + if (req->rq_xid == exp->exp_mds_data.med_mcd->mcd_last_xid) + return; + /* This remote handle isn't enqueued, so we never received or * processed this request. Clear MSG_RESENT, because it can * be handled like any normal request now. */ diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index b4a70b33548eb928c2d7793283cf4d044a4e9ec0..e5ce7e24538515261520c55e9738c0eadd2d7498 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -7,6 +7,8 @@ #include <linux/lustre_mds.h> +#define MAX_ATIME_DIFF 60 + struct mds_filter_data { __u64 io_epoch; }; diff --git a/lustre/mds/mds_open.c b/lustre/mds/mds_open.c index ce7736c68bf6438a2630bad08ba9c562e7409d48..ee7a50ba9f906781fcac7172376cb3ef110d2dc6 100644 --- a/lustre/mds/mds_open.c +++ b/lustre/mds/mds_open.c @@ -333,14 +333,6 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, RETURN(-ENOMEM); oti.oti_objid = *ids; - if (*handle == NULL) - *handle = fsfilt_start(obd, inode, FSFILT_OP_CREATE, NULL); - if (IS_ERR(*handle)) { - rc = PTR_ERR(*handle); - *handle = NULL; - GOTO(out_ids, rc); - } - /* replay case */ if(lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { LASSERT (rec->ur_fid2->id); @@ -349,6 +341,14 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, lmm = 
rec->ur_eadata; LASSERT(lmm); + if (*handle == NULL) + *handle = fsfilt_start(obd,inode,FSFILT_OP_CREATE,NULL); + if (IS_ERR(*handle)) { + rc = PTR_ERR(*handle); + *handle = NULL; + GOTO(out_ids, rc); + } + mds_objids_from_lmm(*ids, lmm, &mds->mds_lov_desc); lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, 0); @@ -398,7 +398,7 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, lmm, &lmm_size, 1); if (rc > 0) rc = obd_iocontrol(OBD_IOC_LOV_SETSTRIPE, - mds->mds_osc_exp, + mds->mds_osc_exp, 0, &lsm, lmm); OBD_FREE(lmm, mds->mds_max_mdsize); if (rc) @@ -454,6 +454,15 @@ static int mds_create_objects(struct ptlrpc_request *req, int offset, LASSERT(rc >= 0); lmm_size = rc; body->eadatasize = rc; + + if (*handle == NULL) + *handle = fsfilt_start(obd, inode, FSFILT_OP_CREATE, NULL); + if (IS_ERR(*handle)) { + rc = PTR_ERR(*handle); + *handle = NULL; + GOTO(out_ids, rc); + } + rc = fsfilt_set_md(obd, inode, *handle, lmm, lmm_size); lmm_buf = lustre_msg_buf(req->rq_repmsg, offset, 0); lmm_bufsize = req->rq_repmsg->buflens[offset]; @@ -588,6 +597,10 @@ static void reconstruct_open(struct mds_update_record *rec, int offset, GOTO(out_dput, req->rq_status = -ENOMEM); } put_child = 0; + } else { + body->handle.cookie = mfd->mfd_handle.h_cookie; + CDEBUG(D_INODE, "resend mfd %p, cookie "LPX64"\n", mfd, + mfd->mfd_handle.h_cookie); } out_dput: @@ -933,7 +946,10 @@ int mds_open(struct mds_update_record *rec, int offset, if (rc) CERROR("error on parent setattr: rc = %d\n", rc); - acc_mode = 0; /* Don't check for permissions */ + rc = mds_finish_transno(mds, dchild->d_inode, handle, req, 0, + rep ? 
rep->lock_policy_res1 : 0); + handle = NULL; + acc_mode = 0; /* Don't check for permissions */ } LASSERT(!mds_inode_is_orphan(dchild->d_inode)); @@ -1049,14 +1065,15 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, void *handle = NULL; struct mds_body *request_body = NULL, *reply_body = NULL; struct dentry_params dp; + struct iattr iattr = { 0 }; ENTRY; - if (req != NULL) { + if (req && req->rq_reqmsg != NULL) request_body = lustre_msg_buf(req->rq_reqmsg, 0, sizeof(*request_body)); + if (req && req->rq_repmsg != NULL) reply_body = lustre_msg_buf(req->rq_repmsg, 0, sizeof(*reply_body)); - } fidlen = ll_fid2str(fidname, inode->i_ino, inode->i_generation); @@ -1091,7 +1108,7 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, LASSERT(pending_child->d_inode != NULL); cleanup_phase = 2; /* dput(pending_child) when finished */ - if (req != NULL) { + if (req != NULL && req->rq_repmsg != NULL) { lmm = lustre_msg_buf(req->rq_repmsg, 1, 0); stripe_count = le32_to_cpu(lmm->lmm_stripe_count); } @@ -1104,7 +1121,8 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, GOTO(cleanup, rc); } - if (req != NULL && (reply_body->valid & OBD_MD_FLEASIZE) && + if (req != NULL && req->rq_repmsg != NULL && + (reply_body->valid & OBD_MD_FLEASIZE) && mds_log_op_unlink(obd, pending_child->d_inode, lmm, req->rq_repmsg->buflens[1], lustre_msg_buf(req->rq_repmsg, 2, 0), @@ -1121,32 +1139,64 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, rc = vfs_unlink(pending_dir, pending_child); if (rc) CERROR("error unlinking orphan %s: rc %d\n",fidname,rc); - } else if (mfd->mfd_mode & FMODE_WRITE && rc == 0) { + + goto out; /* Don't bother updating attrs on unlinked inode */ + } + + if (request_body != NULL && mfd->mfd_mode & FMODE_WRITE && rc == 0) { /* Update the on-disk attributes if this was the last write * close, and all information was provided (i.e., rc == 0) * * XXX this should probably be abstracted with 
mds_reint_setattr */ + #if 0 - struct iattr iattr; + if (request_body->valid & OBD_MD_FLMTIME && + request_body->mtime > LTIME_S(inode->i_mtime)) { + LTIME_S(iattr.ia_mtime) = request_body->mtime; + iattr.ia_valid |= ATTR_MTIME; + } + if (request_body->valid & OBD_MD_FLCTIME && + request_body->ctime > LTIME_S(inode->i_ctime)) { + LTIME_S(iattr.ia_ctime) = request_body->ctime; + iattr.ia_valid |= ATTR_CTIME; + } /* XXX can't set block count with fsfilt_setattr (!) */ - iattr.ia_valid = ATTR_CTIME | ATTR_ATIME | - ATTR_MTIME | ATTR_SIZE; - iattr.ia_atime = request_body->atime; - iattr.ia_ctime = request_body->ctime; - iattr.ia_mtime = request_body->mtime; - iattr.ia_size = request_body->size; - /* iattr.ia_blocks = request_body->blocks */ + if (request_body->valid & OBD_MD_FLSIZE) { + iattr.ia_valid |= ATTR_SIZE; + iattr.ia_size = request_body->size; + } + /* if (request_body->valid & OBD_MD_FLBLOCKS) { + iattr.ia_valid |= ATTR_BLOCKS; + iattr.ia_blocks = request_body->blocks + } */ +#endif + } + if (request_body != NULL && request_body->valid & OBD_MD_FLATIME) { + /* Only start a transaction to write out only the atime if + * it is more out-of-date than the specified limit. If we + * are already going to write out the atime then do it anyway. 
+ * */ + if ((request_body->atime > + LTIME_S(inode->i_atime) + MAX_ATIME_DIFF) || + (iattr.ia_valid != 0 && + request_body->atime > LTIME_S(inode->i_atime))) { + LTIME_S(iattr.ia_atime) = request_body->atime; + iattr.ia_valid |= ATTR_ATIME; + } + } + + if (iattr.ia_valid != 0) { handle = fsfilt_start(obd, inode, FSFILT_OP_SETATTR, NULL); if (IS_ERR(handle)) GOTO(cleanup, rc = PTR_ERR(handle)); rc = fsfilt_setattr(obd, mfd->mfd_dentry, handle, &iattr, 0); if (rc) CERROR("error in setattr(%s): rc %d\n", fidname, rc); -#endif } +out: /* If other clients have this file open for write, rc will be > 0 */ if (rc > 0) rc = 0; @@ -1155,7 +1205,7 @@ int mds_mfd_close(struct ptlrpc_request *req, struct obd_device *obd, cleanup: atomic_dec(&mds->mds_open_count); - if (req) { + if (req != NULL && reply_body != NULL) { rc = mds_finish_transno(mds, pending_dir, handle, req, rc, 0); } else if (handle) { int err = fsfilt_commit(obd, pending_dir, handle, 0); @@ -1228,17 +1278,16 @@ int mds_close(struct ptlrpc_request *req) spin_unlock(&med->med_open_lock); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - req->rq_status = mds_mfd_close(rc ? 
NULL : req, obd, mfd, 1); + req->rq_status = mds_mfd_close(req, obd, mfd, 1); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + mds_mfd_put(mfd); if (OBD_FAIL_CHECK(OBD_FAIL_MDS_CLOSE_PACK)) { CERROR("test case OBD_FAIL_MDS_CLOSE_PACK\n"); req->rq_status = -ENOMEM; - mds_mfd_put(mfd); RETURN(-ENOMEM); } - mds_mfd_put(mfd); RETURN(0); } diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 09bf499baa2b549e37a9f85922493ffdb5103939..c6404c484affdc197952631fcdd88777e2ee2d4c 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -65,6 +65,7 @@ #include <linux/lprocfs_status.h> #ifdef __KERNEL__ #include <linux/lustre_build_version.h> +#include <linux/lustre_version.h> #endif #include <portals/list.h> @@ -447,6 +448,13 @@ int obd_proc_read_version(char *page, char **start, off_t off, int count, return snprintf(page, count, "%s\n", BUILD_VERSION); } +int obd_proc_read_kernel_version(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + *eof = 1; + return snprintf(page, count, "%u\n", LUSTRE_KERNEL_VERSION); +} + int obd_proc_read_pinger(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -464,6 +472,7 @@ int obd_proc_read_pinger(char *page, char **start, off_t off, int count, struct proc_dir_entry *proc_lustre_root = NULL; struct lprocfs_vars lprocfs_base[] = { { "version", obd_proc_read_version, NULL, NULL }, + { "kernel_version", obd_proc_read_kernel_version, NULL, NULL }, { "pinger", obd_proc_read_pinger, NULL, NULL }, { 0 } }; @@ -645,8 +654,8 @@ static void cleanup_obdclass(void) /* Check that we're building against the appropriate version of the Lustre * kernel patch */ #include <linux/lustre_version.h> -#define LUSTRE_MIN_VERSION 28 -#define LUSTRE_MAX_VERSION 35 +#define LUSTRE_MIN_VERSION 32 +#define LUSTRE_MAX_VERSION 36 #if (LUSTRE_KERNEL_VERSION < LUSTRE_MIN_VERSION) # error Cannot continue: Your Lustre kernel patch is older than the sources #elif (LUSTRE_KERNEL_VERSION > 
LUSTRE_MAX_VERSION) diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index d5ef04ca37fe67dcb6ea2443c6ed2bccdcd8aff3..f302dc01f77ea9fc4caf7c424c5a7862d1d598ed 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -647,13 +647,13 @@ int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, san_preprw); LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin); LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin); LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); - + for (i = num_private_stats; i < num_stats; i++) { /* If this LBUGs, it is likely that an obd * operation was added to struct obd_ops in diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index 67935cbf0b47e0fe50b3441011614e2c401644d1..71822802cfe2f07ccc8ef59ff1733cea65624a75 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -535,7 +535,7 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, goto out; pgp->count = PAGE_SIZE; - pgp->off = off; + pgp->disk_offset = pgp->page_offset = off; pgp->flag = 0; if (verify) @@ -556,7 +556,8 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, if (verify) { int vrc; vrc = echo_client_page_debug_check(lsm, pgp->pg, oa->o_id, - pgp->off, pgp->count); + pgp->page_offset, + pgp->count); if (vrc != 0 && rc == 0) rc = vrc; } @@ -615,7 +616,7 @@ static int 
echo_client_ubrw(struct obd_device *obd, int rw, for (i = 0, off = offset, pgp = pga; i < npages; i++, off += PAGE_SIZE, pgp++) { - pgp->off = off; + pgp->disk_offset = pgp->page_offset = off; pgp->pg = kiobuf->maplist[i]; pgp->count = PAGE_SIZE; pgp->flag = 0; diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index cb09e696370c6ee7fb13b7d1b93a0504ee3cb9cb..66b4633346b05c4582eec0611b52811a8ab8eee0 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1209,6 +1209,10 @@ int filter_common_setup(struct obd_device *obd, obd_count len, if (rc) GOTO(err_mntput, rc); + + filter->fo_destroy_in_progress = 0; + sema_init(&filter->fo_create_lock, 1); + spin_lock_init(&filter->fo_translock); spin_lock_init(&filter->fo_objidlock); INIT_LIST_HEAD(&filter->fo_export_list); @@ -1560,7 +1564,8 @@ struct dentry *__filter_oa2dentry(struct obd_device *obd, } if (dchild->d_inode == NULL) { - CERROR("%s on non-existent object: "LPU64"\n", what, oa->o_id); + CERROR("%s: %s on non-existent object: "LPU64"\n", + obd->obd_name, what, oa->o_id); f_dput(dchild); RETURN(ERR_PTR(-ENOENT)); } @@ -1740,6 +1745,16 @@ static void filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, } doa.o_mode = S_IFREG; + filter->fo_destroy_in_progress = 1; + down(&filter->fo_create_lock); + if (!filter->fo_destroy_in_progress) { + CERROR("%s: destroy_in_progress already cleared\n", + exp->exp_obd->obd_name); + up(&filter->fo_create_lock); + EXIT; + return; + } + last = filter_last_id(filter, &doa); CWARN("%s: deleting orphan objects from "LPU64" to "LPU64"\n", exp->exp_obd->obd_name, oa->o_id + 1, last); @@ -1754,6 +1769,10 @@ static void filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, spin_lock(&filter->fo_objidlock); filter->fo_last_objids[doa.o_gr] = oa->o_id; spin_unlock(&filter->fo_objidlock); + + filter->fo_destroy_in_progress = 0; + up(&filter->fo_create_lock); + EXIT; } @@ -1810,12 +1829,10 @@ static int filter_should_precreate(struct 
obd_export *exp, struct obdo *oa, static int filter_precreate(struct obd_device *obd, struct obdo *oa, obd_gr group, int *num) { - struct dentry *dchild = NULL; + struct dentry *dchild = NULL, *dparent = NULL; struct filter_obd *filter; - struct dentry *dparent; - int err = 0, rc = 0, i; + int err = 0, rc = 0, recreate_obj = 0, i; __u64 next_id; - int recreate_obj = 0; void *handle = NULL; ENTRY; @@ -1826,11 +1843,19 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, recreate_obj = 1; } - CDEBUG(D_HA, "%s: precreating %d objects\n", obd->obd_name, *num); + CDEBUG(D_HA, "%s: precreating %d objects\n", obd->obd_name, *num); + + down(&filter->fo_create_lock); for (i = 0; i < *num && err == 0; i++) { int cleanup_phase = 0; + if (filter->fo_destroy_in_progress) { + CWARN("%s: precreate aborted by destroy\n", + obd->obd_name); + break; + } + if (recreate_obj) { __u64 last_id; next_id = oa->o_id; @@ -1839,7 +1864,7 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, CERROR("Error: Trying to recreate obj greater" "than last id "LPD64" > "LPD64"\n", next_id, last_id); - RETURN(-EINVAL); + GOTO(cleanup, rc = -EINVAL); } } else next_id = filter_last_id(filter, oa) + 1; @@ -1864,13 +1889,13 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, if (recreate_obj) { CERROR("%s: Serious error: recreating obj %*s " "but obj already exists \n", - obd->obd_name, dchild->d_name.len, + obd->obd_name, dchild->d_name.len, dchild->d_name.name); LBUG(); } else { CERROR("%s: Serious error: objid %*s already " "exists; is this filesystem corrupt?\n", - obd->obd_name, dchild->d_name.len, + obd->obd_name, dchild->d_name.len, dchild->d_name.name); LBUG(); } @@ -1919,10 +1944,12 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, } *num = i; + up(&filter->fo_create_lock); + CDEBUG(D_HA, "%s: server last_objid for group "LPU64": "LPU64"\n", obd->obd_name, group, filter->fo_last_objids[group]); - CDEBUG(D_HA, "%s: 
filter_precreate() created %d objects\n", + CDEBUG(D_HA, "%s: filter_precreate() created %d objects\n", obd->obd_name, i); RETURN(rc); } diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h index 6203418579402c6d3399d9d3b7879f5ea77f62a9..055232d04148d48dec098ff2e1b9a438bcc0af35 100644 --- a/lustre/obdfilter/filter_internal.h +++ b/lustre/obdfilter/filter_internal.h @@ -126,6 +126,10 @@ int filter_brw(int cmd, struct obd_export *, struct obdo *, struct lov_stripe_md *, obd_count oa_bufs, struct brw_page *, struct obd_trans_info *); void flip_into_page_cache(struct inode *inode, struct page *new_page); +void filter_release_read_page(struct filter_obd *filter, struct inode *inode, + struct page *page); +void filter_release_write_page(struct filter_obd *filter, struct inode *inode, + struct niobuf_local *lnb, int rc); /* filter_io_*.c */ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index 8cf3f596df3ab5da660115898ffcd58ccdde46bd..c7679b5b115d18d6d555589d8a845ea047f69820 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -52,11 +52,6 @@ static int filter_start_page_read(struct obd_device *obd, struct inode *inode, lnb->page = page; - if (inode->i_size < lnb->offset + lnb->len - 1) - lnb->rc = inode->i_size - lnb->offset; - else - lnb->rc = lnb->len; - return 0; } @@ -362,6 +357,11 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, GOTO(cleanup, rc); } + if (inode->i_size < lnb->offset + lnb->len - 1) + lnb->rc = inode->i_size - lnb->offset; + else + lnb->rc = lnb->len; + tot_bytes += lnb->rc; if (lnb->rc < lnb->len) { /* short read, be sure to wait on it */ @@ -523,25 +523,55 @@ static int filter_grant_check(struct obd_export *exp, int objcount, return rc; } -static int filter_start_page_write(struct inode *inode, +static int filter_start_page_write(struct obd_device *obd, 
struct inode *inode, struct niobuf_local *lnb) { - struct page *page = alloc_pages(GFP_HIGHUSER, 0); + struct page *page; + + if (lnb->len != PAGE_SIZE) + return filter_start_page_read(obd, inode, lnb); + + page = alloc_pages(GFP_HIGHUSER, 0); if (page == NULL) { CERROR("no memory for a temp page\n"); RETURN(lnb->rc = -ENOMEM); } +#if 0 POISON_PAGE(page, 0xf1); if (lnb->len != PAGE_SIZE) { memset(kmap(page) + lnb->len, 0, PAGE_SIZE - lnb->len); kunmap(page); } +#endif page->index = lnb->offset >> PAGE_SHIFT; lnb->page = page; return 0; } +static void filter_abort_page_write(struct niobuf_local *lnb) +{ + LASSERT(lnb->page != NULL); + + if (lnb->len != PAGE_SIZE) + page_cache_release(lnb->page); + else + __free_pages(lnb->page, 0); +} + +/* a helper for both the 2.4 and 2.6 commitrw paths which are both built + * up by our shared filter_preprw_write() */ +void filter_release_write_page(struct filter_obd *filter, struct inode *inode, + struct niobuf_local *lnb, int rc) +{ + if (lnb->len != PAGE_SIZE) + return filter_release_read_page(filter, inode, lnb->page); + + if (rc == 0) + flip_into_page_cache(inode, lnb->page); + __free_page(lnb->page); +} + /* If we ever start to support multi-object BRW RPCs, we will need to get locks * on mulitple inodes. 
That isn't all, because there still exists the * possibility of a truncate starting a new transaction while holding the ext3 @@ -623,13 +653,13 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, lnb->len = rnb->len; lnb->flags = rnb->flags; - rc = filter_start_page_write(dentry->d_inode, lnb); + rc = filter_start_page_write(exp->exp_obd, dentry->d_inode,lnb); if (rc) { CERROR("page err %u@"LPU64" %u/%u %p: rc %d\n", lnb->len, lnb->offset, i, obj->ioo_bufcnt, dentry, rc); while (lnb-- > res) - __free_pages(lnb->page, 0); + filter_abort_page_write(lnb); f_dput(dentry); GOTO(cleanup, rc); } @@ -637,6 +667,17 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, tot_bytes += lnb->len; } + while (lnb-- > res) { + if (lnb->len == PAGE_SIZE) + continue; + rc = filter_finish_page_read(lnb); + if (rc) { + CERROR("error page %u@"LPU64" %u %p: rc %d\n", lnb->len, + lnb->offset, (int)(lnb - res), lnb->dentry, rc); + GOTO(cleanup, rc); + } + } + if (time_after(jiffies, now + 15 * HZ)) CERROR("slow start_page_write %lus\n", (jiffies - now) / HZ); else @@ -676,6 +717,24 @@ int filter_preprw(int cmd, struct obd_export *exp, struct obdo *oa, return -EPROTO; } +void filter_release_read_page(struct filter_obd *filter, struct inode *inode, + struct page *page) +{ + int drop = 0; + + if (inode != NULL && + (inode->i_size > filter->fo_readcache_max_filesize)) + drop = 1; + + /* drop from cache like truncate_list_pages() */ + if (drop && !TryLockPage(page)) { + if (page->mapping) + ll_truncate_complete_page(page); + unlock_page(page); + } + page_cache_release(page); +} + static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, int niocount, struct niobuf_local *res, @@ -683,24 +742,19 @@ static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa, { struct obd_ioobj *o; struct niobuf_local *lnb; - int i, j, drop = 0; + int i, j; + struct inode *inode = NULL; 
ENTRY; if (res->dentry != NULL) - drop = (res->dentry->d_inode->i_size > - exp->exp_obd->u.filter.fo_readcache_max_filesize); + inode = res->dentry->d_inode; for (i = 0, o = obj, lnb = res; i < objcount; i++, o++) { for (j = 0 ; j < o->ioo_bufcnt ; j++, lnb++) { if (lnb->page == NULL) continue; - /* drop from cache like truncate_list_pages() */ - if (drop && !TryLockPage(lnb->page)) { - if (lnb->page->mapping) - ll_truncate_complete_page(lnb->page); - unlock_page(lnb->page); - } - page_cache_release(lnb->page); + filter_release_read_page(&exp->exp_obd->u.filter, + inode, lnb->page); } } @@ -811,7 +865,7 @@ int filter_brw(int cmd, struct obd_export *exp, struct obdo *oa, GOTO(out, ret = -ENOMEM); for (i = 0; i < oa_bufs; i++) { - rnb[i].offset = pga[i].off; + rnb[i].offset = pga[i].disk_offset; rnb[i].len = pga[i].count; } @@ -824,7 +878,7 @@ int filter_brw(int cmd, struct obd_export *exp, struct obdo *oa, for (i = 0; i < oa_bufs; i++) { void *virt = kmap(pga[i].pg); - obd_off off = pga[i].off & ~PAGE_MASK; + obd_off off = pga[i].disk_offset & ~PAGE_MASK; void *addr = kmap(lnb[i].page); /* 2 kmaps == vanishingly small deadlock opportunity */ diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index 1839f163bff8af1ef9de9030174aa395b425dda0..abc48adf73fb931487b2b94db63b250bc92b6b5a 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -84,10 +84,8 @@ static int filter_direct_io(int rw, struct dentry *dchild, struct kiobuf *iobuf, { struct obd_device *obd = exp->exp_obd; struct inode *inode = dchild->d_inode; - struct page *page; - unsigned long *b = iobuf->blocks; - int rc, i, create = (rw == OBD_BRW_WRITE), blocks_per_page; - int *cr, cleanup_phase = 0, *created = NULL; + int rc, create = (rw == OBD_BRW_WRITE), blocks_per_page; + int cleanup_phase = 0, *created = NULL; int committed = 0; ENTRY; @@ -105,22 +103,11 @@ static int filter_direct_io(int rw, struct dentry *dchild, struct kiobuf *iobuf, GOTO(cleanup, 
rc); cleanup_phase = 2; - down(&exp->exp_obd->u.filter.fo_alloc_lock); - for (i = 0, cr = created, b = iobuf->blocks; i < iobuf->nr_pages; i++){ - page = iobuf->maplist[i]; - - rc = fsfilt_map_inode_page(obd, inode, page, b, cr, create); - if (rc) { - CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n", - inode->i_ino, *b, *cr, create, rc); - up(&exp->exp_obd->u.filter.fo_alloc_lock); - GOTO(cleanup, rc); - } - - b += blocks_per_page; - cr += blocks_per_page; - } - up(&exp->exp_obd->u.filter.fo_alloc_lock); + rc = fsfilt_map_inode_pages(obd, inode, iobuf->maplist, + iobuf->nr_pages, iobuf->blocks, created, + create, &obd->u.filter.fo_alloc_lock); + if (rc) + GOTO(cleanup, rc); filter_tally_write(&obd->u.filter, iobuf->maplist, iobuf->nr_pages, iobuf->blocks, blocks_per_page); @@ -329,12 +316,11 @@ cleanup: free_kiovec(1, &iobuf); case 0: for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) { - /* flip_.. gets a ref, while free_page only frees - * when it decrefs to 0 */ - if (rc == 0) - flip_into_page_cache(inode, lnb->page); - __free_page(lnb->page); + filter_release_write_page(&obd->u.filter, + res->dentry->d_inode, lnb, + rc); } + f_dput(res->dentry); } diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index c9d2151270bf29037ca5d7927cca2af3099479fb..8014526afc9274fbb4c5e5c4a072b1828c121f17 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -161,6 +161,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) { loff_t this_size; sector_t sector; + struct page *pages[1]; int offs; /* If overwriting an existing block, we don't need a grant */ @@ -172,8 +173,10 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, continue; /* get block number for next page */ - rc = fsfilt_map_inode_page(obd, inode, lnb->page, dreq->blocks, - dreq->created, 1); + pages[0] = lnb->page; + rc = fsfilt_map_inode_pages(obd, inode, pages, 
1, + dreq->blocks, dreq->created, 1, + NULL); if (rc != 0) GOTO(cleanup, rc); @@ -261,12 +264,11 @@ cleanup: OBD_FREE(dreq, sizeof(*dreq)); case 0: for (i = 0, lnb = res; i < obj->ioo_bufcnt; i++, lnb++) { - /* flip_.. gets a ref, while free_page only frees - * when it decrefs to 0 */ - if (rc == 0) - flip_into_page_cache(inode, lnb->page); - __free_page(lnb->page); + filter_release_write_page(&obd->u.filter, + res->dentry->d_inode, lnb, + rc); } + f_dput(res->dentry); } diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index d095a384cd060f64f5fbe18b1167b1ef0ef96f6d..8ac8bef91412d71866d6efba6b3450e1baa9c25d 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -593,7 +593,7 @@ static void handle_short_read(int nob_read, obd_count page_count, if (pga->count > nob_read) { /* EOF inside this page */ - ptr = kmap(pga->pg) + (pga->off & ~PAGE_MASK); + ptr = kmap(pga->pg) + (pga->page_offset & ~PAGE_MASK); memset(ptr + nob_read, 0, pga->count - nob_read); kunmap(pga->pg); page_count--; @@ -608,7 +608,7 @@ static void handle_short_read(int nob_read, obd_count page_count, /* zero remaining pages */ while (page_count-- > 0) { - ptr = kmap(pga->pg) + (pga->off & ~PAGE_MASK); + ptr = kmap(pga->pg) + (pga->page_offset & ~PAGE_MASK); memset(ptr, 0, pga->count); kunmap(pga->pg); pga++; @@ -665,7 +665,7 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) return 0; } - return (p1->off + p1->count == p2->off); + return (p1->disk_offset + p1->count == p2->disk_offset); } #if CHECKSUM_BULK @@ -750,24 +750,24 @@ static int osc_brw_prep_request(int cmd, struct obd_import *imp,struct obdo *oa, struct brw_page *pg_prev = pg - 1; LASSERT(pg->count > 0); - LASSERT((pg->off & ~PAGE_MASK) + pg->count <= PAGE_SIZE); - LASSERTF(i == 0 || pg->off > pg_prev->off, + LASSERT((pg->page_offset & ~PAGE_MASK)+ pg->count <= PAGE_SIZE); + LASSERTF(i == 0 || pg->disk_offset > pg_prev->disk_offset, "i %d p_c %u pg %p [pri %lu ind %lu] off "LPU64 
" prev_pg %p [pri %lu ind %lu] off "LPU64"\n", i, page_count, - pg->pg, pg->pg->private, pg->pg->index, pg->off, + pg->pg, pg->pg->private, pg->pg->index, pg->disk_offset, pg_prev->pg, pg_prev->pg->private, pg_prev->pg->index, - pg_prev->off); + pg_prev->disk_offset); - ptlrpc_prep_bulk_page(desc, pg->pg, pg->off & ~PAGE_MASK, - pg->count); + ptlrpc_prep_bulk_page(desc, pg->pg, + pg->page_offset & ~PAGE_MASK, pg->count); requested_nob += pg->count; if (i > 0 && can_merge_pages(pg_prev, pg)) { niobuf--; niobuf->len += pg->count; } else { - niobuf->offset = pg->off; + niobuf->offset = pg->disk_offset; niobuf->len = pg->count; niobuf->flags = pg->flag; } @@ -999,7 +999,8 @@ static void sort_brw_pages(struct brw_page *array, int num) for (i = stride ; i < num ; i++) { tmp = array[i]; j = i; - while (j >= stride && array[j - stride].off > tmp.off) { + while (j >= stride && array[j - stride].disk_offset > + tmp.disk_offset) { array[j] = array[j - stride]; j -= stride; } @@ -1281,7 +1282,8 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, ops = oap->oap_caller_ops; caller_data = oap->oap_caller_data; } - pga[i].off = oap->oap_obj_off + oap->oap_page_off; + pga[i].disk_offset = oap->oap_obj_off + oap->oap_page_off; + pga[i].page_offset = pga[i].disk_offset; pga[i].pg = oap->oap_page; pga[i].count = oap->oap_count; pga[i].flag = oap->oap_brw_flags; @@ -1399,8 +1401,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, /* take the page out of our book-keeping */ list_del_init(&oap->oap_pending_item); lop_update_pending(cli, lop, cmd, -1); - if (!list_empty(&oap->oap_urgent_item)) - list_del_init(&oap->oap_urgent_item); + list_del_init(&oap->oap_urgent_item); /* ask the caller for the size of the io as the rpc leaves. 
*/ if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) @@ -2096,9 +2097,10 @@ static int sanosc_brw_read(struct obd_export *exp, struct obdo *oa, for (mapped = 0; mapped < page_count; mapped++, nioptr++) { LASSERT(PageLocked(pga[mapped].pg)); - LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off); + LASSERT(mapped == 0 || + pga[mapped].disk_offset > pga[mapped - 1].disk_offset); - nioptr->offset = pga[mapped].off; + nioptr->offset = pga[mapped].disk_offset; nioptr->len = pga[mapped].count; nioptr->flags = pga[mapped].flag; } @@ -2225,9 +2227,10 @@ static int sanosc_brw_write(struct obd_export *exp, struct obdo *oa, /* pack request */ for (mapped = 0; mapped < page_count; mapped++, nioptr++) { LASSERT(PageLocked(pga[mapped].pg)); - LASSERT(mapped == 0 || pga[mapped].off > pga[mapped - 1].off); + LASSERT(mapped == 0 || + pga[mapped].disk_offset > pga[mapped - 1].disk_offset); - nioptr->offset = pga[mapped].off; + nioptr->offset = pga[mapped].disk_offset; nioptr->len = pga[mapped].count; nioptr->flags = pga[mapped].flag; } diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h index 958889a07e36c18d4dad5f9c19232858f165a3dc..c55dd373cd65113c28cf622f7e422f586e64a406 100644 --- a/lustre/portals/include/linux/kp30.h +++ b/lustre/portals/include/linux/kp30.h @@ -694,11 +694,15 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data); # endif #endif -/*#ifndef LP_POISON +#if BITS_PER_LONG > 32 # define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) # define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) # define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) -#endif*/ +#else +# define LI_POISON ((int)0x5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a) +# define LP_POISON ((void *)(long)0x5a5a5a5a) +#endif #if defined(__x86_64__) # define LPU64 "%Lu" @@ -706,33 +710,18 @@ typedef int (*cfg_record_cb_t)(enum cfg_record_type, int len, void *data); # define LPX64 "%#Lx" # define LPSZ "%lu" # define LPSSZ "%ld" -#ifndef LP_POISON -# define 
LI_POISON ((int)0x5a5a5a5a5a5a5a5a) -# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) -# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) -#endif #elif (BITS_PER_LONG == 32 || __WORDSIZE == 32) # define LPU64 "%Lu" # define LPD64 "%Ld" # define LPX64 "%#Lx" # define LPSZ "%u" # define LPSSZ "%d" -#ifndef LP_POISON -# define LI_POISON ((int)0x5a5a5a5a) -# define LL_POISON ((long)0x5a5a5a5a) -# define LP_POISON ((void *)(long)0x5a5a5a5a) -#endif #elif (BITS_PER_LONG == 64 || __WORDSIZE == 64) # define LPU64 "%lu" # define LPD64 "%ld" # define LPX64 "%#lx" # define LPSZ "%lu" # define LPSSZ "%ld" -#ifndef LP_POISON -# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) -# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) -# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) -#endif #endif #ifndef LPU64 # error "No word size defined" diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c index 5359ef7590da6af80b4f62fdd27afaa355efd2ec..f4005de8443f0980585e50ffeebfe16b2e4ac612 100644 --- a/lustre/portals/knals/qswnal/qswnal.c +++ b/lustre/portals/knals/qswnal/qswnal.c @@ -108,7 +108,7 @@ kqswnal_yield(nal_t *nal, unsigned long *flags, int milliseconds) CDEBUG (D_NET, "yield\n"); if (milliseconds == 0) { - if (current->need_resched) + if (need_resched()) schedule(); return 0; } @@ -817,8 +817,7 @@ kqswnal_startup (nal_t *nal, ptl_pid_t requested_pid, /**********************************************************************/ /* Spawn scheduling threads */ - for (i = 0; i < smp_num_cpus; i++) - { + for (i = 0; i < num_online_cpus(); i++) { rc = kqswnal_thread_start (kqswnal_scheduler, NULL); if (rc != 0) { diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h index 1cd42db9396b323de5dd83452925d42f3644d63b..6978aa062c407dc050e20c928c76e21fae1bb2b0 100644 --- a/lustre/portals/knals/qswnal/qswnal.h +++ b/lustre/portals/knals/qswnal/qswnal.h @@ -53,7 +53,11 @@ #include <linux/string.h> #include <linux/stat.h> #include 
<linux/errno.h> -#include <linux/locks.h> +#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#include <linux/locks.h> /* wait_on_buffer */ +#else +#include <linux/buffer_head.h> /* wait_on_buffer */ +#endif #include <linux/unistd.h> #include <net/sock.h> #include <linux/uio.h> diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c index f92f97474d624d758f1ae45d5a3d440ba074b3a3..2bcb853a4629fe087dc51603f7f1e310ed267268 100644 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -1824,7 +1824,7 @@ kqswnal_scheduler (void *arg) !list_empty(&kqswnal_data.kqn_delayedtxds) || !list_empty(&kqswnal_data.kqn_delayedfwds)); LASSERT (rc == 0); - } else if (current->need_resched) + } else if (need_resched()) schedule (); spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c index a53ea6b41e8bbec1b84ee8090f9e0cb7a91975e7..4e63c8644ed5743b850fe1625e761a96356acab0 100644 --- a/lustre/portals/libcfs/module.c +++ b/lustre/portals/libcfs/module.c @@ -401,14 +401,22 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, err = lwt_control (data->ioc_flags, data->ioc_misc); break; - case IOC_PORTAL_LWT_SNAPSHOT: - err = lwt_snapshot (&data->ioc_nid, - &data->ioc_count, &data->ioc_misc, + case IOC_PORTAL_LWT_SNAPSHOT: { + cycles_t now; + int ncpu; + int total_size; + + err = lwt_snapshot (&now, &ncpu, &total_size, data->ioc_pbuf1, data->ioc_plen1); + data->ioc_nid = now; + data->ioc_count = ncpu; + data->ioc_misc = total_size; + if (err == 0 && copy_to_user((char *)arg, data, sizeof (*data))) err = -EFAULT; break; + } case IOC_PORTAL_LWT_LOOKUP_STRING: err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, @@ -421,7 +429,13 @@ static int libcfs_ioctl(struct inode *inode, struct file *file, case IOC_PORTAL_NAL_CMD: { struct portals_cfg pcfg; - LASSERT (data->ioc_plen1 == sizeof(pcfg)); + if (data->ioc_plen1 != 
sizeof(pcfg)) { + CERROR("Bad ioc_plen1 %d (wanted %d)\n", + data->ioc_plen1, (int)sizeof(pcfg)); + err = -EINVAL; + break; + } + if (copy_from_user(&pcfg, (void *)data->ioc_pbuf1, sizeof(pcfg))) { err = -EFAULT; diff --git a/lustre/portals/unals/connection.c b/lustre/portals/unals/connection.c index ca6999a30093272578ee95cb69ee8948b50938c5..3448460cfaa503cbbd179359f0d1153a35534b52 100644 --- a/lustre/portals/unals/connection.c +++ b/lustre/portals/unals/connection.c @@ -229,7 +229,7 @@ tcpnal_hello (int sockfd, ptl_nid_t *nid, int type, __u64 incarnation) hdr.type = __cpu_to_le32 (PTL_MSG_HELLO); hdr.msg.hello.type = __cpu_to_le32 (type); - hdr.msg.hello.incarnation = 0; + hdr.msg.hello.incarnation = __cpu_to_le64(incarnation); /* Assume sufficient socket buffering for this message */ rc = syscall(SYS_write, sockfd, &hdr, sizeof(hdr)); @@ -315,6 +315,8 @@ connection force_tcp_connection(manager m, connection conn; struct sockaddr_in addr; unsigned int id[2]; + struct timeval tv; + __u64 incarnation; port = tcpnal_acceptor_port; @@ -353,8 +355,11 @@ connection force_tcp_connection(manager m, setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &option, sizeof(option)); #endif + gettimeofday(&tv, NULL); + incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + /* say hello */ - if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, 0)) + if (tcpnal_hello(fd, &peernid, SOCKNAL_CONN_ANY, incarnation)) exit(-1); conn = allocate_connection(m, ip, port, fd); diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c index f3e82c6bc2cd04b893c607b51214071e23fbe3a0..f8107d8068603fa41ade8ec3e9c4404bb20a81cb 100644 --- a/lustre/portals/utils/portals.c +++ b/lustre/portals/utils/portals.c @@ -1565,14 +1565,11 @@ lwt_put_string(char *ustr) static int lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e) { - char whenstr[32]; char *where = lwt_get_string(e->lwte_where); if (where == NULL) return (-1); - sprintf(whenstr, LPU64, (__u64)(e->lwte_when - 
t0)); - fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n", e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4, (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0), @@ -1624,6 +1621,7 @@ jt_ptl_lwt(int argc, char **argv) cycles_t tnow; struct timeval tvnow; int printed_date = 0; + int nlines = 0; FILE *f = stdout; if (argc < 2 || @@ -1773,6 +1771,12 @@ jt_ptl_lwt(int argc, char **argv) rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]); if (rc != 0) break; + + if (++nlines % 10000 == 0 && f != stdout) { + /* show some activity... */ + printf("."); + fflush (stdout); + } } tlast = next_event[cpu]->lwte_when; @@ -1786,8 +1790,10 @@ jt_ptl_lwt(int argc, char **argv) next_event[cpu] = NULL; } - if (f != stdout) + if (f != stdout) { + printf("\n"); fclose(f); + } free(events); return (0); diff --git a/lustre/scripts/collect-stats.sh b/lustre/scripts/collect-stats.sh new file mode 100644 index 0000000000000000000000000000000000000000..b8c585c6e9e5831182518d90e1d0e665308cbcbc --- /dev/null +++ b/lustre/scripts/collect-stats.sh @@ -0,0 +1,180 @@ +#!/bin/bash + +die() { + echo $* 1>&2 + exit 1 +} +cleanup_lock="" +cleanup() { + [ ! -z "$cleanup_lock" ] && rmdir $cleanup_lock +} +trap cleanup EXIT + +usage() { + echo " -d dir (required)" + echo " Specifies the top level directory that all hosts share" + echo " and collects stats under. Each host will use a " + echo " subdirectory named after its hostname." + echo + echo " If the host directory doesn't exist, stats collection" + echo " begins by clearing accumulators in /proc and launching" + echo " background tasks." + echo + echo " If the host directory exists, the script stops " + echo " background processes and collects the results. A host" + echo " directory can not be reused once it has collected" + echo " stats." + echo " -h" + echo " Shows this help message." 
+ echo + echo "Example:" + echo " [on all nodes] $0 -d /tmp/collection" + echo " (time passes while a load is run)" + echo " [on all nodes] $0 -d /tmp/collection" + echo " tree /tmp/collection" + echo + exit +} + +[ ${#*} == 0 ] && usage + +while getopts ":d:" opt; do + case $opt in + d) topdir=$OPTARG ;; + \?) usage + esac +done + +if [ ! -e $topdir ]; then + mkdir -p $topdir || die "couldn't create dir $topdir" +fi + +[ ! -d $topdir ] && die "$topdir isn't a directory" + +mydir="$topdir/`hostname`" +lock="$topdir/.`hostname`-lock" + +mkdir $lock || die "another script is working on $mydir, exiting." +cleanup_lock="$lock" + +clear_files() { + for f in $1; do + [ ! -f $f ] && continue + echo 0 > $f + done +} + +dump_files() { + dirglob=$1 + shift + for d in $dirglob; do + [ ! -d $d ] && continue + log="$mydir/`basename $d`" + > $log + for f in $*; do + [ ! -f $d/$f ] && continue + echo "----------------- $f" >> $log + ( cd $d && cat $f ) >> $log + done + done +} + +# find filter dirs, sigh. +num_filter_dirs=0 +for f in /proc/fs/lustre/obdfilter/*; do + [ ! -d $f ] && continue; + num_filter_dirs=$((num_filter_dirs + 1)) + filter_dirs="$filter_dirs,`basename $f`" +done +if [ $num_filter_dirs == "1" ]; then + tmp=`echo $filter_dirs | sed -e 's/,//g'` + filter_dirs="/proc/fs/lustre/obdfilter/$tmp" +fi +if [ $num_filter_dirs -gt "1" ]; then + filter_dirs="/proc/fs/lustre/obdfilter/{$filter_dirs}" +fi + +save_proc_files() { + cd /proc + for f in $*; do + save=`echo $f | sed -e 's@/@_@g'` + [ ! -f $f ] && continue + cat $f > $mydir/$save + done + cd - +} + +launch() { + touch $mydir/pids + + if ! which $1 > /dev/null 2>&1; then + return + fi + + cd $mydir + $* > $1.log 2>&1 & + PID=$! + if [ $? 
= 0 ]; then + echo $PID >> pids + echo "launched '$*' as pid $PID" + else + echo "'$*' failed" + rm $1.log + fi + cd - +} + + +start_collection() { + echo "starting collection in $mydir" + mkdir $mydir || die "couldn't create dir $mydir" + + echo clearing files in /proc/fs/lustre + clear_files '/proc/fs/lustre/osc/*MNT*/rpc_stats' + clear_files '/proc/fs/lustre/llite/*/read_ahead_stats' + [ ! -z "$filter_dirs" ] && clear_files "$filter_dirs/brw_stats" + + launch vmstat 2 + launch iostat -x 2 + + + date > $mydir/started +} + + +stop_collection() { + pids="$mydir/pids" + + [ -e $mydir/finished ] && die "$mydir already contains collected files" + [ ! -e $mydir/started ] && die "$mydir hasn't started collection?" + + echo "collecting files for $mydir" + dump_files '/proc/fs/lustre/osc/*MNT*' max_dirty_mb max_pages_per_rpc \ + max_rpcs_in_flight cur_grant_bytes rpc_stats + dump_files '/proc/fs/lustre/llite/*' read_ahead max_read_ahead_mb \ + read_ahead_stats + [ ! -z "$filter_dirs" ] && dump_files $filter_dirs \ + readcache_max_filesize tot_granted \ + brw_stats + + for pid in `cat $pids`; do + echo killing pid $pid + kill $pid + done + rm $pids + + save_proc_files cpuinfo meminfo slabinfo + + if which lspci > /dev/null 2>&1; then + lspci > $mydir/lspci 2>&1 + fi + + date > $mydir/finished + echo DONE +} + +if [ -e $mydir ]; then + stop_collection +else + start_collection +fi diff --git a/lustre/scripts/land1.sh b/lustre/scripts/land1.sh index 16f6b205793958b1a749451def7326399c1d008a..c3a04684688df3bafd5b1bef2b459a7773dede70 100755 --- a/lustre/scripts/land1.sh +++ b/lustre/scripts/land1.sh @@ -36,7 +36,7 @@ case $child in *) child="b_$child" esac -if [ "$parent" != "HEAD" -a "`cat CVS/Tag`" != "T$parent" ]; then +if [ "$parent" != "HEAD" -a "`cat CVS/Tag 2> /dev/null`" != "T$parent" ]; then echo "This script must be run within the $parent branch" exit 1 fi diff --git a/lustre/scripts/merge1.sh b/lustre/scripts/merge1.sh index 
9bdc9b5de5b87dec4a0be42044bd48d233af1eb5..ac074d74ec363c58d8b6319a9aea2a7dbf2e715a 100755 --- a/lustre/scripts/merge1.sh +++ b/lustre/scripts/merge1.sh @@ -36,7 +36,7 @@ case $child in *) child="b_$child" esac -if [ "$child" != "HEAD" -a "`cat CVS/Tag`" != "T$child" ]; then +if [ "$child" != "HEAD" -a "`cat CVS/Tag 2> /dev/null`" != "T$child" ]; then echo "This script must be run within the $child branch" exit 1 fi @@ -59,9 +59,12 @@ echo "done" echo -n "tagging $child as '${PARENT}_${CHILD}_UPDATE_CHILD_$date' ...." $CVS rtag -r $child ${PARENT}_${CHILD}_UPDATE_CHILD_$date $module echo "done" + +# Apply all of the changes to your local tree: echo "Updating: -j ${CHILD}_BASE -j ${PARENT}_${CHILD}_UPDATE_PARENT_$date ...." $CVS update -j ${CHILD}_BASE -j ${PARENT}_${CHILD}_UPDATE_PARENT_$date -dP echo "done" + echo -n "Recording conflicts in $CONFLICTS ..." if $CVS update | grep '^C' > $CONFLICTS; then echo "Conflicts found, fix before committing." @@ -70,4 +73,6 @@ else echo "No conflicts found" rm -f $CONFLICTS fi +echo "done" + echo "Test, commit and then run merge2.sh (no arguments)" diff --git a/lustre/tests/.cvsignore b/lustre/tests/.cvsignore index 4df0fc194219fbd1da8288153ae1b90d09bb6b13..0b00c70632f0ac37b1090ff35f57a724bb003194 100644 --- a/lustre/tests/.cvsignore +++ b/lustre/tests/.cvsignore @@ -63,3 +63,4 @@ logs ostactive ll_dirstripe_verify openfilleddirunlink +copy_attr diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 97cbeee2399f26fa8e28619a9535dec135f5fda2..4d812e04abaa9bb56afbdd0bfc55c778b47ccc31 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -15,11 +15,12 @@ noinst_SCRIPTS += runfailure-ost runiozone runregression-net.sh runtests noinst_SCRIPTS += sanity.sh rundbench noinst_PROGRAMS = openunlink testreq truncate directio openme writeme open_delay noinst_PROGRAMS += tchmod toexcl fsx test_brw openclose createdestroy -noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink utime cmknod 
+noinst_PROGRAMS += stat createmany statmany multifstat createtest mlink utime noinst_PROGRAMS += opendirunlink opendevunlink unlinkmany fchdir_test checkstat noinst_PROGRAMS += wantedi statone runas openfile getdents mkdirdeep o_directory -noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify -noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify openfilleddirunlink +noinst_PROGRAMS += small_write multiop sleeptest ll_sparseness_verify cmknod +noinst_PROGRAMS += ll_sparseness_write mrename ll_dirstripe_verify copy_attr +noinst_PROGRAMS += openfilleddirunlink # noinst_PROGRAMS += ldaptest bin_PROGRAMS = mcreate munlink mkdirmany iopentest1 iopentest2 endif # TESTS @@ -71,6 +72,8 @@ sleeptest_SOURCES = sleeptest.c #write_append_truncate_CC=mpicc #createmany_mpi_SOURCES=createmany_mpi.c #createmany_mpi_CC=mpicc +copy_attr_SOURCES= copy_attr.c +copy_attr_LDADD= -lattr #sanity test ll_sparseness_verify_SOURCES = ll_sparseness_verify.c diff --git a/lustre/tests/cfg/insanity-local.sh b/lustre/tests/cfg/insanity-local.sh index 2ca1485074ea4848604ebcdc730e819a905ab4f2..1e0c6e974bde21ea300bb8959eecdaf2e436e41f 100644 --- a/lustre/tests/cfg/insanity-local.sh +++ b/lustre/tests/cfg/insanity-local.sh @@ -25,7 +25,7 @@ OSTSIZE=${OSTSIZE:=10000} #50000000 OSTJOURNALSIZE=${OSTJOURNALSIZE:-0} FSTYPE=${FSTYPE:-ext3} -STRIPE_BYTES=${STRIPE_BYTES:-65536} #1048576 +STRIPE_BYTES=${STRIPE_BYTES:-524288} #1048576 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh index 9398d07cc62ccd8249c52287ad7527ecbe308a81..38effed2fb8e09bbf0798a0d53edbbf9f8de629f 100644 --- a/lustre/tests/cfg/local.sh +++ b/lustre/tests/cfg/local.sh @@ -27,7 +27,7 @@ FSTYPE=${FSTYPE:-ext3} TIMEOUT=${TIMEOUT:-10} UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh} -STRIPE_BYTES=${STRIPE_BYTES:-65536} +STRIPE_BYTES=${STRIPE_BYTES:-524288} STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} 
FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD diff --git a/lustre/tests/cfg/mdev.sh b/lustre/tests/cfg/mdev.sh index 7d50f079cc3ee404f5981830e0d7ff3587f46e2a..c8a42e092e50c43f4b98f794143ad6990b623b93 100644 --- a/lustre/tests/cfg/mdev.sh +++ b/lustre/tests/cfg/mdev.sh @@ -23,7 +23,7 @@ FSTYPE=${FSTYPE:-ext3} TIMEOUT=${TIMEOUT:-10} UPCALL=${UPCALL:-$PWD/replay-single-upcall.sh} -STRIPE_BYTES=${STRIPE_BYTES:-65536} +STRIPE_BYTES=${STRIPE_BYTES:-524288} STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-0} FAILURE_MODE=${FAILURE_MODE:-SOFT} # or HARD diff --git a/lustre/tests/copy_attr.c b/lustre/tests/copy_attr.c new file mode 100644 index 0000000000000000000000000000000000000000..bee26f64ac37a0b7817d5750302875ae471a5f13 --- /dev/null +++ b/lustre/tests/copy_attr.c @@ -0,0 +1,56 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ +#include <stdio.h> +#include <liblustre.h> +#include <linux/lustre_lib.h> +#include <linux/lustre_idl.h> +#include <linux/lustre_mds.h> +#include <sys/types.h> +#include <attr/xattr.h> + +#define XATTR_LUSTRE_MDS_OBJID "trusted.lov" + +int +main(int argc, char *argv[]) +{ + struct lov_user_md *lmm1,*lmm2; + int size; + struct stat statbuf; + + if (argc != 3) { + fprintf(stderr,"usage: copy_attr file1 file2 \n"); + exit(1); + } + + size = getxattr(argv[1], XATTR_LUSTRE_MDS_OBJID, NULL, 0); + if (size < 0) { + perror("getting attr size"); + exit(1); + } + lmm1 = malloc(size); + lmm2 = malloc(size); + if (lmm1 == NULL || lmm2 == NULL) { + fprintf(stderr,"Failure to get memory \n"); + exit(1); + } + + if (getxattr(argv[1], XATTR_LUSTRE_MDS_OBJID, lmm1, size) < 0) { + perror("getting xattr :"); + exit(1); + } + + if (stat(argv[2], &statbuf)) { + perror("stat"); + exit(1); + } + + memcpy(lmm2, lmm1, size); + lmm2->lmm_object_id = statbuf.st_ino; + if (setxattr(argv[2], XATTR_LUSTRE_MDS_OBJID, lmm2, size, 0) < 0) { + perror("setxattr"); + exit(1); + } + + exit(0); +} diff --git a/lustre/tests/directio.c 
b/lustre/tests/directio.c index 51315bc5c113114b36172ba21c77428baa3c259a..7bcc5dc2dfa9e3a6da07724d4287c56840268aa3 100644 --- a/lustre/tests/directio.c +++ b/lustre/tests/directio.c @@ -12,6 +12,39 @@ #include <sys/stat.h> #include <sys/mman.h> +int write_buffer(char *fname, char *buffer, int len) +{ + int fd, rc; + + fd = open(fname, O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (fd == -1) { + printf("Cannot open %s: %s\n", fname, strerror(errno)); + exit(1); + } + rc = write(fd, buffer, len); + if (rc != len) { + printf("write: %d\n", rc); + exit(1); + } + close(fd); + return 0; +} + +void verify(char *buffer, char *compare, int length) +{ + int i; + for (i = 0; i < length; i++) { + if (buffer[i] != compare[i]) { + fprintf(stderr, "garbage read (i=%d): expected %c, found %c\n", + i, compare[i], buffer[i]); + write_buffer("/tmp/dio1", buffer, length); + write_buffer("/tmp/dio2", compare, length); + exit(1); + } + } +} + + int main(int argc, char **argv) { int fd; @@ -43,15 +76,16 @@ int main(int argc, char **argv) return 1; } - printf("directio on %s for %dx%lu bytes \n", argv[1], blocks, - st.st_blksize); + fprintf(stderr, "directio on %s for %dx%lu bytes \n", argv[1], blocks, + st.st_blksize); seek = (off64_t)seek_blocks * (off64_t)st.st_blksize; +#if 0 if (lseek64(fd, seek, SEEK_SET) < 0) { printf("lseek64 failed: %s\n", strerror(errno)); return 1; } - +#endif len = blocks * st.st_blksize; wbuf = mmap(0, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, 0, 0); if (wbuf == MAP_FAILED) { @@ -83,8 +117,84 @@ int main(int argc, char **argv) return 1; } + verify(rbuf, wbuf, len); + if (memcmp(wbuf, rbuf, len)) { + printf("Data mismatch on line %d\n", __LINE__); + return 1; + } + + /* try 512-byte buffers, and make sure that the other parts of the + * page aren't modified. 
*/ + if (st.st_blksize < 4096) { + printf("512-byte block size tests skipped (because blocksize " + "passed is < 4k)\n"); + printf("PASS\n"); + return 0; + } + + + + /* write test */ + if (lseek64(fd, 512, SEEK_SET) < 0) { + printf("Cannot seek %s\n", strerror(errno)); + return 1; + } + + memset(wbuf, 0x44, len); + memset(wbuf + 2048, 0x69, 512); + rc = write(fd, wbuf + 2048, 512); + if (rc != 512) { + printf("Write error %s (rc = %d)\n", strerror(errno), rc); + return 1; + } + + memset(rbuf, 0x44, len); + memset(rbuf + 2048, 0x69, 512); + if (memcmp(wbuf, rbuf, len)) { + printf("Data mismatch on line %d\n", __LINE__); + return 1; + } + + /* read test */ + if (lseek64(fd, 512, SEEK_SET) < 0) { + printf("Cannot seek %s\n", strerror(errno)); + return 1; + } + memset(rbuf, 0xba, len); + rc = read(fd, rbuf + 1024, 512); + if (rc != 512) { + printf("Read error: %s (rc = %d)\n", strerror(errno), rc); + return 1; + } + + memset(wbuf, 0xba, len); + memset(wbuf + 1024, 0x69, 512); + + verify(rbuf, wbuf, len); +#if 0 + if (memcmp(wbuf, rbuf, len)) { + printf("Data mismatch on line %d\n", __LINE__); + return 1; + } +#endif + + /* read back the whole block, to see that it's untouched. 
*/ + if (lseek64(fd, seek, SEEK_SET) < 0) { + printf("Cannot seek %s\n", strerror(errno)); + return 1; + } + + memset(rbuf, 0x1, len); + rc = read(fd, rbuf, len); + if (rc != len) { + printf("Read error: %s (rc = %d)\n", strerror(errno), rc); + return 1; + } + + memset(wbuf, 0xba, len); + memset(wbuf + 512, 0x69, 512); if (memcmp(wbuf, rbuf, len)) { - printf("Data mismatch\n"); + printf("Data mismatch on line %d\n", __LINE__); return 1; } diff --git a/lustre/tests/echo.sh b/lustre/tests/echo.sh index 67dd27ea525a7afc30bcfa9df4e24a7f73be2277..a45fd3992dea47204fd95df00dec9c820fadba9c 100755 --- a/lustre/tests/echo.sh +++ b/lustre/tests/echo.sh @@ -37,7 +37,7 @@ MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`} MDSSIZE=10000 FSTYPE=${FSTYPE:-ext3} -STRIPE_BYTES=65536 +STRIPE_BYTES=1048576 STRIPES_PER_OBJ=2 # 0 means stripe over all OSTs rm -f $config diff --git a/lustre/tests/lfsck_config.sh b/lustre/tests/lfsck_config.sh new file mode 100755 index 0000000000000000000000000000000000000000..8f1173e3cde39f1192a37ab247c3750e735b0057 --- /dev/null +++ b/lustre/tests/lfsck_config.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +export PATH=`dirname $0`/../../utils:$PATH + +config=${1:-lfsck_config.xml} + +LMC="${LMC:-lmc} -m $config" +TMP=${TMP:-/tmp} + +MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`} +MDSSIZE=${MDSSIZE:-100000} +FSTYPE=${FSTYPE:-ext3} +MOUNT=${MOUNT:-/mnt/lustre} +#MOUNT2=${MOUNT2:-${MOUNT}2} +NETWORKTYPE=${NETWORKTYPE:-tcp} + +OSTSIZE=${OSTSIZE:-200000} + +# specific journal size for the ost, in MB +JSIZE=${JSIZE:-0} +[ "$JSIZE" -gt 0 ] && JARG="--journal_size $JSIZE" +MDSISIZE=${MDSISIZE:-128} + +STRIPE_BYTES=524288 +STRIPES_PER_OBJ=0 # 0 means stripe over all OSTs + +rm -f $config + +# create nodes +${LMC} --add node --node localhost || exit 10 +${LMC} --add net --node localhost --nid `hostname` --nettype $NETWORKTYPE || exit 11 + +# configure mds server +${LMC} --add mds --nspath /mnt/mds_ns --node localhost --mds mds1 --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE $JARG 
--mkfsoptions "-I $MDSISIZE" || exit 20 + +# configure osts +${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 || exit 20 +i=0 +while [ $i -lt $NUM_OSTS ] +do +${LMC} --add ost --node localhost --lov lov1 --fstype $FSTYPE --dev $TMP/ost$i-`hostname` --size $OSTSIZE $JARG || exit 30 +i=`expr $i + 1` +done + +# create client config +${LMC} --add mtpt --node localhost --path $MOUNT --mds mds1 --lov lov1 || exit 40 +#${LMC} --add mtpt --node localhost --path $MOUNT2 --mds mds1 --lov lov1 || exit 40 diff --git a/lustre/tests/lfscktest.sh b/lustre/tests/lfscktest.sh new file mode 100755 index 0000000000000000000000000000000000000000..4c9fed0b02382f6439ff7f815d1ecdc58ba3f9ec --- /dev/null +++ b/lustre/tests/lfscktest.sh @@ -0,0 +1,123 @@ +#!/bin/bash +set -vx +set -e + +. ./lfscktest_config.sh + +#Create mount points on target OST and MDS +#Create test directory + +mkdir -p $OST_MOUNTPT +mkdir -p $MDS_MOUNTPT +mkdir -p $TEST_DIR + +export PATH=$E2FSCK_PATH/e2fsck:$PATH + +# Create some files on the filesystem +for i in `seq 0 3`; do + mkdir -p ${MOUNT}/d$i + for j in `seq 0 5`; do + mkdir -p ${MOUNT}/d$i/d$j + for k in `seq 1 5`; do + FILE="${MOUNT}/d$i/d$j/test$k" + echo "creating $FILE" + dd if=/dev/zero bs=4k count=1 of=$FILE + done + done +done +# Create Files to be modified + +file_name=${TESTNAME} + +for FILE in `seq -f ${TEST_DIR}/${file_name}.%g 0 40`; do + dd if=/dev/zero count=1 bs=64k of=$FILE || exit 1 +done + +#Create some more files + +for i in `seq 21 23`; do + mkdir -p ${MOUNT}/d$i + for j in `seq 0 5`; do + mkdir -p ${MOUNT}/d$i/d$j + for k in `seq 0 5`; do + FILE="${MOUNT}/d$i/d$j/test$k" + echo "creating $FILE" + dd if=/dev/zero bs=4k count=1 of=$FILE + done + done +done + +# Get objids for a file on the OST +OST_TEST_FILE_OBJIDS="" +for i in `seq 0 19`; do + OST_TEST_FILE=${TEST_DIR}/${file_name}.$i + ##Get the file OBJID + OST_TEST_FILE_OBJID=`$LFIND -v -o $OST_UUID 
$OST_TEST_FILE|grep '\*$' | awk '{ print $2 }'` || exit 1 + if [ "$OST_TEST_FILE_OBJID" ]; then + echo "REMOVING OBJID $OST_TEST_FILE_OBJID on $OST_UUID from $OST_TEST_FILE" + fi + OST_TEST_FILE_OBJIDS="$OST_TEST_FILE_OBJIDS $OST_TEST_FILE_OBJID" +done + +MDS_FILES="" +for i in `seq 20 39`; do + TEST_FILE=${TEST_DIR}/${file_name}.$i + echo "REMOVING MDS FILE $TEST_FILE which has info:" + $LFIND -v $TEST_FILE || exit 1 + MDS_FILES="$MDS_FILES ${TESTNAME}/${file_name}.$i" +done + +$LCONF --cleanup ${CONFIGXML} || exit 1 + +# Remove objects associated with files +echo "removing objects: $OST_TEST_FILE_OBJIDS" +for i in $OST_TEST_FILE_OBJIDS; do + z=`expr $i % 32` + $DEBUGFS -w -R "rm O/0/d$z/$i" "$OSTDEV" || exit 1 +done + +mount "-o" loop $MDSDEV $MDS_MOUNTPT + +#Remove files from mds +for i in $MDS_FILES; do + rm $MDS_MOUNTPT/ROOT/$i || (umount $MDS_MOUNTPT && exit 1) +done + +#Create EAs on files so objects are referenced twice from different mds files +for i in `seq 40 59`; do + touch $MDS_MOUNTPT/ROOT/${TESTNAME}/${TESTNAME}.bad.$i + ${GPATH}/copy_attr $MDS_MOUNTPT/ROOT/${TESTNAME}/${TESTNAME}.$i $MDS_MOUNTPT/ROOT/${TESTNAME}/${TESTNAME}.bad.$i || (umount $MDS_MOUNTPT && exit 1) + i=`expr $i + 1` +done + umount $MDS_MOUNTPT + rmdir $MDS_MOUNTPT + rmdir $OST_MOUNTPT + +# Run e2fsck to get mds and ost info +# a return status of 1 indicates e2fsck successfully fixed problems found + +RET=0 +e2fsck -d -f -y --mdsdb $GPATH/mdsdb $MDSDEV || RET=$? # capture status; a bare failure would abort under set -e +[ $RET -ne 0 -a $RET -ne 1 ] && exit 1 +i=0 +OSTDB_LIST="" +while [ $i -lt $NUM_OSTS ]; do + RET=0 + e2fsck -d -f -y --mdsdb $GPATH/mdsdb --ostdb $GPATH/ostdb-$i $TMP/ost$i-`hostname` || RET=$? 
+ [ $RET -ne 0 -a $RET -ne 1 ] && exit 1 + if [ -z "${OSTDB_LIST}" ]; then + OSTDB_LIST=${GPATH}/ostdb-$i + else + OSTDB_LIST=${GPATH}/ostdb-$i,${OSTDB_LIST} + fi + i=`expr $i + 1` +done + +#Mount filesystem +${LCONF} ${CONFIGXML} || exit 1 + +lfsck -l --mdsdb $GPATH/mdsdb --ostdb ${OSTDB_LIST} ${MOUNT} || exit 1 + +#Cleanup +rm $GPATH/mdsdb +rm $GPATH/ostdb-* diff --git a/lustre/tests/lfscktest_config.sh b/lustre/tests/lfscktest_config.sh new file mode 100644 index 0000000000000000000000000000000000000000..45746b2e8ee53fcbf5a76614f29c91fead8a2284 --- /dev/null +++ b/lustre/tests/lfscktest_config.sh @@ -0,0 +1,36 @@ +export TESTNAME="lfscktest" +export TESTDESC="Test of lfsck functionality" + +export LUSTRE=${LUSTRE:-"../.."} +export LCONF=${LCONF:-"$LUSTRE/utils/lconf"} +export LMC=${LMC:-"$LUSTRE/utils/lmc"} +export LCTL=${LCTL:-"$LUSTRE/utils/lctl"} +export LFIND=${LFIND:-"$LUSTRE/utils/lfind"} +export E2FSCK_PATH=${E2FSCK_PATH:-"/usr/src/e2fsprogs-1.34"} +export TMP=${TMP:-"/tmp"} +export CONFIG=${CONFIG:-"./lfsck_config.sh"} +export LOG=${LOG:-"${TMP}/lfscktest.log"} +export CONFIGXML=${CONFIGXML:-"./lfsck_config.xml"} +export LUSTRE_TAG=${LUSTRE_TAG:="HEAD"} +export MACHINENAME=`hostname | sed -e 's/[0-9]\+//'` +export TESTGROUP=${TESTGROUP:-"unspecified"} +export CONFIGDESC=${CONFIGDESC:-"local"} +export TESTARCH=${TESTARCH:-`uname -m`} +export NETWORKTYPE=${NETWORKTYPE:-"tcp"} +export MDSDEV=${MDSDEV:-$TMP/mds1-`hostname`} +export MDSNODES=${MDSNODES:-`hostname`} +export OSTDEV=${OSTDEV:-$TMP/ost1-`hostname`} +export OSTNODES=${OSTNODES:-`hostname`} +export CLIENTNODES=${CLIENTNODES:-`hostname`} +export RECIPIENTS=${RECIPIENTS:-"liam.kelleher@hp.com"} +export SENDER=${SENDER:-"liam.kelleher@hp.com"} +export NUM_OSTS=${NUM_OSTS:-5} +export DEBUGFS=${DEBUGFS:-"debugfs"} + +export GPATH=`pwd` +export OST_UUID="OST_localhost_2_UUID" + +export MDS_MOUNTPT="/mnt/mds_${TESTNAME}" +export OST_MOUNTPT="/mnt/ost_${TESTNAME}" +export MOUNT="/mnt/lustre" +export 
TEST_DIR="${MOUNT}/${TESTNAME}" diff --git a/lustre/tests/local.sh b/lustre/tests/local.sh index 4bf2e5d45f5d3eda0e008182fb31b1415e353bce..bdcd2e03d89c6752a5aef6ef30763ad3b37ce5e6 100755 --- a/lustre/tests/local.sh +++ b/lustre/tests/local.sh @@ -31,7 +31,7 @@ JSIZE=${JSIZE:-0} MDSISIZE=${MDSISIZE:-0} [ "$MDSISIZE" -gt 0 ] && IARG="--inode_size $MDSISIZE" -STRIPE_BYTES=65536 +STRIPE_BYTES=1048576 STRIPES_PER_OBJ=0 # 0 means stripe over all OSTs rm -f $config diff --git a/lustre/tests/lov.sh b/lustre/tests/lov.sh index ec09598e7a0edf27f415d5c2596e744ad4e38acf..d148f205064b9715cd89ed5e68a2d85994977a81 100755 --- a/lustre/tests/lov.sh +++ b/lustre/tests/lov.sh @@ -23,7 +23,7 @@ OSTSIZE=${OSTSIZE:-150000} # 1 to config an echo client instead of llite ECHO_CLIENT=${ECHO_CLIENT:-} -STRIPE_BYTES=65536 +STRIPE_BYTES=524288 STRIPES_PER_OBJ=${STRIPES_PER_OBJ:-$((OSTCOUNT -1))} # specific journal size for the ost, in MB diff --git a/lustre/tests/mcr-routed-config.sh b/lustre/tests/mcr-routed-config.sh index 8d8a100edc439515ed6661b4d205ea21ee65f95b..bf08dbba806df9bc55a122c58c79eaa63da4246d 100755 --- a/lustre/tests/mcr-routed-config.sh +++ b/lustre/tests/mcr-routed-config.sh @@ -46,7 +46,7 @@ gw2node() { ${LMC} --add net --node $MDS --nid `h2elan $MDS` --nettype elan || exit 1 ${LMC} --add mds --node $MDS --mds mds1 --dev /tmp/mds1 --size 100000 || exit 1 -${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 1 --stripe_pattern 0 +${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz 1048576 --stripe_cnt 1 --stripe_pattern 0 # Client node #${LMC} --add net --node client --tcpbuf $TCPBUF --nid '*' --nettype tcp || exit 1 diff --git a/lustre/tests/mcrlov.sh b/lustre/tests/mcrlov.sh index cce8878b276ae242e1f3ec2e3bb67504f038b676..d09866b9c5f5dd447d87428b7b85af464b9ca61a 100755 --- a/lustre/tests/mcrlov.sh +++ b/lustre/tests/mcrlov.sh @@ -36,7 +36,7 @@ ${LMC} --add route --node $ROUTER --gw `h2elan $ROUTER` --lo `h2elan $CLIENT_LO` ${LMC} --add net --node $MDS --nid 
`h2elan $MDS` --nettype elan || exit 1 ${LMC} --add mds --node $MDS --mds mds1 --dev $TMP/mds1 --size 100000 || exit 1 -${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 0 --stripe_pattern 0 || exit 1 +${LMC} --add lov --lov lov1 --mds mds1 --stripe_sz 1048576 --stripe_cnt 0 --stripe_pattern 0 || exit 1 ${LMC} --add mtpt --node client --path /mnt/lustre --mds mds1 --lov lov1 diff --git a/lustre/tests/mount2fs.sh b/lustre/tests/mount2fs.sh index 28df665e09e5501cd8c3db05582a65817de72275..64decffb4a9c82effe1b110b020cd200ef5054a5 100644 --- a/lustre/tests/mount2fs.sh +++ b/lustre/tests/mount2fs.sh @@ -15,6 +15,7 @@ MOUNT2=${MOUNT2:-${MOUNT}2} MDSSIZE=50000 FSTYPE=${FSTYPE:-ext3} +STRIPE_BYTES=${STRIPE_BYTES:-1048576} OSTDEV1=${OSTDEV1:-$TMP/ost1-`hostname`} OSTDEV2=${OSTDEV2:-$TMP/ost2-`hostname`} OSTSIZE=100000 @@ -33,8 +34,8 @@ ${LMC} -m $config --format --add mds --node $MDSNODE --mds mds1 --fstype $FSTYPE ${LMC} -m $config --format --add mds --node $MDSNODE --mds mds2 --fstype $FSTYPE --dev $MDSDEV2 --size $MDSSIZE ||exit 10 # configure ost -${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt 0 --stripe_pattern 0 || exit 20 -${LMC} -m $config --add lov --lov lov2 --mds mds2 --stripe_sz 65536 --stripe_cnt 0 --stripe_pattern 0 || exit 20 +${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt 0 --stripe_pattern 0 || exit 20 +${LMC} -m $config --add lov --lov lov2 --mds mds2 --stripe_sz $STRIPE_BYTES --stripe_cnt 0 --stripe_pattern 0 || exit 20 ${LMC} -m $config --add ost --node $OSTNODE --lov lov1 --fstype $FSTYPE --dev $OSTDEV1 --size $OSTSIZE || exit 21 ${LMC} -m $config --add ost --node $OSTNODE --lov lov2 --fstype $FSTYPE --dev $OSTDEV2 --size $OSTSIZE || exit 22 diff --git a/lustre/tests/oos.sh b/lustre/tests/oos.sh index 910346ed3f84c857bd7f03f4a107ba69e4cc38f3..1d068d536251e6e06e571b6153ba9d439a2e38f9 100755 --- a/lustre/tests/oos.sh +++ b/lustre/tests/oos.sh @@ -34,11 +34,8 @@ 
fi export LANG=C LC_LANG=C # for "No space left on device" message -# make sure, that log file will be removed. Somehow it was possible -# to me, that log file had +a and could not be rewritten, what led -# to test fail. -chattr -ai $LOG >/dev/null 2>&1 rm -f $LOG >/dev/null 2>&1 +[ -f $LOG ] && echo "ERROR: log file wasn't removed?" && exit 1 # make sure we stripe over all OSTs to avoid OOS on only a subset of OSTs $LFS setstripe $OOS 65536 0 $STRIPECOUNT diff --git a/lustre/tests/recovery-cleanup.sh b/lustre/tests/recovery-cleanup.sh index e4eefd01ad74f69409fa44ace5d0395ab936047b..77b5646a05cf7368a8db764ce13f04ae03aee9da 100755 --- a/lustre/tests/recovery-cleanup.sh +++ b/lustre/tests/recovery-cleanup.sh @@ -27,6 +27,7 @@ MDSSIZE=${MDSSIZE:-100000} FSTYPE=${FSTYPE:-ext3} OSTDEV=${OSTDEV:-/tmp/ost1-`hostname`} OSTSIZE=${OSTSIZE:-100000} +STRIPE_BYTES=${STRIPE_BYTES:-1048576} do_mds() { $PDSH $MDSNODE "PATH=\$PATH:$LUSTRE/utils:$LUSTRE/tests; cd $PWD; $@" || exit $? @@ -54,7 +55,7 @@ make_config() { done lmc -m $CONFIG --add mds --node $MDSNODE --mds mds1 --fstype $FSTYPE \ --dev $MDSDEV --size $MDSSIZE || exit 5 - lmc -m $CONFIG --add lov --lov lov1 --mds mds1 --stripe_sz 65536 \ + lmc -m $CONFIG --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES \ --stripe_cnt 0 --stripe_pattern 0 || exit 6 lmc -m $CONFIG --add ost --nspath /mnt/ost_ns --node $OSTNODE \ --lov lov1 --dev $OSTDEV --size $OSTSIZE --fstype $FSTYPE || exit 7 diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 47d77eff525ad4262767bf5c57841ae5b476141a..3eebfb12ddfc5030c611b5cd0ee979e7dc69a79e 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2,8 +2,8 @@ set -e -# bug 2732 2986 -ALWAYS_EXCEPT="17 20b" +# bug 2986 +ALWAYS_EXCEPT="20b" LUSTRE=${LUSTRE:-`dirname $0`/..} @@ -26,7 +26,7 @@ CLEANUP=${CLEANUP:-"cleanup"} make_config() { rm -f $XMLCONFIG add_mds mds --dev $MDSDEV --size $MDSSIZE - add_lov lov1 mds --stripe_sz $STRIPE_BYTES\ 
+ add_lov lov1 mds --stripe_sz $STRIPE_BYTES \ --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE @@ -224,14 +224,17 @@ test_16() { run_test 16 "timeout bulk put, evict client (2732)" test_17() { -#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE - # will get evicted here + # OBD_FAIL_PTLRPC_BULK_GET_NET 0x0503 | OBD_FAIL_ONCE + # client will get evicted here sysctl -w lustre.fail_loc=0x80000503 - do_facet client cp /etc/termcap $MOUNT && return 1 - - do_facet client "cmp /etc/termcap $MOUNT/termcap" && return 1 + do_facet client cp /etc/termcap $DIR/$tfile sysctl -w lustre.fail_loc=0 - do_facet client "cmp /etc/termcap $MOUNT/termcap" || return 2 + + sleep $TIMEOUT + # expect cmp to fail + do_facet client "cmp /etc/termcap $DIR/$tfile" && return 1 + do_facet client "rm $DIR/$tfile" || return 2 + return 0 } run_test 17 "timeout bulk get, evict client (2732)" diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index 9c1f1e1e7bd30a90eb57f7e7e0022f239b29b1ea..3c84d8dfc960da0fd59289caa461f3153829de60 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -16,7 +16,7 @@ gen_config() { add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE fi - add_lov lov1 mds --stripe_sz $STRIPE_BYTES\ + add_lov lov1 mds --stripe_sz $STRIPE_BYTES \ --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index f1523bbfe906a12a3a8428c63ac8efd4b7825926..247c1f0afe36486f78118956e09ef46c9474723c 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -18,7 +18,7 @@ ALWAYS_EXCEPT="5" gen_config() { rm -f $XMLCONFIG add_mds mds --dev $MDSDEV --size $MDSSIZE - add_lov lov1 mds --stripe_sz $STRIPE_BYTES\ + add_lov lov1 mds 
--stripe_sz $STRIPE_BYTES \ --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE --failover if [ ! -z "$ostfailover_HOST" ]; then diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index 76ce388fa9bbcd76a480ba569792b04203a29a5a..89cd32ceaa30d01df67602899764cba7277aef93 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -24,7 +24,7 @@ gen_config() { add_mdsfailover mds --dev $MDSDEV --size $MDSSIZE fi - add_lov lov1 mds --stripe_sz $STRIPE_BYTES\ + add_lov lov1 mds --stripe_sz $STRIPE_BYTES \ --stripe_cnt $STRIPES_PER_OBJ --stripe_pattern 0 add_ost ost --lov lov1 --dev $OSTDEV --size $OSTSIZE add_ost ost2 --lov lov1 --dev ${OSTDEV}-2 --size $OSTSIZE @@ -899,6 +899,17 @@ test_45() { } run_test 45 "Handle failed close" +test_46() { + dmesg -c >/dev/null + drop_reply "touch $DIR/$tfile" + fail mds + # ironically, the previous test, 45, will cause a real forced close, + # so just look for one for this test + dmesg | grep -i "force closing client file handle for $tfile" && return 1 + return 0 +} +run_test 46 "Don't leak file handle after open resend (3325)" + equals_msg test complete, cleaning up $CLEANUP diff --git a/lustre/tests/run_lfscktest.sh b/lustre/tests/run_lfscktest.sh new file mode 100755 index 0000000000000000000000000000000000000000..cd3562a7e6b40b4f00d065efadd73628e85ae414 --- /dev/null +++ b/lustre/tests/run_lfscktest.sh @@ -0,0 +1,22 @@ +#!/bin/sh + +set -e + +. ./lfscktest_config.sh + +#create xml file for config +${CONFIG} ${CONFIGXML} || exit 1 + +#Mount lustre +${LCONF} --reformat ${CONFIGXML} || exit 1 + +export LUSTRE_BUILD=${LUSTRE_BUILD:-`$LCTL lustre_build_version | awk '/^lctl/ {print $3}'`} +rm -f ${LOG} +#Run test +sh -vx lfscktest.sh 2>&1 | tee $LOG +RESULT=$? 
+[ ${RESULT} -eq 0 ] && echo PASS || echo FAIL + +#Umount Lustre +$LCONF --cleanup $CONFIGXML +exit $RESULT diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 201de89bf0008c2389ee0c093360e86637b94ebd..4fbfddfc9a05cde14421e95baa7d2b17a9996488 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -7,8 +7,8 @@ set -e ONLY=${ONLY:-"$*"} -# bug number for skipped test: -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-""} +# bug number for skipped test: 2108 +ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"42a"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" @@ -68,6 +68,25 @@ log() { lctl mark "$*" 2> /dev/null || true } +trace() { + log "STARTING: $*" + strace -o $TMP/$1.strace -ttt $* + RC=$? + log "FINISHED: $*: rc $RC" + return 1 +} +TRACE=${TRACE:-""} + +check_kernel_version() { + VERSION_FILE=/proc/fs/lustre/kernel_version + WANT_VER=$1 + [ ! -f $VERSION_FILE ] && echo "can't find kernel version" && return 1 + GOT_VER=`cat $VERSION_FILE` + [ $GOT_VER -ge $WANT_VER ] && return 0 + log "test needs at least kernel version $WANT_VER, running $GOT_VER" + return 1 +} + run_one() { if ! 
mount | grep -q $DIR; then $START @@ -870,9 +889,10 @@ test_31d() { run_test 31d "remove of open directory =========================" test_31e() { + check_kernel_version 34 || return 0 openfilleddirunlink $DIR/d31e || error } -run_test 31e "remove of open non-removable directory =========================" +run_test 31e "remove of open non-empty directory ===============" test_32a() { echo "== more mountpoints and symlinks =================" @@ -1414,7 +1434,7 @@ run_test 43c "md5sum of copy into lustre========================" test_44() { [ "$OSTCOUNT" -lt "2" ] && echo "skipping 2-stripe test" && return - dd if=/dev/zero of=$DIR/f1 bs=4k count=1 seek=127 + dd if=/dev/zero of=$DIR/f1 bs=4k count=1 seek=1023 dd if=$DIR/f1 bs=4k count=1 } run_test 44 "zero length read from a sparse stripe =============" @@ -1491,18 +1511,18 @@ page_size() { getconf PAGE_SIZE } -# in a 2 stripe file (lov.sh), page 63 maps to page 31 in its object. this +# in a 2 stripe file (lov.sh), page 1023 maps to page 511 in its object. this # test tickles a bug where re-dirtying a page was failing to be mapped to the -# objects offset and an assert hit when an rpc was built with 63's mapped -# offset 31 and 31's raw 31 offset. it also found general redirtying bugs. +# objects offset and an assert hit when an rpc was built with 1023's mapped +# offset 511 and 511's raw 511 offset. it also found general redirtying bugs. 
test_46() { f="$DIR/f46" stop_writeback sync - dd if=/dev/zero of=$f bs=`page_size` seek=31 count=1 + dd if=/dev/zero of=$f bs=`page_size` seek=511 count=1 sync - dd conv=notrunc if=/dev/zero of=$f bs=`page_size` seek=63 count=1 - dd conv=notrunc if=/dev/zero of=$f bs=`page_size` seek=31 count=1 + dd conv=notrunc if=/dev/zero of=$f bs=`page_size` seek=1023 count=1 + dd conv=notrunc if=/dev/zero of=$f bs=`page_size` seek=511 count=1 sync start_writeback } @@ -1515,7 +1535,8 @@ test_47() { run_test 47 "Device nodes check ================================" test_48a() { # bug 2399 - mkdir $DIR/d48a + check_kernel_version 34 || return 0 + mkdir -p $DIR/d48a cd $DIR/d48a mv $DIR/d48a $DIR/d48.new || error "move directory failed" mkdir $DIR/d48a || error "recreate directory failed" @@ -1527,24 +1548,65 @@ test_48a() { # bug 2399 mkdir . && error "'mkdir .' worked after recreating cwd" rmdir . && error "'rmdir .' worked after recreating cwd" ln -s . baz || error "'ln -s .' failed after recreating cwd" + cd .. || error "'cd ..' failed after recreating cwd" } run_test 48a "Access renamed working dir (should return errors)=" test_48b() { # bug 2399 - mkdir $DIR/d48b + check_kernel_version 34 || return 0 + mkdir -p $DIR/d48b cd $DIR/d48b rmdir $DIR/d48b || error "remove cwd $DIR/d48b failed" touch foo && error "'touch foo' worked after removing cwd" mkdir foo && error "'mkdir foo' worked after removing cwd" ls . && error "'ls .' worked after removing cwd" ls .. || error "'ls ..' failed after removing cwd" - cd . && error "'cd .' worked after recreate cwd" + cd . && error "'cd .' worked after removing cwd" mkdir . && error "'mkdir .' worked after removing cwd" rmdir . && error "'rmdir .' worked after removing cwd" ln -s . foo && error "'ln -s .' worked after removing cwd" || true + #cd .. || error "'cd ..' 
failed after removing cwd" } run_test 48b "Access removed working dir (should return errors)=" +test_48c() { # bug 2350 + check_kernel_version 36 || return 0 + #sysctl -w portals.debug=-1 + #set -vx + mkdir -p $DIR/d48c/dir + cd $DIR/d48c/dir + rmdir $DIR/d48c/dir || error "remove cwd $DIR/d48c/dir failed" + $TRACE touch foo && error "'touch foo' worked after removing cwd" + $TRACE mkdir foo && error "'mkdir foo' worked after removing cwd" + $TRACE ls . && error "'ls .' worked after removing cwd" + $TRACE ls .. || error "'ls ..' failed after removing cwd" + $TRACE cd . && error "'cd .' worked after recreate cwd" + $TRACE mkdir . && error "'mkdir .' worked after removing cwd" + $TRACE rmdir . && error "'rmdir .' worked after removing cwd" + $TRACE ln -s . foo && error "'ln -s .' worked after removing cwd" ||true + $TRACE cd .. || error "'cd ..' failed after removing cwd" +} +run_test 48c "Access removed working subdir (should return errors)" + +test_48d() { # bug 2350 + check_kernel_version 36 || return 0 + #sysctl -w portals.debug=-1 + #set -vx + mkdir -p $DIR/d48d/dir + cd $DIR/d48d/dir + rm -r $DIR/d48d || error "remove cwd and parent $DIR/d48d failed" + $TRACE touch foo && error "'touch foo' worked after removing cwd" + $TRACE mkdir foo && error "'mkdir foo' worked after removing cwd" + $TRACE ls . && error "'ls .' worked after removing cwd" + $TRACE ls .. && error "'ls ..' worked after removing cwd" + $TRACE cd . && error "'cd .' worked after recreate cwd" + $TRACE mkdir . && error "'mkdir .' worked after removing cwd" + $TRACE rmdir . && error "'rmdir .' worked after removing cwd" + $TRACE ln -s . foo && error "'ln -s .' worked after removing cwd" ||true + $TRACE cd .. && error "'cd ..' 
worked after removing cwd" || true +} +run_test 48d "Access removed parent subdir (should return errors)" + test_50() { # bug 1485 mkdir $DIR/d50 @@ -1925,6 +1987,7 @@ run_test 66 "update inode blocks count on client ===============" test_67() { # bug 3285 - supplementary group fails on MDS, passes on client [ "$RUNAS_ID" = "$UID" ] && echo "skipping test 67" && return + check_kernel_version 35 || return 0 mkdir $DIR/d67 chmod 771 $DIR/d67 chgrp $RUNAS_ID $DIR/d67 diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 0c34d6bc51073c790c6d58f593bce04fefc83de0..974fdbb155b31e47b426c011299105fcd06e4f41 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -3,8 +3,8 @@ set -e ONLY=${ONLY:-"$*"} -# bug number for skipped test: 1768 1557 -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4 8 14b"} +# bug number for skipped test: 1768 +ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"4 14b"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! [ "$ALWAYS_EXCEPT$EXCEPT" ] && echo "Skipping tests: $ALWAYS_EXCEPT $EXCEPT" diff --git a/lustre/tests/uml.sh b/lustre/tests/uml.sh index 9195e59358bb9a650cb7a1e922dffec8a43f8ea9..acc38b922004b56a1cb0807412f78a01930171e2 100644 --- a/lustre/tests/uml.sh +++ b/lustre/tests/uml.sh @@ -15,6 +15,7 @@ OSTDEVBASE=$TMP/ost #etc OSTSIZE=${OSTSIZE:-100000} STRIPECNT=${STRIPECNT:-1} +STRIPE_BYTES=${STRIPE_BYTES:-1048576} OSDTYPE=${OSDTYPE:-obdfilter} OSTFAILOVER=${OSTFAILOVER:-} @@ -87,7 +88,7 @@ echo; echo "adding MDS on: $MDSNODE" ${LMC} -m $config --add mds --format --node $MDSNODE --mds mds1 --fstype $FSTYPE --dev $MDSDEV --size $MDSSIZE ||exit 10 # configure ost -${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz 65536 --stripe_cnt $STRIPECNT --stripe_pattern 0 || exit 20 +${LMC} -m $config --add lov --lov lov1 --mds mds1 --stripe_sz $STRIPE_BYTES --stripe_cnt $STRIPECNT --stripe_pattern 0 || exit 20 COUNT=1 echo -n "adding OST on:" for NODE in $OSTNODES; do diff --git a/lustre/utils/Lustre/lustredb.py 
b/lustre/utils/Lustre/lustredb.py index 4ba04db11ba90c3b46f25891c1f7298b21459c36..eda57798e7c22e89d9d4e7623dfd9159596815a9 100644 --- a/lustre/utils/Lustre/lustredb.py +++ b/lustre/utils/Lustre/lustredb.py @@ -129,6 +129,10 @@ class LustreDB_XML(LustreDB): self.dom_node = dom self.root_node = root_node + def close(self): + # do nothing + return None + def xmltext(self, dom_node, tag): list = dom_node.getElementsByTagName(tag) if len(list) > 0: diff --git a/lustre/utils/lconf b/lustre/utils/lconf index bb3a45dbf45d9412c992f8d8fb773c7f0e9f3639..dbfe7ef854ed95ce86089c758a4d6f949cd8ceb8 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -1315,7 +1315,7 @@ class LOV(Module): self.name = "lov_%s" % name_override self.add_lustre_module('lov', 'lov') self.mds_uuid = self.db.get_first_ref('mds') - self.stripe_sz = self.db.get_val_int('stripesize', 65536) + self.stripe_sz = self.db.get_val_int('stripesize', 1048576) self.stripe_off = self.db.get_val_int('stripeoffset', 0) self.pattern = self.db.get_val_int('stripepattern', 0) self.devlist = self.db.get_refs('obd') @@ -1849,9 +1849,9 @@ class OSD(Module): def mgmt_uuid_for_fs(mtpt_name): if not mtpt_name: return '' - mtpt_db = toplevel.lookup_name(mtpt_name) + mtpt_db = toplustreDB.lookup_name(mtpt_name) fs_uuid = mtpt_db.get_first_ref('filesystem') - fs = toplevel.lookup(fs_uuid) + fs = toplustreDB.lookup(fs_uuid) if not fs: return '' return fs.get_first_ref('mgmt') @@ -2436,6 +2436,7 @@ def doHost(lustreDB, hosts): prof_list = node_db.get_refs('profile') if config.write_conf: + lustreDB.close() for_each_profile(node_db, prof_list, doModules) sys_make_devices() for_each_profile(node_db, prof_list, doWriteconf) @@ -2464,6 +2465,7 @@ def doHost(lustreDB, hosts): for_each_profile(node_db, prof_list, doCleanup) for_each_profile(node_db, prof_list, doUnloadModules) + lustreDB.close() else: # ugly hack, only need to run lctl commands for --dump @@ -2494,22 +2496,24 @@ def doHost(lustreDB, hosts): 
sys_set_portals_upcall(portals_upcall) for_each_profile(node_db, prof_list, doSetup) + lustreDB.close() -def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid): - tgt = db.lookup(tgt_uuid) +def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid): + tgt = lustreDB.lookup(tgt_uuid) if not tgt: raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.") new_uuid = get_active_target(tgt) if not new_uuid: raise Lustre.LconfError("doRecovery: no active target found for: " + tgt_uuid) - net = choose_local_server(get_ost_net(db, new_uuid)) + net = choose_local_server(get_ost_net(lustreDB, new_uuid)) if not net: raise Lustre.LconfError("Unable to find a connection to:" + new_uuid) log("Reconnecting", tgt_uuid, " to ", net.nid_uuid); try: - oldnet = get_server_by_nid_uuid(db, nid_uuid) + oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid) + lustreDB.close() if oldnet: lctl.disconnect(oldnet) except CommandError, e: @@ -2757,7 +2761,7 @@ lconf_options = [ ] def main(): - global lctl, config, toplevel, CONFIG_FILE + global lctl, config, toplustreDB, CONFIG_FILE # in the upcall this is set to SIG_IGN signal.signal(signal.SIGCHLD, signal.SIG_DFL) @@ -2809,8 +2813,9 @@ def main(): except Exception: panic("%s does not appear to be a config file." % (args[0])) sys.exit(1) # make sure to die here, even in debug mode. + config_file.close() CONFIG_FILE = args[0] - db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement) + lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement) if not config.config: config.config = os.path.basename(args[0])# use full path? 
if config.config[-4:] == '.xml': @@ -2819,7 +2824,7 @@ def main(): if not config.config: panic("--ldapurl requires --config name") dn = "config=%s,fs=lustre" % (config.config) - db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl) + lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl) elif config.ptldebug or config.subsystem: sys_set_ptldebug(None) sys_set_subsystem(None) @@ -2829,9 +2834,9 @@ def main(): print 'see lconf --help for command summary' sys.exit(1) - toplevel = db + toplustreDB = lustreDB - ver = db.get_version() + ver = lustreDB.get_version() if not ver: panic("No version found in config data, please recreate.") if ver != Lustre.CONFIG_VERSION: @@ -2863,7 +2868,7 @@ def main(): lctl.clear_log(config.record_device, config.record_log) lctl.record(config.record_device, config.record_log) - doHost(db, node_list) + doHost(lustreDB, node_list) if config.record: lctl.end_record() diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index 2fcf5ad01e1a0a02c07c67dbf7d04e2578f57da5..4d3900e590489acc849b08bd33a456bda8ef1bce 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -112,7 +112,7 @@ static int lfs_setstripe(int argc, char **argv) return CMD_HELP; } - result = op_create_file(argv[1], st_size, st_offset, st_count); + result = llapi_file_create(argv[1], st_size, st_offset, st_count, 0); if (result) fprintf(stderr, "error: %s: create stripe file failed\n", argv[0]); @@ -173,7 +173,7 @@ static int lfs_find(int argc, char **argv) return CMD_HELP; do { - rc = op_find(argv[optind], obduuid, recursive, verbose, quiet); + rc = llapi_find(argv[optind], obduuid, recursive,verbose,quiet); } while (++optind < argc && !rc); if (rc) @@ -192,7 +192,7 @@ static int lfs_getstripe(int argc, char **argv) optind = 1; do { - rc = op_find(argv[optind], obduuid, 0, 0, 0); + rc = llapi_find(argv[optind], obduuid, 0, 0, 0); } while (++optind < argc && !rc); if (rc) @@ -221,10 +221,11 @@ static int lfs_osts(int argc, char **argv) mnt = 
getmntent(fp); while (feof(fp) == 0 && ferror(fp) ==0) { if (llapi_is_lustre_mnttype(mnt->mnt_type)) { - rc = op_find(mnt->mnt_dir, obduuid, 0, 0, 0); + rc = llapi_find(mnt->mnt_dir, obduuid, 0, 0, 0); if (rc) - fprintf(stderr, "error: lfs osts failed for %s\n", - mnt->mnt_dir); + fprintf(stderr, + "error: lfs osts failed on %s\n", + mnt->mnt_dir); } mnt = getmntent(fp); } @@ -239,25 +240,25 @@ static int lfs_check(int argc, char **argv) int rc; FILE *fp; struct mntent *mnt = NULL; - int type_num = 1; - char *obd_type_p[2]; + int num_types = 1; + char *obd_types[2]; char obd_type1[4]; char obd_type2[4]; if (argc != 2) return CMD_HELP; - obd_type_p[1]=obd_type1; - obd_type_p[2]=obd_type2; - - if (strcmp(argv[1],"osts")==0) { - strcpy(obd_type_p[0],"osc"); - } else if (strcmp(argv[1],"mds")==0) { - strcpy(obd_type_p[0],"mdc"); - } else if (strcmp(argv[1],"servers")==0) { - type_num=2; - strcpy(obd_type_p[0],"osc"); - strcpy(obd_type_p[1],"mdc"); + obd_types[1] = obd_type1; + obd_types[2] = obd_type2; + + if (strcmp(argv[1], "osts") == 0) { + strcpy(obd_types[0], "osc"); + } else if (strcmp(argv[1], "mds") == 0) { + strcpy(obd_types[0], "mdc"); + } else if (strcmp(argv[1], "servers") == 0) { + num_types = 2; + strcpy(obd_types[0], "osc"); + strcpy(obd_types[1], "mdc"); } else { fprintf(stderr, "error: %s: option '%s' unrecognized\n", argv[0], argv[1]); @@ -278,7 +279,7 @@ static int lfs_check(int argc, char **argv) endmntent(fp); } - rc = op_check(type_num,obd_type_p,mnt->mnt_dir); + rc = llapi_target_check(num_types, obd_types, mnt->mnt_dir); if (rc) fprintf(stderr, "error: %s: %s status failed\n", @@ -316,9 +317,9 @@ static int lfs_catinfo(int argc, char **argv) if (mnt) { if (argc == 3) - rc = op_catinfo(mnt->mnt_dir, argv[1], argv[2]); + rc = llapi_catinfo(mnt->mnt_dir, argv[1], argv[2]); else - rc = op_catinfo(mnt->mnt_dir, argv[1], NULL); + rc = llapi_catinfo(mnt->mnt_dir, argv[1], NULL); } else { fprintf(stderr, "no lustre_lite mounted.\n"); rc = -1; diff --git 
a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 254d7a086a8d801b1665f467353b0c5fc54c3def..ca1a49036deb04a5bc89a4722fb71e2ae0f804f7 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -61,17 +61,18 @@ static void err_msg(char *fmt, ...) fprintf(stderr, ": %s (%d)\n", strerror(tmp_errno), tmp_errno); } -int op_create_file(char *name, long stripe_size, int stripe_offset, - int stripe_count) +int llapi_file_create(char *name, long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern) { struct lov_user_md lum = { 0 }; int fd, rc = 0; /* Initialize IOCTL striping pattern structure */ lum.lmm_magic = LOV_USER_MAGIC; + lum.lmm_pattern = stripe_pattern; lum.lmm_stripe_size = stripe_size; - lum.lmm_stripe_offset = stripe_offset; lum.lmm_stripe_count = stripe_count; + lum.lmm_stripe_offset = stripe_offset; fd = open(name, O_CREAT | O_RDWR | O_LOV_DELAY_CREATE, 0644); if (errno == EISDIR) @@ -100,21 +101,23 @@ int op_create_file(char *name, long stripe_size, int stripe_offset, return rc; } +/* short term backwards compat only */ +int op_create_file(char *name, long stripe_size, int stripe_offset, + int stripe_count) +{ + return llapi_file_create(name, stripe_size, stripe_offset, + stripe_count, 0); +} + struct find_param { int recursive; int verbose; int quiet; struct obd_uuid *obduuid; - struct obd_ioctl_data data; - struct lov_desc desc; - int uuidslen; - char *buf; - int buflen; - struct obd_uuid *uuids; + int lumlen; struct lov_user_md *lum; int got_uuids; int obdindex; - int max_ost_count; }; /* XXX Max obds per lov currently hardcoded to 1000 in lov/lov_obd.c */ @@ -123,49 +126,15 @@ struct find_param { static int prepare_find(struct find_param *param) { - int datalen, desclen; - int cfglen, lumlen; - int max_ost_count = MAX_LOV_UUID_COUNT; - - datalen = size_round(sizeof(param->data)); - desclen = size_round(sizeof(param->desc)); - param->uuidslen = size_round(max_ost_count * sizeof(*param->uuids)); - cfglen 
= datalen + desclen + param->uuidslen; - lumlen = lov_mds_md_size(max_ost_count); - if (cfglen > lumlen) - param->buflen = cfglen; - else - param->buflen = lumlen; - - /* XXX max ioctl buffer size currently hardcoded to 8192 */ - if (param->buflen > 8192) { - int nuuids, remaining; - - param->buflen = 8192; - nuuids = (param->buflen - datalen - desclen) / - sizeof(*param->uuids); - param->uuidslen = size_round(nuuids * sizeof(*param->uuids)); - remaining = nuuids * sizeof(*param->uuids); - if (param->uuidslen > remaining) - nuuids--; - max_ost_count = nuuids; - while ((lumlen=lov_mds_md_size(max_ost_count)) > param->buflen) - --max_ost_count; - - cfglen = datalen + desclen + param->uuidslen; - } - - if ((param->buf = malloc(param->buflen)) == NULL) { - err_msg("unable to allocate %d bytes of memory for ioctl's", - param->buflen); + param->lumlen = lov_mds_md_size(MAX_LOV_UUID_COUNT); + if ((param->lum = malloc(param->lumlen)) == NULL) { + err_msg("unable to allocate %d bytes of memory for ioctl", + param->lumlen); return ENOMEM; } - param->lum = (struct lov_user_md *)param->buf; - param->uuids = (struct obd_uuid *)param->buf; param->got_uuids = 0; param->obdindex = OBD_NOT_FOUND; - param->max_ost_count = max_ost_count; return 0; } @@ -174,48 +143,72 @@ static void cleanup_find(struct find_param *param) { if (param->obduuid) free(param->obduuid); - if (param->buf) - free(param->buf); + if (param->lum) + free(param->lum); } -static int get_obd_uuids(DIR *dir, char *dname, struct find_param *param) +int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count) { - int obdcount; - struct obd_uuid *uuidp; - int rc, i; - - param->got_uuids = 1; - memset(&param->data, 0, sizeof(param->data)); - param->data.ioc_inllen1 = sizeof(struct lov_desc); - param->data.ioc_inlbuf1 = (char *)&param->desc; - param->data.ioc_inllen2 = param->uuidslen; - param->data.ioc_inlbuf2 = (char *)param->uuids; - - memset(&param->desc, 0, sizeof(struct lov_desc)); - param->desc.ld_tgt_count =
param->max_ost_count; - - if (obd_ioctl_pack(&param->data, &param->buf, param->buflen)) { - fprintf(stderr, "internal buffer error from %s\n", dname); - return (param->obduuid ? EINVAL : 0); + struct obd_ioctl_data data = { 0, }; + struct lov_desc desc = { 0, }; + char *buf = NULL; + int max_ost_count, rc; + + max_ost_count = (OBD_MAX_IOCTL_BUFFER - size_round(sizeof(data)) - + size_round(sizeof(desc))) / sizeof(*uuidp); + if (max_ost_count > *ost_count) + max_ost_count = *ost_count; + + data.ioc_inllen1 = sizeof(desc); + data.ioc_inlbuf1 = (char *)&desc; + data.ioc_inllen2 = size_round(max_ost_count * sizeof(*uuidp)); + data.ioc_inlbuf2 = (char *)uuidp; + + desc.ld_tgt_count = max_ost_count; + + if (obd_ioctl_pack(&data, &buf, OBD_MAX_IOCTL_BUFFER)) { + fprintf(stderr, "internal buffer error packing\n"); + rc = EINVAL; + goto out; } - rc = ioctl(dirfd(dir), OBD_IOC_LOV_GET_CONFIG, param->buf); + rc = ioctl(fd, OBD_IOC_LOV_GET_CONFIG, buf); if (rc) { - err_msg("error getting LOV config from %s", dname); - return (param->obduuid ? errno : 0); + err_msg("error getting LOV config"); + rc = errno; + goto out; } - if (obd_ioctl_unpack(&param->data, param->buf, param->buflen)) { - err_msg("invalid reply from ioctl from %s", dname); - return (param->obduuid ? EINVAL : 0); + if (obd_ioctl_unpack(&data, buf, OBD_MAX_IOCTL_BUFFER)) { + fprintf(stderr, "invalid reply from ioctl"); + rc = EINVAL; + goto out; } - obdcount = param->desc.ld_tgt_count; + *ost_count = desc.ld_tgt_count; +out: + free(buf); + + return 0; +} + +static int setup_obd_uuids(DIR *dir, char *dname, struct find_param *param) +{ + struct obd_uuid uuids[1024], *uuidp; + int obdcount = 1024; + int rc, i; + + param->got_uuids = 1; + + rc = llapi_lov_get_uuids(dirfd(dir), uuids, &obdcount); + if (rc != 0) + return (param->obduuid ?
rc : 0); + if (obdcount == 0) return 0; if (param->obduuid) { - for (i = 0, uuidp = param->uuids; i < obdcount; i++, uuidp++) { + for (i = 0, uuidp = uuids; i < obdcount; i++, uuidp++) { if (strncmp(param->obduuid->uuid, uuidp->uuid, sizeof(*uuidp)) == 0) { param->obdindex = i; @@ -228,7 +221,7 @@ static int get_obd_uuids(DIR *dir, char *dname, struct find_param *param) } } else if (!param->quiet) { printf("OBDS:\n"); - for (i = 0, uuidp = param->uuids; i < obdcount; i++, uuidp++) + for (i = 0, uuidp = uuids; i < obdcount; i++, uuidp++) printf("%4d: %s\n", i, uuidp->uuid); } @@ -289,7 +282,7 @@ void lov_dump_user_lmm_v1(struct lov_user_md_v1 *lum, char *dname, char *fname, } } -void lov_dump_user_lmm(struct find_param *param, char *dname, char *fname) +void llapi_lov_dump_user_lmm(struct find_param *param, char *dname, char *fname) { switch(*(__u32 *)param->lum) { /* lum->lmm_magic */ case LOV_USER_MAGIC_V1: @@ -303,7 +296,7 @@ void lov_dump_user_lmm(struct find_param *param, char *dname, char *fname) } } -int get_file_stripe(char *path, struct lov_user_md *lum) +int llapi_file_get_stripe(char *path, struct lov_user_md *lum) { char *dname, *fname; int fd, rc = 0; @@ -346,12 +339,18 @@ int get_file_stripe(char *path, struct lov_user_md *lum) return rc; } +/* short term backwards compat only */ +int op_get_file_stripe(char *path, struct lov_user_md *lum) +{ + return llapi_file_get_stripe(path, lum); +} + static int process_file(DIR *dir, char *dname, char *fname, struct find_param *param) { int rc; - strncpy((char *)param->lum, fname, param->buflen); + strncpy((char *)param->lum, fname, param->lumlen); rc = ioctl(dirfd(dir), IOC_MDC_GETSTRIPE, (void *)param->lum); if (rc) { @@ -373,7 +372,7 @@ static int process_file(DIR *dir, char *dname, char *fname, return rc; } - lov_dump_user_lmm(param, dname, fname); + llapi_lov_dump_user_lmm(param, dname, fname); return 0; } @@ -407,13 +406,13 @@ static int process_dir(DIR *dir, char *dname, struct find_param *param) int rc; 
if (!param->got_uuids) { - rc = get_obd_uuids(dir, dname, param); + rc = setup_obd_uuids(dir, dname, param); if (rc) return rc; } /* retrieve dir's stripe info */ - strncpy((char *)param->lum, dname, param->buflen); + strncpy((char *)param->lum, dname, param->lumlen); rc = ioctl(dirfd(dir), LL_IOC_LOV_GETSTRIPE, (void *)param->lum); if (rc) { if (errno == ENODATA) { @@ -425,7 +424,7 @@ static int process_dir(DIR *dir, char *dname, struct find_param *param) return errno; } } else { - lov_dump_user_lmm(param, dname, ""); + llapi_lov_dump_user_lmm(param, dname, ""); } /* Handle the contents of the directory */ @@ -513,7 +512,7 @@ static int process_path(char *path, struct find_param *param) rc = errno; } else { if (!param->got_uuids) - rc = get_obd_uuids(dir, dname, param); + rc = setup_obd_uuids(dir, dname, param); if (rc == 0) rc = process_file(dir, dname, fname, param); closedir(dir); @@ -523,9 +522,8 @@ static int process_path(char *path, struct find_param *param) return rc; } - -int op_find(char *path, struct obd_uuid *obduuid, int recursive, - int verbose, int quiet) +int llapi_find(char *path, struct obd_uuid *obduuid, int recursive, + int verbose, int quiet) { struct find_param param; int ret = 0; @@ -556,7 +554,7 @@ out: #define MAX_STRING_SIZE 128 #define DEVICES_LIST "/proc/fs/lustre/devices" -int op_check(int type_num, char **obd_type, char *dir) +int llapi_target_check(int type_num, char **obd_type, char *dir) { char buf[MAX_STRING_SIZE]; FILE *fp = fopen(DEVICES_LIST, "r"); @@ -564,8 +562,8 @@ int op_check(int type_num, char **obd_type, char *dir) int i; if (fp == NULL) { - fprintf(stderr, "error: %s could not open file " - DEVICES_LIST " .\n", strerror(rc = errno)); + fprintf(stderr, "error: %s opening "DEVICES_LIST"\n", + strerror(rc = errno)); return rc; } @@ -618,7 +616,7 @@ int op_check(int type_num, char **obd_type, char *dir) #undef MAX_STRING_SIZE -int op_catinfo(char *dir, char *keyword, char *node_name) +int llapi_catinfo(char *dir, char 
*keyword, char *node_name) { char raw[OBD_MAX_IOCTL_BUFFER]; char out[LLOG_CHUNK_SIZE]; diff --git a/lustre/utils/llmount.c b/lustre/utils/llmount.c index 3ac52de87255cee326c295a1b99daafc93d78366..980f9fedb5f0299f01aaa3c40f80c6dbfc834093 100644 --- a/lustre/utils/llmount.c +++ b/lustre/utils/llmount.c @@ -387,7 +387,11 @@ main(int argc, char * const argv[]) rc = mount(source, target, "lustre", 0, (void *)&lmd); if (rc) { + rc = errno; perror(argv[0]); + if (rc == ENODEV) + fprintf(stderr, "Are the lustre modules loaded?\n" + "Check /etc/modules.conf and /proc/filesystems\n"); } else { update_mtab_entry(source, target, "lustre", options, 0, 0, 0); } diff --git a/lustre/utils/lmc b/lustre/utils/lmc index 8d3d260c2fbffbbed30796e67ebc61317aebb549..b9a3e71b2674d45ca647e708edaf225ef3ce4458 100755 --- a/lustre/utils/lmc +++ b/lustre/utils/lmc @@ -87,8 +87,8 @@ Object creation command summary: --failover --dev path --backdev path - --fstype extN|ext3 - --backfstype ext3|tmpfs + --fstype ldiskfs|ext3 + --backfstype ldiskfs|ext3|tmpfs --size size --nspath --journal_size size @@ -111,8 +111,8 @@ Object creation command summary: --dev path --backdev path --size size - --fstype extN|ext3 - --backfstype ext3|tmpfs + --fstype ldiskfs|ext3 + --backfstype ldiskfs|ext3|tmpfs --journal_size size --inode_size size --osdtype obdecho|obdfilter @@ -198,8 +198,8 @@ lmc_options = [ ('dev', "Path of the device on local system.", PARAM,""), ('backdev', "Path of the device for backing storage on local system.", PARAM,""), ('size', "Specify the size of the device if needed.", PARAM,"0"), - ('journal_size', "Specify new journal size for underlying ext3 file system.", PARAM,"0"), - ('inode_size', "Specify new inode size for underlying ext3 file system.", PARAM,"0"), + ('journal_size', "Specify new journal size for underlying file system.", PARAM,"0"), + ('inode_size', "Specify new inode size for underlying file system.", PARAM,"0"), ('fstype', "Optional argument to specify the filesystem type.", 
PARAM, "ext3"), ('backfstype', "Optional argument to specify the backing filesystem type.", PARAM, "ext3"), ('mkfsoptions', "Optional argument to mkfs.", PARAM, ""), @@ -387,17 +387,17 @@ class GenConfig: ldlm = self.newService("ldlm", name, uuid) return ldlm - def osd(self, name, uuid, fs, osdtype, devname, format, ost_uuid, + def osd(self, name, uuid, fstype, osdtype, devname, format, ost_uuid, node_uuid, dev_size=0, journal_size=0, inode_size=0, nspath="", - mkfsoptions="", mountfsoptions="", backfs="", backdevname=""): + mkfsoptions="", mountfsoptions="", backfstype="", backdevname=""): osd = self.newService("osd", name, uuid) osd.setAttribute('osdtype', osdtype) osd.appendChild(self.ref("target", ost_uuid)) osd.appendChild(self.ref("node", node_uuid)) - if fs: - self.addElement(osd, "fstype", fs) - if backfs: - self.addElement(osd, "backfstype", backfs) + if fstype: + self.addElement(osd, "fstype", fstype) + if backfstype: + self.addElement(osd, "backfstype", backfstype) if backdevname: self.addElement(osd, "backdevpath", backdevname) if devname: @@ -454,14 +454,14 @@ class GenConfig: self.addElement(mds, "group", group) return mds - def mdsdev(self, name, uuid, fs, devname, format, node_uuid, + def mdsdev(self, name, uuid, fstype, devname, format, node_uuid, mds_uuid, dev_size=0, journal_size=0, inode_size=256, - nspath="", mkfsoptions="", mountfsoptions="", backfs="", + nspath="", mkfsoptions="", mountfsoptions="", backfstype="", backdevname=""): mdd = self.newService("mdsdev", name, uuid) - self.addElement(mdd, "fstype", fs) - if backfs: - self.addElement(mdd, "backfstype", backfs) + self.addElement(mdd, "fstype", fstype) + if backfstype: + self.addElement(mdd, "backfstype", backfstype) dev = self.addElement(mdd, "devpath", devname) if backdevname: self.addElement(mdd, "backdevpath", backdevname) @@ -797,7 +797,6 @@ def add_ost(gen, lustre, options): devname = '' backdevname = '' size = 0 - fstype = '' journal_size = '' inode_size = '' mkfsoptions = '' diff 
--git a/lustre/utils/obd.c b/lustre/utils/obd.c index d0478106ba57ddc9f6afd96b39114c169b0f9a22..e754a90ac091a651f51698e5eb3d1ecde8ef3a08 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -779,22 +779,21 @@ int jt_obd_list(int argc, char **argv) int rc; char buf[MAX_STRING_SIZE]; FILE *fp = fopen(DEVICES_LIST, "r"); - + if (fp == NULL) { - fprintf(stderr, "error: %s: %s could not open file " - DEVICES_LIST " .\n", + fprintf(stderr, "error: %s: %s opening "DEVICES_LIST"\n", jt_cmdname(argv[0]), strerror(rc = errno)); return rc; } - + if (argc != 1) return CMD_HELP; - + while (fgets(buf, sizeof(buf), fp) != NULL) printf("%s", buf); - + fclose(fp); - + return 0; }