From 9b6f9d17a35188f5f4dbfae840164b999a7a78a2 Mon Sep 17 00:00:00 2001
From: adilger <adilger>
Date: Tue, 17 Apr 2007 23:43:17 +0000
Subject: [PATCH] Branch HEAD

Description: data loss for recently-modified files
Details    : In some cases it is possible that recently written or created
	     files may not be written to disk in a timely manner (this should
	     normally be within 30s unless client IO load is very high).
	     The problem appears after a client crash or client eviction as
	     zero-length files, or as files that are a multiple of 1MB in
	     size but missing data at the end.

	     This problem is more likely to be hit on clients where files are
	     repeatedly created and unlinked in the same directory, clients
	     have a large amount of RAM, have many CPUs, the filesystem has
	     many OSTs, the clients are rebooted frequently, and/or the files
	     are not accessed by other nodes after being written.

	     The presence of the problem can be detected by looking at
	     /proc/sys/fs/inode-state.  If the first number (nr_inodes) is
	     smaller than the second (nr_unused) then dirty files will not
	     be flushed automatically to disk.  "sync; sleep 10" should be
	     run several times on the node before unmounting it to update
	     Lustre (this is also safe to run on nodes without this problem).
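
	     As a rough illustration only (not part of this patch), the
	     check can be automated.  This C sketch assumes the first two
	     fields of /proc/sys/fs/inode-state are nr_inodes and
	     nr_unused, as described above:

	     #include <stdio.h>

	     int main(void)
	     {
	         long nr_inodes, nr_unused;
	         FILE *f = fopen("/proc/sys/fs/inode-state", "r");

	         if (f == NULL ||
	             fscanf(f, "%ld %ld", &nr_inodes, &nr_unused) != 2) {
	             perror("/proc/sys/fs/inode-state");
	             return 1;
	         }
	         fclose(f);

	         /* nr_inodes < nr_unused means dirty files will not be
	          * flushed automatically to disk */
	         if (nr_inodes < nr_unused)
	             printf("affected: nr_inodes=%ld < nr_unused=%ld\n",
	                    nr_inodes, nr_unused);
	         else
	             printf("ok: nr_inodes=%ld, nr_unused=%ld\n",
	                    nr_inodes, nr_unused);
	         return 0;
	     }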

	     There is also a related kernel bug in the RHEL4 2.6.9 kernel
	     that can cause this same problem, so customers using that kernel
	     also need to update the kernel in addition to Lustre.  In order
	     to properly fix this bug, the RHEL3 2.4.21 kernel is also updated.

	     It is normal that files written just before a client crash (less
	     than 30s) may not yet have been flushed to disk, even for local
	     filesystems.
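
	     For background on the lustre/llite/namei.c change in this
	     patch, the following is a minimal user-space model (not
	     kernel code; the structures are simplified stand-ins) of why
	     the fix calls __iget() instead of a bare
	     atomic_inc(&inode->i_count): __iget() also takes a
	     previously-unreferenced inode off the unused list and
	     corrects inodes_stat.nr_unused, which a raw reference-count
	     increment does not.

	     #include <stdio.h>

	     struct inode {                  /* toy stand-in              */
	         int i_count;                /* reference count           */
	         int on_unused_list;         /* queued on inode_unused?   */
	     };

	     static struct { int nr_inodes, nr_unused; }
	         inodes_stat = { 2, 2 };     /* two cached, unused inodes */

	     /* bare increment: the inode stays on the unused list and
	      * nr_unused is never corrected -- the skew that the
	      * inode-state check above detects */
	     static void bare_grab(struct inode *inode)
	     {
	         inode->i_count++;
	     }

	     /* model of __iget(): reviving an unreferenced inode also
	      * pulls it off the unused list and fixes the statistics */
	     static void model_iget(struct inode *inode)
	     {
	         if (inode->i_count == 0 && inode->on_unused_list) {
	             inode->on_unused_list = 0;
	             inodes_stat.nr_unused--;
	         }
	         inode->i_count++;
	     }

	     int main(void)
	     {
	         struct inode a = { 0, 1 }, b = { 0, 1 };

	         bare_grab(&a);   /* nr_unused still counts 'a'       */
	         model_iget(&b);  /* nr_unused drops for revived 'b'  */
	         printf("nr_unused=%d a.i_count=%d b.i_count=%d\n",
	                inodes_stat.nr_unused, a.i_count, b.i_count);
	         return 0;
	     }
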
i=green(original patch), i=shadow
b=12181, b=12203
---
 lustre/ChangeLog                              | 110 ++++++++++++++++--
 lustre/autoconf/lustre-version.ac             |   2 +-
 .../patches/inode-nr_unused-2.6.9-rhel4.patch |  46 ++++++++
 .../patches/vfs_intent-2.4.21-rhel.patch      |  31 ++++-
 lustre/llite/namei.c                          |   2 +-
 5 files changed, 181 insertions(+), 10 deletions(-)
 create mode 100644 lustre/kernel_patches/patches/inode-nr_unused-2.6.9-rhel4.patch

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 97ba66e599..8cdc3d7d3d 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -34,23 +34,19 @@ Details    : The on-disk ldiskfs filesystem has added support for nanosecond
 	     resolution timestamps.  There is not yet support for this at
 	     the Lustre filesystem level.
 
-Severity   : enhancement 
-Bugzilla   : 10802
-Description: Move random uuid functions to prng.c.
-
 Severity   : normal
 Frequency  : during server recovery
 Bugzilla   : 11203
 Description: MDS failing to send precreate requests due to OSCC_FLAG_RECOVERING
 Details    : request with rq_no_resend flag not awake l_wait_event if they get a
-             timeout.
+	     timeout.
 
 Severity   : minor
 Frequency  : nfs export on patchless client
 Bugzilla   : 11970
 Description: connectathon hang when test nfs export over patchless client
 Details    : Disconnected dentry cannot be found with lookup, so we do not need 
-             to unhash it or make it invalid
+	     to unhash it or make it invalid
 
 --------------------------------------------------------------------------------
 
@@ -284,9 +280,109 @@ Description: QOS code breaks on skipped indicies
 Details    : Add checks for missing OST indicies in the QOS code, so OSTs
 	     created with --index need not be sequential.
 
+Severity   : enhancement
+Bugzilla   : 11264
+Description: Add uninit_groups feature to ldiskfs2 to speed up e2fsck
+Details    : The uninit_groups feature works in conjunction with the kernel
+	     filesystem code (ldiskfs2 only) and e2fsprogs-1.39-cfs6 to speed
+	     up the pass1 processing of e2fsck.  This is a read-only feature
+	     in ldiskfs2 only, so older kernels and current ldiskfs cannot
+	     mount filesystems that have had this feature enabled.
+
+Severity   : enhancement
+Bugzilla   : 10816
+Description: Improve multi-block allocation algorithm to avoid fragmentation
+Details    : The mballoc3 code (ldiskfs2 only) adds new mechanisms to improve
+	     allocation locality and avoid filesystem fragmentation.
+
 ------------------------------------------------------------------------------
 
 TBD         Cluster File Systems, Inc. <info@clusterfs.com>
+       * version 1.4.12
+       * Support for kernels:
+        2.4.21-47.0.1.EL (RHEL 3)
+        2.6.5-7.283 (SLES 9)
+        2.6.9-42.0.10.EL (RHEL 4)
+        2.6.12.6 vanilla (kernel.org)
+        2.6.16.27-0.9 (SLES 10)
+       * Recommended e2fsprogs version: 1.39.cfs6
+       * Note that reiserfs quotas are disabled on SLES 10 in this kernel
+       * bug fixes
+
+------------------------------------------------------------------------------
+
+2007-04-30  Cluster File Systems, Inc. <info@clusterfs.com>
+       * version 1.4.11
+       * Support for kernels:
+        2.4.21-47.0.1.EL (RHEL 3)
+        2.6.5-7.283 (SLES 9)
+        2.6.9-42.0.10.EL (RHEL 4)
+        2.6.12.6 vanilla (kernel.org)
+        2.6.16.27-0.9 (SLES 10)
+       * Recommended e2fsprogs version: 1.39.cfs6
+       * Note that reiserfs quotas are disabled on SLES 10 in this kernel
+       * bug fixes
+
+Severity   : critical
+Frequency  : occasional, depends on client load and configuration
+Bugzilla   : 12181, 12203
+Description: data loss for recently-modified files
+Introduced : 1.4.6
+Details    : In some cases it is possible that recently written or created
+	     files may not be written to disk in a timely manner (this should
+	     normally be within 30s unless client IO load is very high).
+	     The problem appears after a client crash or client eviction as
+	     zero-length files, or as files that are a multiple of 1MB in
+	     size but missing data at the end.
+
+	     This problem is more likely to be hit on clients where files are
+	     repeatedly created and unlinked in the same directory, clients
+	     have a large amount of RAM, have many CPUs, the filesystem has
+	     many OSTs, the clients are rebooted frequently, and/or the files
+	     are not accessed by other nodes after being written.
+
+	     The presence of the problem can be detected by looking at
+	     /proc/sys/fs/inode-state.  If the first number (nr_inodes) is
+	     smaller than the second (nr_unused) then dirty files will not
+	     be flushed automatically to disk.  "sync; sleep 10" should be
+	     run several times on the node before unmounting it to update
+	     Lustre (this is also safe to run on nodes without this problem).
+
+	     There is also a related kernel bug in the RHEL4 2.6.9 kernel
+	     that can cause this same problem, so customers using that kernel
+	     also need to update the kernel in addition to Lustre.  In order
+	     to properly fix this bug, the RHEL3 2.4.21 kernel is also updated.
+
+	     It is normal that files written just before a client crash (less
+	     than 30s) may not yet have been flushed to disk, even for local
+	     filesystems.
+
+Severity   : normal
+Frequency  : frequent on thin XT3 nodes
+Bugzilla   : 10802
+Description: UUID collision on thin XT3 Linux nodes
+Details    : UUIDs on Compute Node Linux XT3 nodes were not generated
+	     randomly, since we relied on an insufficiently-seeded PRNG.
+
+Severity   : normal
+Frequency  : rare
+Bugzilla   : 11693
+Description: OSS hangs after "All ost request buffers busy"
+Details    : A deadlock between quota and journal operations caused OSS
+	     hangs after printing "All ost request buffers busy."
+
+Severity   : minor
+Frequency  : always on liblustre builds
+Bugzilla   : 11175
+Description: Clean up compiler warnings on liblustre
+
+Severity   : enhancement
+Bugzilla   : 10802
+Description: Move random uuid functions to prng.c.
+
+------------------------------------------------------------------------------
+
+2007-04-01  Cluster File Systems, Inc. <info@clusterfs.com>
        * version 1.4.10
        * Support for kernels:
         2.6.16.21-0.8 (SLES10)
@@ -294,7 +390,7 @@ TBD         Cluster File Systems, Inc. <info@clusterfs.com>
         2.6.5-7.276 (SLES 9)
         2.4.21-47.0.1.EL (RHEL 3)
         2.6.12.6 vanilla (kernel.org)
-       * Recommended e2fsprogs version: 1.39.cfs2-0
+       * Recommended e2fsprogs version: 1.39.cfs5
 
 Severity   : major
 Frequency  : liblustre (e.g. catamount) on a large cluster with >= 8 OSTs/OSS
diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac
index 72efbd612d..5eee07bd92 100644
--- a/lustre/autoconf/lustre-version.ac
+++ b/lustre/autoconf/lustre-version.ac
@@ -1,7 +1,7 @@
 m4_define([LUSTRE_MAJOR],[1])
 m4_define([LUSTRE_MINOR],[6])
 m4_define([LUSTRE_PATCH],[0])
-m4_define([LUSTRE_FIX],[0])
+m4_define([LUSTRE_FIX],[90])
 
 dnl # liblustre delta is 0.0.1.32 , next version with fixes is ok, but
 dnl # after following release candidate/beta would spill this warning already.
diff --git a/lustre/kernel_patches/patches/inode-nr_unused-2.6.9-rhel4.patch b/lustre/kernel_patches/patches/inode-nr_unused-2.6.9-rhel4.patch
new file mode 100644
index 0000000000..4f7f591bb0
--- /dev/null
+++ b/lustre/kernel_patches/patches/inode-nr_unused-2.6.9-rhel4.patch
@@ -0,0 +1,46 @@
+diff -urp b1_4.RH_2_6_9_42_0_3.old/fs/fs-writeback.c b1_4.RH_2_6_9_42_0_3/fs/fs-writeback.c
+--- b1_4.RH_2_6_9_42_0_3.old/fs/fs-writeback.c	2006-10-23 13:33:05.000000000 +0300
++++ b1_4.RH_2_6_9_42_0_3/fs/fs-writeback.c	2007-04-15 00:31:43.000000000 +0300
+@@ -230,7 +230,6 @@ __sync_single_inode(struct inode *inode,
+ 			 * The inode is clean, unused
+ 			 */
+ 			list_move(&inode->i_list, &inode_unused);
+-			inodes_stat.nr_unused++;
+ 		}
+ 	}
+ 	wake_up_inode(inode);
+@@ -244,6 +243,11 @@ static int
+ __writeback_single_inode(struct inode *inode,
+ 			struct writeback_control *wbc)
+ {
++	if (!atomic_read(&inode->i_count))
++		WARN_ON(!(inode->i_state & I_WILL_FREE));
++	else
++		WARN_ON(inode->i_state & I_WILL_FREE);
++
+ 	if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
+ 		list_move(&inode->i_list, &inode->i_sb->s_dirty);
+ 		return 0;
+@@ -253,10 +257,8 @@ __writeback_single_inode(struct inode *i
+ 	 * It's a data-integrity sync.  We must wait.
+ 	 */
+ 	while (inode->i_state & I_LOCK) {
+-		__iget(inode);
+ 		spin_unlock(&inode_lock);
+ 		__wait_on_inode(inode);
+-		iput(inode);
+ 		spin_lock(&inode_lock);
+ 	}
+ 	return __sync_single_inode(inode, wbc);
+Binary files b1_4.RH_2_6_9_42_0_3.old/fs/fs-writeback.o and b1_4.RH_2_6_9_42_0_3/fs/fs-writeback.o differ
+diff -urp b1_4.RH_2_6_9_42_0_3.old/fs/inode.c b1_4.RH_2_6_9_42_0_3/fs/inode.c
+--- b1_4.RH_2_6_9_42_0_3.old/fs/inode.c	2006-12-14 15:20:40.000000000 +0200
++++ b1_4.RH_2_6_9_42_0_3/fs/inode.c	2007-04-15 00:31:28.000000000 +0300
+@@ -1054,6 +1054,7 @@ static void generic_forget_inode(struct 
+ 	if (inode->i_data.nrpages)
+ 		truncate_inode_pages(&inode->i_data, 0);
+ 	clear_inode(inode);
++	wake_up_inode(inode);
+ 	destroy_inode(inode);
+ }
+ 
diff --git a/lustre/kernel_patches/patches/vfs_intent-2.4.21-rhel.patch b/lustre/kernel_patches/patches/vfs_intent-2.4.21-rhel.patch
index 33bf438e7f..83e1f2ddc7 100644
--- a/lustre/kernel_patches/patches/vfs_intent-2.4.21-rhel.patch
+++ b/lustre/kernel_patches/patches/vfs_intent-2.4.21-rhel.patch
@@ -1801,7 +1801,15 @@ Index: linux-2.4.21-40.EL/include/linux/fs.h
  
  extern void inode_init_once(struct inode *);
  extern void iput(struct inode *);
-@@ -1625,6 +1651,8 @@ extern struct file_operations generic_ro
+@@ -1512,6 +1538,7 @@ static inline struct inode *iget_locked(
+ 	return iget4_locked(sb, ino, NULL, NULL);
+ }
+ 
++extern void __iget(struct inode * inode);
+ extern void clear_inode(struct inode *);
+ extern struct inode *new_inode(struct super_block *sb);
+ extern void remove_suid(struct inode *inode);
+@@ -1629,6 +1656,8 @@ extern struct file_operations generic_ro
  
  extern int vfs_readlink(struct dentry *, char *, int, const char *);
  extern int vfs_follow_link(struct nameidata *, const char *);
@@ -1889,3 +1897,24 @@ Index: linux-2.4.21-40.EL/kernel/ksyms.c
  EXPORT_SYMBOL(page_readlink);
  EXPORT_SYMBOL(page_follow_link);
  EXPORT_SYMBOL(page_symlink_inode_operations);
+@@ -592,6 +593,7 @@ EXPORT_SYMBOL(si_meminfo);
+ EXPORT_SYMBOL(sys_tz);
+ EXPORT_SYMBOL(file_fsync);
+ EXPORT_SYMBOL(fsync_buffers_list);
++EXPORT_SYMBOL(__iget);
+ EXPORT_SYMBOL(clear_inode);
+ EXPORT_SYMBOL(___strtok);
+ EXPORT_SYMBOL(init_special_inode);
+Index: linux-2.4.21-47.0.1.EL/fs/inode.c
+===================================================================
+--- linux-2.4.21-47.0.1.EL.orig/fs/inode.c
++++ linux-2.4.21-47.0.1.EL/fs/inode.c
+@@ -278,7 +278,7 @@ static inline void write_inode(struct in
+ 		inode->i_sb->s_op->write_inode(inode, sync);
+ }
+ 
+-static inline void __iget(struct inode * inode)
++void __iget(struct inode * inode)
+ {
+ 	if (atomic_read(&inode->i_count)) {
+ 		atomic_inc(&inode->i_count);
diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c
index f06d1ccb0e..93d063d88e 100644
--- a/lustre/llite/namei.c
+++ b/lustre/llite/namei.c
@@ -84,7 +84,7 @@ static int ll_test_inode(struct inode *inode, void *opaque)
                 /* add "duplicate" inode into deathrow for destroy */
                 spin_lock(&sbi->ll_deathrow_lock);
                 if (list_empty(&lli->lli_dead_list)) {
-                        atomic_inc(&inode->i_count);
+                        __iget(inode);
                         list_add(&lli->lli_dead_list, &sbi->ll_deathrow);
                 }
                 spin_unlock(&sbi->ll_deathrow_lock);
-- 
GitLab