Skip to content
Snippets Groups Projects
Commit a4f6739f authored by girish's avatar girish
Browse files

Add kernel part of Journal Checksum feature.

b=10657
i=adilger
i=kalpak
parent a65710af
No related branches found
No related tags found
No related merge requests found
...@@ -46,6 +46,18 @@ Bugzilla : 11248 ...@@ -46,6 +46,18 @@ Bugzilla : 11248
Description: merge and cleanup kernel patches. Description: merge and cleanup kernel patches.
Details : Remove mnt_lustre_list in vfs_intent-2.6-rhel4.patch. Details : Remove mnt_lustre_list in vfs_intent-2.6-rhel4.patch.
Severity : normal
Bugzilla : 10657
Description: Add journal checksum support.(Kernel part)
Details : The journal checksum feature adds two new flags i.e
JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT and
JBD2_FEATURE_COMPAT_CHECKSUM. JBD2_FEATURE_CHECKSUM flag
indicates that the commit block contains the checksum for
the blocks described by the descriptor blocks. Now commit
record can be sent to disk without waiting for descriptor
blocks to be written to disk. This behavior is controlled
using JBD2_FEATURE_ASYNC_COMMIT flag.
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
2007-08-27 Cluster File Systems, Inc. <info@clusterfs.com> 2007-08-27 Cluster File Systems, Inc. <info@clusterfs.com>
......
Index: linux-2.6.16.46-0.14/fs/jbd/commit.c
===================================================================
--- linux-2.6.16.46-0.14.orig/fs/jbd/commit.c
+++ linux-2.6.16.46-0.14/fs/jbd/commit.c
@@ -22,6 +22,7 @@
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/jiffies.h>
+#include <linux/crc32.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -94,19 +95,23 @@ static int inverted_lock(journal_t *jour
return 1;
}
-/* Done it all: now write the commit record. We should have
+/*
+ * Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
* entirely.
*
* Returns 1 if the journal needs to be aborted or 0 on success
*/
-static int journal_write_commit_record(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_commit_record(journal_t *journal,
+ transaction_t *commit_transaction,
+ struct buffer_head **cbh,
+ __u32 crc32_sum)
{
struct journal_head *descriptor;
+ struct commit_header *tmp;
struct buffer_head *bh;
- int i, ret;
+ int ret;
int barrier_done = 0;
if (is_journal_aborted(journal))
@@ -118,21 +123,34 @@ static int journal_write_commit_record(j
bh = jh2bh(descriptor);
- /* AKPM: buglet - add `i' to tmp! */
- for (i = 0; i < bh->b_size; i += 512) {
- journal_header_t *tmp = (journal_header_t*)bh->b_data;
- tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
- tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ tmp = (struct commit_header *)bh->b_data;
+ tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+ tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+ tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+
+ if (JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_COMPAT_CHECKSUM)) {
+ tmp->h_chksum_type = JFS_CRC32_CHKSUM;
+ tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE;
+ tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
}
- JBUFFER_TRACE(descriptor, "write commit block");
+ JBUFFER_TRACE(descriptor, "submit commit block");
+ lock_buffer(bh);
+
set_buffer_dirty(bh);
- if (journal->j_flags & JFS_BARRIER) {
+ set_buffer_uptodate(bh);
+ bh->b_end_io = journal_end_buffer_io_sync;
+
+ if (journal->j_flags & JFS_BARRIER &&
+ !JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+
set_buffer_ordered(bh);
barrier_done = 1;
}
- ret = sync_dirty_buffer(bh);
+ ret = submit_bh(WRITE, bh);
+
/* is it possible for another commit to fail at roughly
* the same time as this one? If so, we don't want to
* trust the barrier flag in the super, but instead want
@@ -153,15 +171,74 @@ static int journal_write_commit_record(j
clear_buffer_ordered(bh);
set_buffer_uptodate(bh);
set_buffer_dirty(bh);
- ret = sync_dirty_buffer(bh);
+ ret = submit_bh(WRITE, bh);
}
- put_bh(bh); /* One for getblk() */
- journal_put_journal_head(descriptor);
+ *cbh = bh;
+ return ret;
+}
- return (ret == -EIO);
+/*
+ * This function along with journal_submit_commit_record
+ * allows to write the commit record asynchronously.
+ */
+static int journal_wait_on_commit_record(struct buffer_head *bh)
+{
+ int ret = 0;
+
+ clear_buffer_dirty(bh);
+ wait_on_buffer(bh);
+
+ if (unlikely(!buffer_uptodate(bh)))
+ ret = -EIO;
+ put_bh(bh); /* One for getblk() */
+ journal_put_journal_head(bh2jh(bh));
+
+ return ret;
}
/*
+ * Wait for all submitted IO to complete.
+ */
+static int journal_wait_on_locked_list(journal_t *journal,
+ transaction_t *commit_transaction)
+{
+ int ret = 0;
+ struct journal_head *jh;
+
+ while (commit_transaction->t_locked_list) {
+ struct buffer_head *bh;
+
+ jh = commit_transaction->t_locked_list->b_tprev;
+ bh = jh2bh(jh);
+ get_bh(bh);
+ if (buffer_locked(bh)) {
+ spin_unlock(&journal->j_list_lock);
+ wait_on_buffer(bh);
+ if (unlikely(!buffer_uptodate(bh)))
+ ret = -EIO;
+ spin_lock(&journal->j_list_lock);
+ }
+ if (!inverted_lock(journal, bh)) {
+ put_bh(bh);
+ spin_lock(&journal->j_list_lock);
+ continue;
+ }
+ if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+ __journal_unfile_buffer(jh);
+ jbd_unlock_bh_state(bh);
+ journal_remove_journal_head(bh);
+ put_bh(bh);
+ } else {
+ jbd_unlock_bh_state(bh);
+ }
+ put_bh(bh);
+ cond_resched_lock(&journal->j_list_lock);
+ }
+ return ret;
+}
+
+
+/*
* journal_commit_transaction
*
* The primary function for committing a transaction to the log. This
@@ -184,6 +261,8 @@ void journal_commit_transaction(journal_
int first_tag = 0;
int tag_flag;
int i;
+ struct buffer_head *cbh = NULL; /* For transactional checksums */
+ __u32 crc32_sum = ~0;
/*
* First job: lock down the current transaction and wait for
@@ -395,37 +474,14 @@ write_out_data:
}
/*
- * Wait for all previously submitted IO to complete.
+ * Wait for all previously submitted IO to complete if commit
+ * record is to be written synchronously.
*/
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
+ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT))
+ err = journal_wait_on_locked_list(journal,
+ commit_transaction);
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
spin_unlock(&journal->j_list_lock);
if (err)
@@ -598,6 +654,16 @@ write_out_data:
start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
+ /*
+ * Compute checksum.
+ */
+ if (JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_COMPAT_CHECKSUM)) {
+ crc32_sum = crc32_be(crc32_sum,
+ (void *)bh->b_data,
+ bh->b_size);
+ }
+
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
@@ -614,6 +680,23 @@ start_journal_io:
}
}
+ /* Done it all: now write the commit record asynchronously. */
+
+ if (JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ err = journal_submit_commit_record(journal, commit_transaction,
+ &cbh, crc32_sum);
+ if (err)
+ __journal_abort_hard(journal);
+
+ spin_lock(&journal->j_list_lock);
+ err = journal_wait_on_locked_list(journal,
+ commit_transaction);
+ spin_unlock(&journal->j_list_lock);
+ if (err)
+ __journal_abort_hard(journal);
+ }
+
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
@@ -712,9 +795,15 @@ wait_for_iobuf:
}
jbd_debug(3, "JBD: commit phase 6\n");
-
- if (journal_write_commit_record(journal, commit_transaction))
- err = -EIO;
+
+ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ err = journal_submit_commit_record(journal, commit_transaction,
+ &cbh, crc32_sum);
+ if (err)
+ __journal_abort_hard(journal);
+ }
+ err = journal_wait_on_commit_record(cbh);
if (err)
__journal_abort_hard(journal);
Index: linux-2.6.16.46-0.14/include/linux/jbd.h
===================================================================
--- linux-2.6.16.46-0.14.orig/include/linux/jbd.h
+++ linux-2.6.16.46-0.14/include/linux/jbd.h
@@ -142,6 +142,29 @@ typedef struct journal_header_s
__be32 h_sequence;
} journal_header_t;
+/*
+ * Checksum types.
+ */
+#define JFS_CRC32_CHKSUM 1
+#define JFS_MD5_CHKSUM 2
+#define JFS_SHA1_CHKSUM 3
+
+#define JFS_CRC32_CHKSUM_SIZE 4
+
+#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
+/*
+ * Commit block header for storing transactional checksums:
+ */
+struct commit_header
+{
+ __be32 h_magic;
+ __be32 h_blocktype;
+ __be32 h_sequence;
+ unsigned char h_chksum_type;
+ unsigned char h_chksum_size;
+ unsigned char h_padding[2];
+ __be32 h_chksum[JFS_CHECKSUM_BYTES];
+};
/*
* The block tag: used to describe a single buffer in the journal
@@ -228,12 +251,16 @@ typedef struct journal_superblock_s
((j)->j_format_version >= 2 && \
((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
-#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
+#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001
+
+#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
+#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
/* Features known to this kernel version: */
-#define JFS_KNOWN_COMPAT_FEATURES 0
+#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM
#define JFS_KNOWN_ROCOMPAT_FEATURES 0
-#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE
+#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE | \
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT
#ifdef __KERNEL__
@@ -1041,6 +1068,8 @@ extern int journal_check_available_fe
(journal_t *, unsigned long, unsigned long, unsigned long);
extern int journal_set_features
(journal_t *, unsigned long, unsigned long, unsigned long);
+extern int journal_clear_features
+ (journal_t *, unsigned long, unsigned long, unsigned long);
extern int journal_create (journal_t *);
extern int journal_load (journal_t *journal);
extern void journal_destroy (journal_t *);
Index: linux-2.6.16.46-0.14/fs/jbd/recovery.c
===================================================================
--- linux-2.6.16.46-0.14.orig/fs/jbd/recovery.c
+++ linux-2.6.16.46-0.14/fs/jbd/recovery.c
@@ -21,6 +21,7 @@
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/crc32.h>
#endif
/*
@@ -307,6 +308,37 @@ int journal_skip_recovery(journal_t *jou
return err;
}
+/*
+ * calc_chksums calculates the checksums for the blocks described in the
+ * descriptor block.
+ */
+static int calc_chksums(journal_t *journal, struct buffer_head *bh,
+ unsigned long *next_log_block, __u32 *crc32_sum)
+{
+ int i, num_blks, err;
+ unsigned io_block;
+ struct buffer_head *obh;
+
+ num_blks = count_tags(bh, journal->j_blocksize);
+ /* Calculate checksum of the descriptor block. */
+ *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
+
+ for (i = 0; i < num_blks; i++) {
+ io_block = (*next_log_block)++;
+ wrap(journal, *next_log_block);
+ err = jread(&obh, journal, io_block);
+ if (err) {
+ printk(KERN_ERR "JBD: IO error %d recovering block "
+ "%u in log\n", err, io_block);
+ return 1;
+ } else {
+ *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
+ obh->b_size);
+ }
+ }
+ return 0;
+}
+
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
@@ -318,6 +350,7 @@ static int do_one_pass(journal_t *journa
struct buffer_head * bh;
unsigned int sequence;
int blocktype;
+ __u32 crc32_sum = ~0; /* Transactional Checksums */
/* Precompute the maximum metadata descriptors in a descriptor block */
int MAX_BLOCKS_PER_DESC;
@@ -409,9 +442,24 @@ static int do_one_pass(journal_t *journa
switch(blocktype) {
case JFS_DESCRIPTOR_BLOCK:
/* If it is a valid descriptor block, replay it
- * in pass REPLAY; otherwise, just skip over the
- * blocks it describes. */
+ * in pass REPLAY; if journal_checksums enabled, then
+ * calculate checksums in PASS_SCAN, otherwise,
+ * just skip over the blocks it describes. */
if (pass != PASS_REPLAY) {
+ if (pass == PASS_SCAN &&
+ JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_COMPAT_CHECKSUM) &&
+ !info->end_transaction) {
+ if (calc_chksums(journal, bh,
+ &next_log_block,
+ &crc32_sum)) {
+ brelse(bh);
+ break;
+ }
+ brelse(bh);
+ continue;
+ }
+
next_log_block +=
count_tags(bh, journal->j_blocksize);
wrap(journal, next_log_block);
@@ -506,9 +554,97 @@ static int do_one_pass(journal_t *journa
continue;
case JFS_COMMIT_BLOCK:
- /* Found an expected commit block: not much to
- * do other than move on to the next sequence
+ /* How to differentiate between interrupted commit
+ * and journal corruption ?
+ *
+ * {nth transaction}
+ * Checksum Verification Failed
+ * |
+ * ____________________
+ * | |
+ * async_commit sync_commit
+ * | |
+ * | GO TO NEXT "Journal Corruption"
+ * | TRANSACTION
+ * |
+ * {(n+1)th transanction}
+ * |
+ * _______|______________
+ * | |
+ * Commit block found Commit block not found
+ * | |
+ * "Journal Corruption" |
+ * _____________|__________
+ * | |
+ * nth trans corrupt OR nth trans
+ * and (n+1)th interrupted interrupted
+ * before commit block
+ * could reach the disk.
+ * (Cannot find the difference in above
+ * mentioned conditions. Hence assume
+ * "Interrupted Commit".)
+ */
+
+ /* Found an expected commit block: if checksums
+ * are present verify them in PASS_SCAN; else not
+ * much to do other than move on to the next sequence
* number. */
+ if (pass == PASS_SCAN &&
+ JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_COMPAT_CHECKSUM)) {
+ int chksum_err, chksum_seen;
+ struct commit_header *cbh =
+ (struct commit_header *)bh->b_data;
+ unsigned found_chksum =
+ be32_to_cpu(cbh->h_chksum[0]);
+
+ chksum_err = chksum_seen = 0;
+
+ if (info->end_transaction) {
+ printk(KERN_ERR "JBD: Transaction %u "
+ "found to be corrupt.\n",
+ next_commit_ID - 1);
+ brelse(bh);
+ break;
+ }
+
+ if (crc32_sum == found_chksum &&
+ cbh->h_chksum_type == JFS_CRC32_CHKSUM &&
+ cbh->h_chksum_size ==
+ JFS_CRC32_CHKSUM_SIZE) {
+ chksum_seen = 1;
+ } else if (!(cbh->h_chksum_type == 0 &&
+ cbh->h_chksum_size == 0 &&
+ found_chksum == 0 &&
+ !chksum_seen)) {
+ /*
+ * If fs is mounted using an old kernel and then
+ * kernel with journal_chksum is used then we
+ * get a situation where the journal flag has
+ * checksum flag set but checksums are not
+ * present i.e chksum = 0, in the individual
+ * commit blocks.
+ * Hence to avoid checksum failures, in this
+ * situation, this extra check is added.
+ */
+ chksum_err = 1;
+ }
+
+ if (chksum_err) {
+ info->end_transaction = next_commit_ID;
+
+ if (!JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+ printk(KERN_ERR
+ "JBD: Transaction %u "
+ "found to be corrupt.\n",
+ next_commit_ID);
+ brelse(bh);
+ break;
+ }
+ }
+ crc32_sum = ~0;
+ }
brelse(bh);
next_commit_ID++;
continue;
@@ -543,9 +679,10 @@ static int do_one_pass(journal_t *journa
* transaction marks the end of the valid log.
*/
- if (pass == PASS_SCAN)
- info->end_transaction = next_commit_ID;
- else {
+ if (pass == PASS_SCAN) {
+ if (!info->end_transaction)
+ info->end_transaction = next_commit_ID;
+ } else {
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
if (info->end_transaction != next_commit_ID) {
Index: linux-2.6.16.46-0.14/fs/jbd/journal.c
===================================================================
--- linux-2.6.16.46-0.14.orig/fs/jbd/journal.c
+++ linux-2.6.16.46-0.14/fs/jbd/journal.c
@@ -64,6 +64,7 @@ EXPORT_SYMBOL(journal_update_format);
EXPORT_SYMBOL(journal_check_used_features);
EXPORT_SYMBOL(journal_check_available_features);
EXPORT_SYMBOL(journal_set_features);
+EXPORT_SYMBOL(journal_clear_features);
EXPORT_SYMBOL(journal_create);
EXPORT_SYMBOL(journal_load);
EXPORT_SYMBOL(journal_destroy);
@@ -1565,6 +1566,33 @@ int journal_set_features (journal_t *jou
return 1;
}
+/**
+ * int journal_clear_features () - Clear a given journal feature in the superblock
+ * @journal: Journal to act on.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Clear a given journal feature as present on the
+ * superblock. Returns true if the requested features could be reset.
+ *
+ */
+int journal_clear_features (journal_t *journal, unsigned long compat,
+ unsigned long ro, unsigned long incompat)
+{
+ journal_superblock_t *sb;
+
+ jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+ compat, ro, incompat);
+
+ sb = journal->j_superblock;
+
+ sb->s_feature_compat &= ~cpu_to_be32(compat);
+ sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
+ sb->s_feature_incompat &= ~cpu_to_be32(incompat);
+
+ return 1;
+}
/**
* int journal_update_format () - Update on-disk journal structure.
Index: linux-2.6.16.46-0.14/fs/Kconfig
===================================================================
--- linux-2.6.16.46-0.14.orig/fs/Kconfig
+++ linux-2.6.16.46-0.14/fs/Kconfig
@@ -140,6 +140,7 @@ config EXT3_FS_SECURITY
config JBD
tristate
+ select CRC32
help
This is a generic journaling layer for block devices. It is
currently used by the ext3 and OCFS2 file systems, but it could
Index: linux-2.6.16.46-0.14/Documentation/filesystems/ext3.txt
===================================================================
--- linux-2.6.16.46-0.14.orig/Documentation/filesystems/ext3.txt
+++ linux-2.6.16.46-0.14/Documentation/filesystems/ext3.txt
@@ -14,6 +14,16 @@ Options
When mounting an ext3 filesystem, the following option are accepted:
(*) == default
+journal_checksum Enable checksumming of the journal transactions.
+ This will allow the recovery code in e2fsck and the
+ kernel to detect corruption in the kernel. It is a
+ compatible change and will be ignored by older kernels.
+
+journal_async_commit Commit block can be written to disk without waiting
+ for descriptor blocks. If enabled older kernels cannot
+ mount the device. This will enable 'journal_checksum'
+ internally.
+
journal=update Update the ext3 file system's journal to the current
format.
Index: linux-2.6.18-8.1.8/fs/jbd/commit.c
===================================================================
--- linux-2.6.18-8.1.8.orig/fs/jbd/commit.c
+++ linux-2.6.18-8.1.8/fs/jbd/commit.c
@@ -21,6 +21,7 @@
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
+#include <linux/crc32.h>
/*
* Default IO end handler for temporary BJ_IO buffer_heads.
@@ -93,19 +94,23 @@ static int inverted_lock(journal_t *jour
return 1;
}
-/* Done it all: now write the commit record. We should have
+/*
+ * Done it all: now submit the commit record. We should have
* cleaned up our previous buffers by now, so if we are in abort
* mode we can now just skip the rest of the journal write
* entirely.
*
* Returns 1 if the journal needs to be aborted or 0 on success
*/
-static int journal_write_commit_record(journal_t *journal,
- transaction_t *commit_transaction)
+static int journal_submit_commit_record(journal_t *journal,
+ transaction_t *commit_transaction,
+ struct buffer_head **cbh,
+ __u32 crc32_sum)
{
struct journal_head *descriptor;
+ struct commit_header *tmp;
struct buffer_head *bh;
- int i, ret;
+ int ret;
int barrier_done = 0;
if (is_journal_aborted(journal))
@@ -117,21 +122,34 @@ static int journal_write_commit_record(j
bh = jh2bh(descriptor);
- /* AKPM: buglet - add `i' to tmp! */
- for (i = 0; i < bh->b_size; i += 512) {
- journal_header_t *tmp = (journal_header_t*)bh->b_data;
- tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
- tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
- tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+ tmp = (struct commit_header *)bh->b_data;
+ tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
+ tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
+ tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
+
+ if (JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_COMPAT_CHECKSUM)) {
+ tmp->h_chksum_type = JFS_CRC32_CHKSUM;
+ tmp->h_chksum_size = JFS_CRC32_CHKSUM_SIZE;
+ tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
}
- JBUFFER_TRACE(descriptor, "write commit block");
+ JBUFFER_TRACE(descriptor, "submit commit block");
+ lock_buffer(bh);
+
set_buffer_dirty(bh);
- if (journal->j_flags & JFS_BARRIER) {
+ set_buffer_uptodate(bh);
+ bh->b_end_io = journal_end_buffer_io_sync;
+
+ if (journal->j_flags & JFS_BARRIER &&
+ !JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+
set_buffer_ordered(bh);
barrier_done = 1;
}
- ret = sync_dirty_buffer(bh);
+ ret = submit_bh(WRITE, bh);
+
/* is it possible for another commit to fail at roughly
* the same time as this one? If so, we don't want to
* trust the barrier flag in the super, but instead want
@@ -152,14 +170,72 @@ static int journal_write_commit_record(j
clear_buffer_ordered(bh);
set_buffer_uptodate(bh);
set_buffer_dirty(bh);
- ret = sync_dirty_buffer(bh);
+ ret = submit_bh(WRITE, bh);
}
- put_bh(bh); /* One for getblk() */
- journal_put_journal_head(descriptor);
+ *cbh = bh;
+ return ret;
+}
- return (ret == -EIO);
+/*
+ * This function along with journal_submit_commit_record
+ * allows to write the commit record asynchronously.
+ */
+static int journal_wait_on_commit_record(struct buffer_head *bh)
+{
+ int ret = 0;
+
+ clear_buffer_dirty(bh);
+ wait_on_buffer(bh);
+
+ if (unlikely(!buffer_uptodate(bh)))
+ ret = -EIO;
+ put_bh(bh); /* One for getblk() */
+ journal_put_journal_head(bh2jh(bh));
+
+ return ret;
}
+/*
+ * Wait for all submitted IO to complete.
+ */
+static int journal_wait_on_locked_list(journal_t *journal,
+ transaction_t *commit_transaction)
+{
+ int ret = 0;
+ struct journal_head *jh;
+
+ while (commit_transaction->t_locked_list) {
+ struct buffer_head *bh;
+
+ jh = commit_transaction->t_locked_list->b_tprev;
+ bh = jh2bh(jh);
+ get_bh(bh);
+ if (buffer_locked(bh)) {
+ spin_unlock(&journal->j_list_lock);
+ wait_on_buffer(bh);
+ if (unlikely(!buffer_uptodate(bh)))
+ ret = -EIO;
+ spin_lock(&journal->j_list_lock);
+ }
+ if (!inverted_lock(journal, bh)) {
+ put_bh(bh);
+ spin_lock(&journal->j_list_lock);
+ continue;
+ }
+ if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+ __journal_unfile_buffer(jh);
+ jbd_unlock_bh_state(bh);
+ journal_remove_journal_head(bh);
+ put_bh(bh);
+ } else {
+ jbd_unlock_bh_state(bh);
+ }
+ put_bh(bh);
+ cond_resched_lock(&journal->j_list_lock);
+ }
+ return ret;
+}
++
void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
int i;
@@ -293,6 +369,8 @@ void journal_commit_transaction(journal_
int first_tag = 0;
int tag_flag;
int i;
+ struct buffer_head *cbh = NULL; /* For transactional checksums */
+ __u32 crc32_sum = ~0;
/*
* First job: lock down the current transaction and wait for
@@ -428,38 +506,14 @@ void journal_commit_transaction(journal_
journal_submit_data_buffers(journal, commit_transaction);
/*
- * Wait for all previously submitted IO to complete.
+ * Wait for all previously submitted IO to complete if commit
+ * record is to be written synchronously.
*/
spin_lock(&journal->j_list_lock);
- while (commit_transaction->t_locked_list) {
- struct buffer_head *bh;
-
- jh = commit_transaction->t_locked_list->b_tprev;
- bh = jh2bh(jh);
- get_bh(bh);
- if (buffer_locked(bh)) {
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- if (unlikely(!buffer_uptodate(bh)))
- err = -EIO;
- spin_lock(&journal->j_list_lock);
- }
- if (!inverted_lock(journal, bh)) {
- put_bh(bh);
- spin_lock(&journal->j_list_lock);
- continue;
- }
- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
- __journal_unfile_buffer(jh);
- jbd_unlock_bh_state(bh);
- journal_remove_journal_head(bh);
- put_bh(bh);
- } else {
- jbd_unlock_bh_state(bh);
- }
- put_bh(bh);
- cond_resched_lock(&journal->j_list_lock);
- }
+ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT))
+ err = journal_wait_on_locked_list(journal,
+ commit_transaction);
spin_unlock(&journal->j_list_lock);
if (err)
@@ -627,6 +681,16 @@ void journal_commit_transaction(journal_
start_journal_io:
for (i = 0; i < bufs; i++) {
struct buffer_head *bh = wbuf[i];
+ /*
+ * Compute checksum.
+ */
+ if (JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_COMPAT_CHECKSUM)) {
+ crc32_sum = crc32_be(crc32_sum,
+ (void *)bh->b_data,
+ bh->b_size);
+ }
+
lock_buffer(bh);
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
@@ -642,6 +706,23 @@ start_journal_io:
}
}
+ /* Done it all: now write the commit record asynchronously. */
+
+ if (JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ err = journal_submit_commit_record(journal, commit_transaction,
+ &cbh, crc32_sum);
+ if (err)
+ __journal_abort_hard(journal);
+
+ spin_lock(&journal->j_list_lock);
+ err = journal_wait_on_locked_list(journal,
+ commit_transaction);
+ spin_unlock(&journal->j_list_lock);
+ if (err)
+ __journal_abort_hard(journal);
+ }
+
/* Lo and behold: we have just managed to send a transaction to
the log. Before we can commit it, wait for the IO so far to
complete. Control buffers being written are on the
@@ -740,9 +821,15 @@ wait_for_iobuf:
}
jbd_debug(3, "JBD: commit phase 6\n");
-
- if (journal_write_commit_record(journal, commit_transaction))
- err = -EIO;
+
+ if (!JFS_HAS_INCOMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ err = journal_submit_commit_record(journal, commit_transaction,
+ &cbh, crc32_sum);
+ if (err)
+ __journal_abort_hard(journal);
+ }
+ err = journal_wait_on_commit_record(cbh);
if (err)
__journal_abort_hard(journal);
Index: linux-2.6.18-8.1.8/include/linux/jbd.h
===================================================================
--- linux-2.6.18-8.1.8.orig/include/linux/jbd.h
+++ linux-2.6.18-8.1.8/include/linux/jbd.h
@@ -148,6 +148,29 @@ typedef struct journal_header_s
__be32 h_sequence;
} journal_header_t;
+/*
+ * Checksum types.
+ */
+#define JFS_CRC32_CHKSUM 1
+#define JFS_MD5_CHKSUM 2
+#define JFS_SHA1_CHKSUM 3
+
+#define JFS_CRC32_CHKSUM_SIZE 4
+
+#define JFS_CHECKSUM_BYTES (32 / sizeof(u32))
+/*
+ * Commit block header for storing transactional checksums:
+ */
+struct commit_header
+{
+ __be32 h_magic;
+ __be32 h_blocktype;
+ __be32 h_sequence;
+ unsigned char h_chksum_type;
+ unsigned char h_chksum_size;
+ unsigned char h_padding[2];
+ __be32 h_chksum[JFS_CHECKSUM_BYTES];
+};
/*
* The block tag: used to describe a single buffer in the journal
@@ -234,12 +257,16 @@ typedef struct journal_superblock_s
((j)->j_format_version >= 2 && \
((j)->j_superblock->s_feature_incompat & cpu_to_be32((mask))))
-#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
+#define JFS_FEATURE_COMPAT_CHECKSUM 0x00000001
+
+#define JFS_FEATURE_INCOMPAT_REVOKE 0x00000001
+#define JFS_FEATURE_INCOMPAT_ASYNC_COMMIT 0x00000004
/* Features known to this kernel version: */
-#define JFS_KNOWN_COMPAT_FEATURES 0
+#define JFS_KNOWN_COMPAT_FEATURES JFS_FEATURE_COMPAT_CHECKSUM
#define JFS_KNOWN_ROCOMPAT_FEATURES 0
-#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE
+#define JFS_KNOWN_INCOMPAT_FEATURES JFS_FEATURE_INCOMPAT_REVOKE | \
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT
#ifdef __KERNEL__
@@ -967,6 +994,8 @@ extern int journal_check_available_fe
(journal_t *, unsigned long, unsigned long, unsigned long);
extern int journal_set_features
(journal_t *, unsigned long, unsigned long, unsigned long);
+extern int journal_clear_features
+ (journal_t *, unsigned long, unsigned long, unsigned long);
extern int journal_create (journal_t *);
extern int journal_load (journal_t *journal);
extern void journal_destroy (journal_t *);
Index: linux-2.6.18-8.1.8/fs/jbd/recovery.c
===================================================================
--- linux-2.6.18-8.1.8.orig/fs/jbd/recovery.c
+++ linux-2.6.18-8.1.8/fs/jbd/recovery.c
@@ -21,6 +21,7 @@
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/crc32.h>
#endif
/*
@@ -307,6 +308,37 @@ int journal_skip_recovery(journal_t *jou
return err;
}
+/*
+ * calc_chksums calculates the checksums for the blocks described in the
+ * descriptor block.
+ */
+static int calc_chksums(journal_t *journal, struct buffer_head *bh,
+ unsigned long *next_log_block, __u32 *crc32_sum)
+{
+ int i, num_blks, err;
+ unsigned io_block;
+ struct buffer_head *obh;
+
+ num_blks = count_tags(bh, journal->j_blocksize);
+ /* Calculate checksum of the descriptor block. */
+ *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
+
+ for (i = 0; i < num_blks; i++) {
+ io_block = (*next_log_block)++;
+ wrap(journal, *next_log_block);
+ err = jread(&obh, journal, io_block);
+ if (err) {
+ printk(KERN_ERR "JBD: IO error %d recovering block "
+ "%u in log\n", err, io_block);
+ return 1;
+ } else {
+ *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
+ obh->b_size);
+ }
+ }
+ return 0;
+}
+
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
@@ -318,6 +350,7 @@ static int do_one_pass(journal_t *journa
struct buffer_head * bh;
unsigned int sequence;
int blocktype;
+ __u32 crc32_sum = ~0; /* Transactional Checksums */
/* Precompute the maximum metadata descriptors in a descriptor block */
int MAX_BLOCKS_PER_DESC;
@@ -409,9 +442,24 @@ static int do_one_pass(journal_t *journa
switch(blocktype) {
case JFS_DESCRIPTOR_BLOCK:
/* If it is a valid descriptor block, replay it
- * in pass REPLAY; otherwise, just skip over the
- * blocks it describes. */
+ * in pass REPLAY; if journal_checksums enabled, then
+ * calculate checksums in PASS_SCAN, otherwise,
+ * just skip over the blocks it describes. */
if (pass != PASS_REPLAY) {
+ if (pass == PASS_SCAN &&
+ JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_COMPAT_CHECKSUM) &&
+ !info->end_transaction) {
+ if (calc_chksums(journal, bh,
+ &next_log_block,
+ &crc32_sum)) {
+ brelse(bh);
+ break;
+ }
+ brelse(bh);
+ continue;
+ }
+
next_log_block +=
count_tags(bh, journal->j_blocksize);
wrap(journal, next_log_block);
@@ -506,9 +554,97 @@ static int do_one_pass(journal_t *journa
continue;
case JFS_COMMIT_BLOCK:
- /* Found an expected commit block: not much to
- * do other than move on to the next sequence
+ /* How to differentiate between interrupted commit
+ * and journal corruption ?
+ *
+ * {nth transaction}
+ * Checksum Verification Failed
+ * |
+ * ____________________
+ * | |
+ * async_commit sync_commit
+ * | |
+ * | GO TO NEXT "Journal Corruption"
+ * | TRANSACTION
+ * |
+ * {(n+1)th transanction}
+ * |
+ * _______|______________
+ * | |
+ * Commit block found Commit block not found
+ * | |
+ * "Journal Corruption" |
+ * _____________|__________
+ * | |
+ * nth trans corrupt OR nth trans
+ * and (n+1)th interrupted interrupted
+ * before commit block
+ * could reach the disk.
+ * (Cannot find the difference in above
+ * mentioned conditions. Hence assume
+ * "Interrupted Commit".)
+ */
+
+ /* Found an expected commit block: if checksums
+ * are present verify them in PASS_SCAN; else not
+ * much to do other than move on to the next sequence
* number. */
+ if (pass == PASS_SCAN &&
+ JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_COMPAT_CHECKSUM)) {
+ int chksum_err, chksum_seen;
+ struct commit_header *cbh =
+ (struct commit_header *)bh->b_data;
+ unsigned found_chksum =
+ be32_to_cpu(cbh->h_chksum[0]);
+
+ chksum_err = chksum_seen = 0;
+
+ if (info->end_transaction) {
+ printk(KERN_ERR "JBD: Transaction %u "
+ "found to be corrupt.\n",
+ next_commit_ID - 1);
+ brelse(bh);
+ break;
+ }
+
+ if (crc32_sum == found_chksum &&
+ cbh->h_chksum_type == JFS_CRC32_CHKSUM &&
+ cbh->h_chksum_size ==
+ JFS_CRC32_CHKSUM_SIZE) {
+ chksum_seen = 1;
+ } else if (!(cbh->h_chksum_type == 0 &&
+ cbh->h_chksum_size == 0 &&
+ found_chksum == 0 &&
+ !chksum_seen)) {
+ /*
+ * If fs is mounted using an old kernel and then
+ * kernel with journal_chksum is used then we
+ * get a situation where the journal flag has
+ * checksum flag set but checksums are not
+ * present i.e chksum = 0, in the individual
+ * commit blocks.
+ * Hence to avoid checksum failures, in this
+ * situation, this extra check is added.
+ */
+ chksum_err = 1;
+ }
+
+ if (chksum_err) {
+ info->end_transaction = next_commit_ID;
+
+ if (!JFS_HAS_COMPAT_FEATURE(journal,
+ JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+ printk(KERN_ERR
+ "JBD: Transaction %u "
+ "found to be corrupt.\n",
+ next_commit_ID);
+ brelse(bh);
+ break;
+ }
+ }
+ crc32_sum = ~0;
+ }
brelse(bh);
next_commit_ID++;
continue;
@@ -544,9 +680,10 @@ static int do_one_pass(journal_t *journa
* transaction marks the end of the valid log.
*/
- if (pass == PASS_SCAN)
- info->end_transaction = next_commit_ID;
- else {
+ if (pass == PASS_SCAN) {
+ if (!info->end_transaction)
+ info->end_transaction = next_commit_ID;
+ } else {
/* It's really bad news if different passes end up at
* different places (but possible due to IO errors). */
if (info->end_transaction != next_commit_ID) {
Index: linux-2.6.18-8.1.8/fs/jbd/journal.c
===================================================================
--- linux-2.6.18-8.1.8.orig/fs/jbd/journal.c
+++ linux-2.6.18-8.1.8/fs/jbd/journal.c
@@ -66,6 +66,7 @@ EXPORT_SYMBOL(journal_update_format);
EXPORT_SYMBOL(journal_check_used_features);
EXPORT_SYMBOL(journal_check_available_features);
EXPORT_SYMBOL(journal_set_features);
+EXPORT_SYMBOL(journal_clear_features);
EXPORT_SYMBOL(journal_create);
EXPORT_SYMBOL(journal_load);
EXPORT_SYMBOL(journal_destroy);
@@ -1271,6 +1272,33 @@ int journal_set_features (journal_t *jou
return 1;
}
+/**
+ * int journal_clear_features () - Clear a given journal feature in the superblock
+ * @journal: Journal to act on.
+ * @compat: bitmask of compatible features
+ * @ro: bitmask of features that force read-only mount
+ * @incompat: bitmask of incompatible features
+ *
+ * Clear a given journal feature as present on the
+ * superblock. Returns true if the requested features could be reset.
+ *
+ */
+int journal_clear_features (journal_t *journal, unsigned long compat,
+ unsigned long ro, unsigned long incompat)
+{
+ journal_superblock_t *sb;
+
+ jbd_debug(1, "Clear features 0x%lx/0x%lx/0x%lx\n",
+ compat, ro, incompat);
+
+ sb = journal->j_superblock;
+
+ sb->s_feature_compat &= ~cpu_to_be32(compat);
+ sb->s_feature_ro_compat &= ~cpu_to_be32(ro);
+ sb->s_feature_incompat &= ~cpu_to_be32(incompat);
+
+ return 1;
+}
/**
* int journal_update_format () - Update on-disk journal structure.
Index: linux-2.6.18-8.1.8/fs/Kconfig
===================================================================
--- linux-2.6.18-8.1.8.orig/fs/Kconfig
+++ linux-2.6.18-8.1.8/fs/Kconfig
@@ -140,6 +140,7 @@ config EXT3_FS_SECURITY
config JBD
tristate
+ select CRC32
help
This is a generic journaling layer for block devices. It is
currently used by the ext3 and OCFS2 file systems, but it could
Index: linux-2.6.18-8.1.8/Documentation/filesystems/ext3.txt
===================================================================
--- linux-2.6.18-8.1.8.orig/Documentation/filesystems/ext3.txt
+++ linux-2.6.18-8.1.8/Documentation/filesystems/ext3.txt
@@ -14,6 +14,16 @@ Options
When mounting an ext3 filesystem, the following option are accepted:
(*) == default
+journal_checksum Enable checksumming of the journal transactions.
+ This will allow the recovery code in e2fsck and the
+ kernel to detect corruption in the kernel. It is a
+ compatible change and will be ignored by older kernels.
+
+journal_async_commit Commit block can be written to disk without waiting
+ for descriptor blocks. If enabled older kernels cannot
+ mount the device. This will enable 'journal_checksum'
+ internally.
+
journal=update Update the ext3 file system's journal to the current
format.
...@@ -9,3 +9,4 @@ export-2.6.18-vanilla.patch ...@@ -9,3 +9,4 @@ export-2.6.18-vanilla.patch
export-show_task-2.6.18-vanilla.patch export-show_task-2.6.18-vanilla.patch
sd_iostats-2.6-rhel4.patch sd_iostats-2.6-rhel4.patch
export_symbol_numa-2.6-fc5.patch export_symbol_numa-2.6-fc5.patch
jbd-journal-chksum-2.6.18-vanilla.patch
...@@ -14,3 +14,4 @@ sd_iostats-2.6-rhel4.patch ...@@ -14,3 +14,4 @@ sd_iostats-2.6-rhel4.patch
export_symbol_numa-2.6-fc5.patch export_symbol_numa-2.6-fc5.patch
blkdev_tunables-2.6-sles10.patch blkdev_tunables-2.6-sles10.patch
jbd-stats-2.6-sles10.patch jbd-stats-2.6-sles10.patch
jbd-journal-chksum-2.6-sles10.patch
...@@ -13,3 +13,4 @@ sd_iostats-2.6-rhel4.patch ...@@ -13,3 +13,4 @@ sd_iostats-2.6-rhel4.patch
export_symbol_numa-2.6.18.patch export_symbol_numa-2.6.18.patch
jbd-16tb-overflow-fixes.patch jbd-16tb-overflow-fixes.patch
jbd-check-for-unmapped-buffer.patch jbd-check-for-unmapped-buffer.patch
jbd-journal-chksum-2.6.18-vanilla.patch
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment