Skip to content
Snippets Groups Projects
Commit 23104b05 authored by Jinshan Xiong's avatar Jinshan Xiong
Browse files

r=alex,nathan

b=11890

raid5 write zerocopy support for rhel4 2.6 kernels.
parent 816042a3
No related branches found
No related tags found
No related merge requests found
diff -pru linux-2.6.9.orig/drivers/md/raid5.c linux-2.6.9/drivers/md/raid5.c
--- linux-2.6.9.orig/drivers/md/raid5.c 2007-07-09 02:43:33.000000000 -0600
+++ linux-2.6.9/drivers/md/raid5.c 2007-07-13 00:39:15.000000000 -0600
@@ -412,6 +412,7 @@ static int raid5_end_read_request (struc
clear_buffer_uptodate(bh);
}
#endif
+ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
@@ -450,6 +451,10 @@ static int raid5_end_write_request (stru
rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
+ if (test_bit(R5_Direct, &sh->dev[i].flags)) {
+ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page);
+ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page;
+ }
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
__release_stripe(conf, sh);
@@ -621,6 +626,25 @@ static sector_t compute_blocknr(struct s
}
+/* Locate the bio segment beginning exactly at @sector and return its page
+ * if the page is marked PG_constant (pinned/immutable for the duration of
+ * the write, so raid5 may reference it directly instead of copying into
+ * the stripe cache). Returns NULL when the page is not constant.
+ * BUG()s if @sector is not the start of any segment of @bio — callers
+ * only pass sectors known to be covered by this bio. */
+static struct page *zero_copy_data(struct bio *bio, sector_t sector)
+{
+ sector_t bi_sector = bio->bi_sector;
+ struct page *page;
+ struct bio_vec *bvl;
+ int i;
+
+ bio_for_each_segment(bvl, bio, i) {
+ /* advance past segments that end before @sector (bv_len is in
+ * bytes; >> 9 converts to 512-byte sectors) */
+ if (sector > bi_sector) {
+ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9;
+ continue;
+ }
+ BUG_ON(sector != bi_sector);
+ page = bio_iovec_idx(bio, i)->bv_page;
+ /* only PG_constant pages are safe to DMA from without a copy */
+ return PageConstant(page) ? page : NULL;
+ }
+ BUG();
+ return NULL;
+}
/*
* Copy data between a page in the stripe cache, and one or more bion
@@ -716,8 +740,9 @@ static void compute_parity(struct stripe
{
raid5_conf_t *conf = sh->raid_conf;
int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count;
- void *ptr[MAX_XOR_BLOCKS];
+ void *ptr[MAX_XOR_BLOCKS], *h_ptr[2];
struct bio *chosen;
+ struct page *page;
PRINTK("compute_parity, stripe %llu, method %d\n",
(unsigned long long)sh->sector, method);
@@ -744,13 +769,14 @@ static void compute_parity(struct stripe
break;
case RECONSTRUCT_WRITE:
memset(ptr[0], 0, STRIPE_SIZE);
- for (i= disks; i-- ;)
+ for (i= disks; i-- ;) {
if (i!=pd_idx && sh->dev[i].towrite) {
chosen = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
if (sh->dev[i].written) BUG();
sh->dev[i].written = chosen;
}
+ }
break;
case CHECK_PARITY:
break;
@@ -760,34 +786,88 @@ static void compute_parity(struct stripe
count = 1;
}
- for (i = disks; i--;)
- if (sh->dev[i].written) {
- sector_t sector = sh->dev[i].sector;
- struct bio *wbi = sh->dev[i].written;
- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
- copy_data(1, wbi, sh->dev[i].page, sector);
- wbi = r5_next_bio(wbi, sector);
+ for (i = disks; i--;) {
+ struct bio *wbi = sh->dev[i].written;
+ sector_t sector;
+
+ if (!wbi)
+ continue;
+
+ sector = sh->dev[i].sector;
+ set_bit(R5_LOCKED, &sh->dev[i].flags);
+ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags));
+
+ /* check if it's covered by a single page
+ and whole stripe is written at once.
+ * in this case we can avoid memcpy() */
+ if (!wbi->bi_next && test_bit(R5_OVERWRITE, &sh->dev[i].flags) &&
+ test_bit(R5_Insync, &sh->dev[i].flags)) {
+ page = zero_copy_data(wbi, sector);
+ if (page) {
+ atomic_inc(&conf->writes_zcopy);
+ sh->dev[i].req.bi_io_vec[0].bv_page = page;
+ set_bit(R5_Direct, &sh->dev[i].flags);
+ clear_bit(R5_UPTODATE, &sh->dev[i].flags);
+ clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+ continue;
}
+ }
- set_bit(R5_LOCKED, &sh->dev[i].flags);
- set_bit(R5_UPTODATE, &sh->dev[i].flags);
+ atomic_inc(&conf->writes_copied);
+ test_and_clear_bit(R5_OVERWRITE, &sh->dev[i].flags);
+ set_bit(R5_UPTODATE, &sh->dev[i].flags);
+ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+ copy_data(1, wbi, sh->dev[i].page, sector);
+ wbi = r5_next_bio(wbi, sector);
}
+ }
+ h_ptr[0] = ptr[0];
switch(method) {
case RECONSTRUCT_WRITE:
case CHECK_PARITY:
- for (i=disks; i--;)
- if (i != pd_idx) {
- ptr[count++] = page_address(sh->dev[i].page);
- check_xor();
+ for (i=disks; i--;) {
+ if (i == pd_idx)
+ continue;
+ if (test_bit(R5_Direct, &sh->dev[i].flags))
+ page = sh->dev[i].req.bi_io_vec[0].bv_page;
+ else
+ page = sh->dev[i].page;
+
+ /* have to compute the parity immediately for
+ * a highmem page. it would happen for zerocopy. -jay
+ */
+ if (PageHighMem(page)) {
+ h_ptr[1] = kmap_atomic(page, KM_USER0);
+ xor_block(2, STRIPE_SIZE, h_ptr);
+ kunmap_atomic(page, KM_USER0);
+ } else {
+ ptr[count++] = page_address(page);
}
+ check_xor();
+ }
break;
case READ_MODIFY_WRITE:
- for (i = disks; i--;)
- if (sh->dev[i].written) {
- ptr[count++] = page_address(sh->dev[i].page);
- check_xor();
+ for (i = disks; i--;) {
+ if (!sh->dev[i].written)
+ continue;
+ if (test_bit(R5_Direct, &sh->dev[i].flags))
+ page = sh->dev[i].req.bi_io_vec[0].bv_page;
+ else
+ page = sh->dev[i].page;
+
+ /* have to compute the parity immediately for
+ * a highmem page. it would happen for zerocopy. -jay
+ */
+ if (PageHighMem(page)) {
+ h_ptr[1] = kmap_atomic(page, KM_USER0);
+ xor_block(2, STRIPE_SIZE, h_ptr);
+ kunmap_atomic(page, KM_USER0);
+ } else {
+ ptr[count++] = page_address(page);
}
+ check_xor();
+ }
}
if (count != 1)
xor_block(count, STRIPE_SIZE, ptr);
@@ -1059,13 +1139,15 @@ static void handle_stripe(struct stripe_
if (sh->dev[i].written) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
- test_bit(R5_UPTODATE, &dev->flags) ) {
+ (test_bit(R5_UPTODATE, &dev->flags) ||
+ test_bit(R5_Direct, &dev->flags)) ) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
PRINTK("Return write for disc %d\n", i);
spin_lock_irq(&conf->device_lock);
wbi = dev->written;
dev->written = NULL;
+ test_and_clear_bit(R5_Direct, &dev->flags);
while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
if (--wbi->bi_phys_segments == 0) {
@@ -1831,6 +1913,7 @@ memory = conf->max_nr_stripes * (sizeof(
if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
}
+ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE;
/* Ok, everything is just fine now */
mddev->array_size = mddev->size * (mddev->raid_disks - 1);
@@ -1918,9 +2001,11 @@ static void status (struct seq_file *seq
atomic_read(&conf->handled_in_raid5d),
atomic_read(&conf->out_of_stripes),
atomic_read(&conf->handle_called));
- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw",
+ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. zcopy writes: %u, copied writes: %u",
atomic_read(&conf->reads_for_rmw),
- atomic_read(&conf->reads_for_rcw));
+ atomic_read(&conf->reads_for_rcw),
+ atomic_read(&conf->writes_zcopy),
+ atomic_read(&conf->writes_copied));
seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n",
atomic_read(&conf->delayed),
atomic_read(&conf->active_stripes),
diff -pru linux-2.6.9.orig/include/linux/backing-dev.h linux-2.6.9/include/linux/backing-dev.h
--- linux-2.6.9.orig/include/linux/backing-dev.h 2004-10-18 15:53:46.000000000 -0600
+++ linux-2.6.9/include/linux/backing-dev.h 2007-07-13 00:12:46.000000000 -0600
@@ -30,8 +30,11 @@ struct backing_dev_info {
void *congested_data; /* Pointer to aux data for congested func */
void (*unplug_io_fn)(struct backing_dev_info *, struct page *);
void *unplug_io_data;
+ unsigned int capabilities;
};
+/* Backing device accepts zero-copy writes of PG_constant pages. */
+#define BDI_CAP_PAGE_CONST_WRITE 0x00000001
+
extern struct backing_dev_info default_backing_dev_info;
void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page);
@@ -62,4 +65,7 @@ static inline int bdi_rw_congested(struc
(1 << BDI_write_congested));
}
+/* True if @mapping's backing device supports zero-copy (PG_constant) writes. */
+#define mapping_cap_page_constant_write(mapping) \
+ ((mapping)->backing_dev_info->capabilities & BDI_CAP_PAGE_CONST_WRITE)
+
#endif /* _LINUX_BACKING_DEV_H */
diff -pru linux-2.6.9.orig/include/linux/page-flags.h linux-2.6.9/include/linux/page-flags.h
--- linux-2.6.9.orig/include/linux/page-flags.h 2004-10-18 15:54:39.000000000 -0600
+++ linux-2.6.9/include/linux/page-flags.h 2007-07-13 00:12:46.000000000 -0600
@@ -74,6 +74,7 @@
#define PG_swapcache 16 /* Swap page: swp_entry_t in private */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
+#define PG_constant 19 /* To mark the page is constant */
/*
@@ -298,6 +299,11 @@ extern unsigned long __read_page_state(u
#define PageSwapCache(page) 0
#endif
+/* PG_constant accessors: a constant page is pinned/immutable while a
+ * zero-copy write is in flight. NOTE: ClearPageConstant originally wrote
+ * &(page->flags), breaking macro-argument hygiene for non-trivial
+ * arguments (e.g. ClearPageConstant(p + 1)); fixed to &(page)->flags to
+ * match the sibling macros. */
+#define PageConstant(page) test_bit(PG_constant, &(page)->flags)
+#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags)
+#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags)
+#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags)
+
struct page; /* forward declaration */
int test_clear_page_dirty(struct page *page);
diff -pru linux-2.6.9.orig/include/linux/pagemap.h linux-2.6.9/include/linux/pagemap.h
--- linux-2.6.9.orig/include/linux/pagemap.h 2004-10-18 15:53:06.000000000 -0600
+++ linux-2.6.9/include/linux/pagemap.h 2007-07-13 00:12:46.000000000 -0600
@@ -191,6 +191,19 @@ static inline void wait_on_page_writebac
extern void end_page_writeback(struct page *page);
+extern int set_page_constant(struct page *page);
+extern void clear_page_constant(struct page *);
+/* Lock @page and mark it PG_constant. Returns 0 with the page left
+ * LOCKED on success (clear_page_constant() unlocks it when I/O ends),
+ * or 1 with the page unlocked if it cannot be made constant.
+ * NOTE(review): the BUG_ON asserts the caller does not already hold the
+ * page lock; the check itself is racy against other lockers — presumably
+ * callers guarantee exclusive ownership here. */
+static inline int set_page_constant_lock(struct page *page)
+{
+ BUG_ON(PageLocked(page));
+ lock_page(page);
+ if (set_page_constant(page)) {
+ unlock_page(page);
+ return 1;
+ }
+ return 0;
+}
+
+
/*
* Fault a userspace page into pagetables. Return non-zero on a fault.
*
diff -pru linux-2.6.9.orig/include/linux/raid/raid5.h linux-2.6.9/include/linux/raid/raid5.h
--- linux-2.6.9.orig/include/linux/raid/raid5.h 2007-07-09 02:43:33.000000000 -0600
+++ linux-2.6.9/include/linux/raid/raid5.h 2007-07-13 00:39:15.000000000 -0600
@@ -153,6 +153,7 @@ struct stripe_head {
#define R5_Wantread 4 /* want to schedule a read */
#define R5_Wantwrite 5
#define R5_Syncio 6 /* this io need to be accounted as resync io */
+#define R5_Direct 7 /* use page from passed bio to avoid memcpy */
/*
* Write method
@@ -234,6 +235,8 @@ struct raid5_private_data {
atomic_t out_of_stripes;
atomic_t reads_for_rmw;
atomic_t reads_for_rcw;
+ atomic_t writes_zcopy;
+ atomic_t writes_copied;
atomic_t handle_called;
atomic_t delayed;
atomic_t in_reqs_in_queue;
diff -pru linux-2.6.9.orig/mm/filemap.c linux-2.6.9/mm/filemap.c
--- linux-2.6.9.orig/mm/filemap.c 2007-07-09 02:43:33.000000000 -0600
+++ linux-2.6.9/mm/filemap.c 2007-07-13 00:12:46.000000000 -0600
@@ -27,6 +27,8 @@
#include <linux/pagevec.h>
#include <linux/blkdev.h>
#include <linux/security.h>
+#include <linux/rmap.h>
+
/*
* This is needed for the following functions:
* - try_to_release_page
@@ -486,11 +488,52 @@ void end_page_writeback(struct page *pag
BUG();
smp_mb__after_clear_bit();
}
+ clear_page_constant(page);
wake_up_page(page);
}
EXPORT_SYMBOL(end_page_writeback);
+/* Mark a page in bio to be constant, page must be locked.
+ * Returns 0 on success (page is now PG_constant and !Uptodate),
+ * 1 if the page cannot be made constant (caller falls back to copying). */
+int set_page_constant(struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+
+ /* If it's an anonymous page and haven't been added to swap cache,
+ * do it here.
+ */
+ if (PageAnon(page) && !PageSwapCache(page))
+ return 1;
+
+ BUG_ON(!PageUptodate(page));
+
+ /* I have to clear page uptodate before trying to remove
+ * it from user's page table because otherwise, the page may be
+ * reinstalled by a page access which happens between try_to_unmap()
+ * and ClearPageUptodate(). -jay
+ */
+ ClearPageUptodate(page);
+ /* if the page is still mapped after try_to_unmap(), some user can
+ * still modify it concurrently — restore uptodate and refuse */
+ if (page_mapped(page) && try_to_unmap(page) != SWAP_SUCCESS) {
+ SetPageUptodate(page);
+ return 1;
+ }
+ SetPageConstant(page);
+ return 0;
+}
+
+/* Undo set_page_constant() when write-out completes: restore PG_uptodate
+ * and drop the page lock taken by set_page_constant_lock(). Called from
+ * end_page_writeback(), so it is a no-op for non-constant pages. */
+void clear_page_constant(struct page *page)
+{
+ if (PageConstant(page)) {
+ BUG_ON(!PageLocked(page));
+ /* set_page_constant() cleared uptodate; it must still be clear */
+ BUG_ON(PageUptodate(page));
+ ClearPageConstant(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+}
+EXPORT_SYMBOL(set_page_constant);
+EXPORT_SYMBOL(clear_page_constant);
+
/*
* Get a lock on the page, assuming we need to sleep to get it.
*
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment