diff --git a/lustre/kernel_patches/patches/raid5-zerocopy.patch b/lustre/kernel_patches/patches/raid5-zerocopy.patch new file mode 100644 index 0000000000000000000000000000000000000000..09863437aceb9acafd2c7a21b16b8f6ac31ba93d --- /dev/null +++ b/lustre/kernel_patches/patches/raid5-zerocopy.patch @@ -0,0 +1,374 @@ +diff -pru linux-2.6.9.orig/drivers/md/raid5.c linux-2.6.9/drivers/md/raid5.c +--- linux-2.6.9.orig/drivers/md/raid5.c 2007-07-09 02:43:33.000000000 -0600 ++++ linux-2.6.9/drivers/md/raid5.c 2007-07-13 00:39:15.000000000 -0600 +@@ -412,6 +412,7 @@ static int raid5_end_read_request (struc + clear_buffer_uptodate(bh); + } + #endif ++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); /* R5_Direct is never expected on a read */ + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + release_stripe(sh); +@@ -450,6 +451,10 @@ static int raid5_end_write_request (stru + + rdev_dec_pending(conf->disks[i].rdev, conf->mddev); + ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) { ++ BUG_ON(sh->dev[i].req.bi_io_vec[0].bv_page == sh->dev[i].page); ++ sh->dev[i].req.bi_io_vec[0].bv_page = sh->dev[i].page; /* point the request back at the stripe-cache page */ ++ } + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + __release_stripe(conf, sh); +@@ -621,6 +626,25 @@ static sector_t compute_blocknr(struct s + } + + ++static struct page *zero_copy_data(struct bio *bio, sector_t sector) ++{ ++ sector_t bi_sector = bio->bi_sector; ++ struct page *page; ++ struct bio_vec *bvl; ++ int i; ++ ++ bio_for_each_segment(bvl, bio, i) { ++ if (sector > bi_sector) { /* segment starts before sector: advance past it */ ++ bi_sector += bio_iovec_idx(bio, i)->bv_len >> 9; /* bytes -> 512-byte sectors */ ++ continue; ++ } ++ BUG_ON(sector != bi_sector); ++ page = bio_iovec_idx(bio, i)->bv_page; ++ return PageConstant(page) ? 
page : NULL; ++ } ++ BUG(); /* bio ended before reaching sector */ ++ return NULL; ++} + + /* + * Copy data between a page in the stripe cache, and one or more bion +@@ -716,8 +740,9 @@ static void compute_parity(struct stripe + { + raid5_conf_t *conf = sh->raid_conf; + int i, pd_idx = sh->pd_idx, disks = conf->raid_disks, count; +- void *ptr[MAX_XOR_BLOCKS]; ++ void *ptr[MAX_XOR_BLOCKS], *h_ptr[2]; /* h_ptr: operand pair for the immediate highmem xor */ + struct bio *chosen; ++ struct page *page; + + PRINTK("compute_parity, stripe %llu, method %d\n", + (unsigned long long)sh->sector, method); +@@ -744,13 +769,14 @@ static void compute_parity(struct stripe + break; + case RECONSTRUCT_WRITE: + memset(ptr[0], 0, STRIPE_SIZE); +- for (i= disks; i-- ;) ++ for (i= disks; i-- ;) { + if (i!=pd_idx && sh->dev[i].towrite) { + chosen = sh->dev[i].towrite; + sh->dev[i].towrite = NULL; + if (sh->dev[i].written) BUG(); + sh->dev[i].written = chosen; + } ++ } + break; + case CHECK_PARITY: + break; +@@ -760,34 +786,88 @@ static void compute_parity(struct stripe + count = 1; + } + +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- sector_t sector = sh->dev[i].sector; +- struct bio *wbi = sh->dev[i].written; +- while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { +- copy_data(1, wbi, sh->dev[i].page, sector); +- wbi = r5_next_bio(wbi, sector); ++ for (i = disks; i--;) { ++ struct bio *wbi = sh->dev[i].written; ++ sector_t sector; ++ ++ if (!wbi) ++ continue; ++ ++ sector = sh->dev[i].sector; ++ set_bit(R5_LOCKED, &sh->dev[i].flags); ++ BUG_ON(test_bit(R5_Direct, &sh->dev[i].flags)); ++ ++ /* check if it's covered by a single page ++ * and whole stripe is written at once. 
++ * in this case we can avoid memcpy() */ ++ if (!wbi->bi_next && test_bit(R5_OVERWRITE, &sh->dev[i].flags) && ++ test_bit(R5_Insync, &sh->dev[i].flags)) { ++ page = zero_copy_data(wbi, sector); ++ if (page) { ++ atomic_inc(&conf->writes_zcopy); ++ sh->dev[i].req.bi_io_vec[0].bv_page = page; /* write straight from the bio page */ ++ set_bit(R5_Direct, &sh->dev[i].flags); ++ clear_bit(R5_UPTODATE, &sh->dev[i].flags); ++ clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ continue; + } ++ } + +- set_bit(R5_LOCKED, &sh->dev[i].flags); +- set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ atomic_inc(&conf->writes_copied); ++ test_and_clear_bit(R5_OVERWRITE, &sh->dev[i].flags); ++ set_bit(R5_UPTODATE, &sh->dev[i].flags); ++ while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { ++ copy_data(1, wbi, sh->dev[i].page, sector); ++ wbi = r5_next_bio(wbi, sector); + } ++ } + ++ h_ptr[0] = ptr[0]; /* highmem pages are xor-ed straight into ptr[0] */ + switch(method) { + case RECONSTRUCT_WRITE: + case CHECK_PARITY: +- for (i=disks; i--;) +- if (i != pd_idx) { +- ptr[count++] = page_address(sh->dev[i].page); +- check_xor(); ++ for (i=disks; i--;) { ++ if (i == pd_idx) ++ continue; ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) ++ page = sh->dev[i].req.bi_io_vec[0].bv_page; ++ else ++ page = sh->dev[i].page; ++ ++ /* have to compute the parity immediately for ++ * a highmem page. it would happen for zerocopy. -jay ++ */ ++ if (PageHighMem(page)) { ++ h_ptr[1] = kmap_atomic(page, KM_USER0); ++ xor_block(2, STRIPE_SIZE, h_ptr); ++ kunmap_atomic(h_ptr[1], KM_USER0); /* takes the mapped address, not the page */ ++ } else { ++ ptr[count++] = page_address(page); + } ++ check_xor(); ++ } + break; + case READ_MODIFY_WRITE: +- for (i = disks; i--;) +- if (sh->dev[i].written) { +- ptr[count++] = page_address(sh->dev[i].page); +- check_xor(); ++ for (i = disks; i--;) { ++ if (!sh->dev[i].written) ++ continue; ++ if (test_bit(R5_Direct, &sh->dev[i].flags)) ++ page = sh->dev[i].req.bi_io_vec[0].bv_page; ++ else ++ page = sh->dev[i].page; ++ ++ /* have to compute the parity immediately for ++ * a highmem page. it would happen for zerocopy. 
-jay ++ */ ++ if (PageHighMem(page)) { ++ h_ptr[1] = kmap_atomic(page, KM_USER0); ++ xor_block(2, STRIPE_SIZE, h_ptr); ++ kunmap_atomic(h_ptr[1], KM_USER0); /* takes the mapped address, not the page */ ++ } else { ++ ptr[count++] = page_address(page); + } ++ check_xor(); ++ } + } + if (count != 1) + xor_block(count, STRIPE_SIZE, ptr); +@@ -1059,13 +1139,15 @@ static void handle_stripe(struct stripe_ + if (sh->dev[i].written) { + dev = &sh->dev[i]; + if (!test_bit(R5_LOCKED, &dev->flags) && +- test_bit(R5_UPTODATE, &dev->flags) ) { ++ (test_bit(R5_UPTODATE, &dev->flags) || ++ test_bit(R5_Direct, &dev->flags)) ) { + /* We can return any write requests */ + struct bio *wbi, *wbi2; + PRINTK("Return write for disc %d\n", i); + spin_lock_irq(&conf->device_lock); + wbi = dev->written; + dev->written = NULL; ++ test_and_clear_bit(R5_Direct, &dev->flags); /* write is being returned: drop the zero-copy mark */ + while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) { + wbi2 = r5_next_bio(wbi, dev->sector); + if (--wbi->bi_phys_segments == 0) { +@@ -1831,6 +1913,7 @@ memory = conf->max_nr_stripes * (sizeof( + if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) + mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + } ++ mddev->queue->backing_dev_info.capabilities |= BDI_CAP_PAGE_CONST_WRITE; /* advertise constant-page writes on this queue */ + + /* Ok, everything is just fine now */ + mddev->array_size = mddev->size * (mddev->raid_disks - 1); +@@ -1918,9 +2001,11 @@ static void status (struct seq_file *seq + atomic_read(&conf->handled_in_raid5d), + atomic_read(&conf->out_of_stripes), + atomic_read(&conf->handle_called)); +- seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw", ++ seq_printf (seq, "\n\t\treads: %u for rmw, %u for rcw. 
zcopy writes: %u, copied writes: %u", + atomic_read(&conf->reads_for_rmw), +- atomic_read(&conf->reads_for_rcw)); ++ atomic_read(&conf->reads_for_rcw), ++ atomic_read(&conf->writes_zcopy), ++ atomic_read(&conf->writes_copied)); + seq_printf (seq, "\n\t\t%u delayed, %u active, queues: %u in, %u out\n", + atomic_read(&conf->delayed), + atomic_read(&conf->active_stripes), +diff -pru linux-2.6.9.orig/include/linux/backing-dev.h linux-2.6.9/include/linux/backing-dev.h +--- linux-2.6.9.orig/include/linux/backing-dev.h 2004-10-18 15:53:46.000000000 -0600 ++++ linux-2.6.9/include/linux/backing-dev.h 2007-07-13 00:12:46.000000000 -0600 +@@ -30,8 +30,11 @@ struct backing_dev_info { + void *congested_data; /* Pointer to aux data for congested func */ + void (*unplug_io_fn)(struct backing_dev_info *, struct page *); + void *unplug_io_data; ++ unsigned int capabilities; /* BDI_CAP_* flags */ + }; + ++#define BDI_CAP_PAGE_CONST_WRITE 0x00000001 /* tested by mapping_cap_page_constant_write() */ ++ + extern struct backing_dev_info default_backing_dev_info; + void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page); + +@@ -62,4 +65,7 @@ static inline int bdi_rw_congested(struc + (1 << BDI_write_congested)); + } + ++#define mapping_cap_page_constant_write(mapping) \ ++ ((mapping)->backing_dev_info->capabilities & BDI_CAP_PAGE_CONST_WRITE) ++ + #endif /* _LINUX_BACKING_DEV_H */ +diff -pru linux-2.6.9.orig/include/linux/page-flags.h linux-2.6.9/include/linux/page-flags.h +--- linux-2.6.9.orig/include/linux/page-flags.h 2004-10-18 15:54:39.000000000 -0600 ++++ linux-2.6.9/include/linux/page-flags.h 2007-07-13 00:12:46.000000000 -0600 +@@ -74,6 +74,7 @@ + #define PG_swapcache 16 /* Swap page: swp_entry_t in private */ + #define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ + #define PG_reclaim 18 /* To be reclaimed asap */ ++#define PG_constant 19 /* Page is marked constant, see set_page_constant() */ + + + /* +@@ -298,6 +299,11 @@ extern unsigned long __read_page_state(u + #define PageSwapCache(page) 0 + #endif + ++#define PageConstant(page) 
test_bit(PG_constant, &(page)->flags) ++#define SetPageConstant(page) set_bit(PG_constant, &(page)->flags) ++#define ClearPageConstant(page) clear_bit(PG_constant, &(page)->flags) ++#define TestSetPageConstant(page) test_and_set_bit(PG_constant, &(page)->flags) ++ + struct page; /* forward declaration */ + + int test_clear_page_dirty(struct page *page); +diff -pru linux-2.6.9.orig/include/linux/pagemap.h linux-2.6.9/include/linux/pagemap.h +--- linux-2.6.9.orig/include/linux/pagemap.h 2004-10-18 15:53:06.000000000 -0600 ++++ linux-2.6.9/include/linux/pagemap.h 2007-07-13 00:12:46.000000000 -0600 +@@ -191,6 +191,19 @@ static inline void wait_on_page_writebac + + extern void end_page_writeback(struct page *page); + ++extern int set_page_constant(struct page *page); ++extern void clear_page_constant(struct page *page); ++static inline int set_page_constant_lock(struct page *page) ++{ ++ BUG_ON(PageLocked(page)); ++ lock_page(page); ++ if (set_page_constant(page)) { ++ unlock_page(page); /* could not be made constant: drop the lock */ ++ return 1; ++ } ++ return 0; /* success: the page is left locked */ ++} ++ + /* + * Fault a userspace page into pagetables. Return non-zero on a fault. 
+ * +diff -pru linux-2.6.9.orig/include/linux/raid/raid5.h linux-2.6.9/include/linux/raid/raid5.h +--- linux-2.6.9.orig/include/linux/raid/raid5.h 2007-07-09 02:43:33.000000000 -0600 ++++ linux-2.6.9/include/linux/raid/raid5.h 2007-07-13 00:39:15.000000000 -0600 +@@ -153,6 +153,7 @@ struct stripe_head { + #define R5_Wantread 4 /* want to schedule a read */ + #define R5_Wantwrite 5 + #define R5_Syncio 6 /* this io need to be accounted as resync io */ ++#define R5_Direct 7 /* use page from passed bio to avoid memcpy */ + + /* + * Write method +@@ -234,6 +235,8 @@ struct raid5_private_data { + atomic_t out_of_stripes; + atomic_t reads_for_rmw; + atomic_t reads_for_rcw; ++ atomic_t writes_zcopy; /* writes issued directly from the bio page */ ++ atomic_t writes_copied; /* writes copied into the stripe cache page */ + atomic_t handle_called; + atomic_t delayed; + atomic_t in_reqs_in_queue; +diff -pru linux-2.6.9.orig/mm/filemap.c linux-2.6.9/mm/filemap.c +--- linux-2.6.9.orig/mm/filemap.c 2007-07-09 02:43:33.000000000 -0600 ++++ linux-2.6.9/mm/filemap.c 2007-07-13 00:12:46.000000000 -0600 +@@ -27,6 +27,8 @@ + #include <linux/pagevec.h> + #include <linux/blkdev.h> + #include <linux/security.h> ++#include <linux/rmap.h> ++ + /* + * This is needed for the following functions: + * - try_to_release_page +@@ -486,11 +488,52 @@ void end_page_writeback(struct page *pag + BUG(); + smp_mb__after_clear_bit(); + } ++ clear_page_constant(page); /* does nothing unless PG_constant is set */ + wake_up_page(page); + } + + EXPORT_SYMBOL(end_page_writeback); + ++/* Mark a locked page constant. Returns 0 on success, 1 if the page cannot be made constant */ ++int set_page_constant(struct page *page) ++{ ++ BUG_ON(!PageLocked(page)); ++ ++ /* An anonymous page that has not been added to the swap cache ++ * is not made constant: refuse it. ++ */ ++ if (PageAnon(page) && !PageSwapCache(page)) ++ return 1; ++ ++ BUG_ON(!PageUptodate(page)); ++ ++ /* I have to clear page uptodate before trying to remove ++ * it from user's page table because otherwise, the page may be ++ * reinstalled by a page access which happens between try_to_unmap() ++ * and ClearPageUptodate(). 
-jay ++ */ ++ ClearPageUptodate(page); ++ if (page_mapped(page) && try_to_unmap(page) != SWAP_SUCCESS) { ++ SetPageUptodate(page); /* unmap failed: restore the uptodate bit */ ++ return 1; ++ } ++ SetPageConstant(page); ++ return 0; ++} ++ ++void clear_page_constant(struct page *page) ++{ ++ if (PageConstant(page)) { ++ BUG_ON(!PageLocked(page)); ++ BUG_ON(PageUptodate(page)); ++ ClearPageConstant(page); ++ SetPageUptodate(page); /* undo the ClearPageUptodate() done by set_page_constant() */ ++ unlock_page(page); /* a constant page is expected to be locked (see BUG_ON above) */ ++ } ++} ++EXPORT_SYMBOL(set_page_constant); ++EXPORT_SYMBOL(clear_page_constant); ++ + /* + * Get a lock on the page, assuming we need to sleep to get it. + *