Commit 9fe4b52a authored by Jinshan Xiong, committed by Andreas Dilger
Browse files

LU-1030 osc: new IO engine implementation



New IO engine to manage dirty pages with osc_extent.

Osc_extent is a data structure to manage a series of contiguous
blocks; however, the pages in an extent are not required to be
contiguous. An extent must be written out in one RPC.

The purposes of introducing extents are:
1. make grants work for extent-based OSD;
2. form better IO by picking up contiguous pages to compose RPC;
3. reimplement ll_writepages() with CIT_FSYNC.
Signed-off-by: Jinshan Xiong <jinshan.xiong@whamcloud.com>
Change-Id: I3ef619c1f07eefd201236ab55e5fd858791d41e0
Reviewed-on: http://review.whamcloud.com/2270

Reviewed-by: Johann Lombardi <johann@whamcloud.com>
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
parent 921fa0f2
......@@ -62,6 +62,7 @@
#include <linux/types.h>
#include <asm/timex.h>
#include <linux/sched.h> /* THREAD_SIZE */
#include <linux/rbtree.h>
#define CFS_THREAD_SIZE THREAD_SIZE
#define LUSTRE_TRACE_SIZE (THREAD_SIZE >> 5)
......
......@@ -415,6 +415,78 @@ static inline void radix_tree_preload_end(void)
{
}
/***************************************************************************
*
* Linux kernel red black tree emulation.
*
***************************************************************************/
/*
 * One node of the red-black tree.
 *
 * rb_parent_color packs two things into a single word: the parent pointer
 * (read back by rb_parent(), which masks off the two low bits) and the
 * node colour in bit 0 (read back by rb_color()).
 * NOTE(review): this packing assumes every rb_node is at least 4-byte
 * aligned so that the low bits of its address are zero -- confirm for all
 * supported targets.
 */
struct rb_node {
/* parent pointer | colour bit */
unsigned long rb_parent_color;
/* colour values stored in bit 0 of rb_parent_color */
#define RB_RED 0
#define RB_BLACK 1
/* right and left children, NULL when absent */
struct rb_node *rb_right;
struct rb_node *rb_left;
};
/*
 * Tree handle: holds the pointer to the topmost node. The pointer is NULL
 * for an empty tree (see RB_ROOT and RB_EMPTY_ROOT).
 */
struct rb_root {
struct rb_node *rb_node;
};
/* Parent of r: rb_parent_color with its two low bits masked off. */
#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~3))
/* Colour of r: bit 0 of rb_parent_color (RB_RED == 0, RB_BLACK == 1). */
#define rb_color(r) ((r)->rb_parent_color & 1)
/* Colour predicates; rb_is_black() yields the raw colour bit. */
#define rb_is_red(r) (!rb_color(r))
#define rb_is_black(r) rb_color(r)
/* Colour setters: clear or set bit 0, leaving the parent pointer intact. */
#define rb_set_red(r) do { (r)->rb_parent_color &= ~1; } while (0)
#define rb_set_black(r) do { (r)->rb_parent_color |= 1; } while (0)
/*
 * Replace the parent pointer stored in rb with p while keeping the two
 * low bits of rb_parent_color (which carry the colour) untouched.
 */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
	unsigned long low_bits = rb->rb_parent_color & 3;

	rb->rb_parent_color = low_bits | (unsigned long)p;
}
/*
 * Store color in bit 0 of rb_parent_color, preserving all other bits
 * (i.e. the parent pointer).
 */
static inline void rb_set_color(struct rb_node *rb, int color)
{
	unsigned long without_color = rb->rb_parent_color & ~1;

	rb->rb_parent_color = without_color | color;
}
/* Initializer for an empty tree: the root pointer is NULL. */
#define RB_ROOT ((struct rb_root) { NULL, })
/* Map a pointer to an embedded rb_node back to its containing structure. */
#define rb_entry(ptr, type, member) container_of(ptr, type, member)
/* True when the tree rooted at root contains no nodes. */
#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
/*
 * A node is "empty" (not linked into any tree) when its parent pointer
 * refers to the node itself; RB_CLEAR_NODE() puts a node into that state.
 * The argument is parenthesized on both sides of the comparison so that
 * expressions such as `cond ? a : b` are compared as a whole.
 */
#define RB_EMPTY_NODE(node) (rb_parent(node) == (node))
#define RB_CLEAR_NODE(node) (rb_set_parent(node, node))
/*
 * Reset a node to the unlinked state: no children, colour bits cleared,
 * and the parent pointer aimed at the node itself (RB_CLEAR_NODE), which
 * is what RB_EMPTY_NODE() tests for.
 */
static inline void rb_init_node(struct rb_node *rb)
{
	rb->rb_left = rb->rb_right = NULL;
	/* zero the word first so RB_CLEAR_NODE keeps cleared colour bits */
	rb->rb_parent_color = 0;
	RB_CLEAR_NODE(rb);
}
/*
 * Rebalance and recolour the tree around a node; the implementation walks
 * up from the node via rb_parent(), so the node is presumably expected to
 * have been linked in (e.g. with rb_link_node()) beforehand.
 */
extern void rb_insert_color(struct rb_node *, struct rb_root *);
/* Unlink a node from the tree and rebalance if a black node was removed. */
extern void rb_erase(struct rb_node *, struct rb_root *);
/* Find logical next and previous nodes in a tree */
/* Both return NULL when there is no such node or the node is unlinked. */
extern struct rb_node *rb_next(const struct rb_node *);
extern struct rb_node *rb_prev(const struct rb_node *);
/* Leftmost / rightmost node of the tree, or NULL when the tree is empty. */
extern struct rb_node *rb_first(const struct rb_root *);
extern struct rb_node *rb_last(const struct rb_root *);
/*
 * Attach node as a leaf below parent.
 *
 * node    - node being inserted; its children are cleared.
 * parent  - node that becomes the parent (stored with both low bits of
 *           rb_parent_color clear, i.e. the colour bit reads as RB_RED).
 * rb_link - slot that receives the new node pointer.
 */
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
				struct rb_node **rb_link)
{
	node->rb_left = NULL;
	node->rb_right = NULL;
	node->rb_parent_color = (unsigned long)parent;

	*rb_link = node;
}
/***************************************************************************
*
* End of Linux kernel red black tree emulation.
*
***************************************************************************/
typedef ssize_t (*read_actor_t)();
#define CFS_IFSHIFT 12
......
......@@ -44,7 +44,8 @@ if LIBLUSTRE
noinst_LIBRARIES= libcfs.a
libcfs_a_SOURCES= posix/posix-debug.c user-prim.c user-lock.c user-tcpip.c \
prng.c user-bitops.c user-mem.c hash.c kernel_user_comm.c \
workitem.c fail.c libcfs_cpu.c libcfs_mem.c libcfs_lock.c
workitem.c fail.c libcfs_cpu.c libcfs_mem.c libcfs_lock.c \
posix/rbtree.c
libcfs_a_CPPFLAGS = $(LLCPPFLAGS)
libcfs_a_CFLAGS = $(LLCFLAGS)
endif
......@@ -70,7 +71,7 @@ nodist_libcfs_SOURCES := darwin/darwin-sync.c darwin/darwin-mem.c \
darwin/darwin-debug.c darwin/darwin-proc.c \
darwin/darwin-tracefile.c darwin/darwin-module.c \
posix/posix-debug.c module.c tracefile.c nidstrings.c watchdog.c \
kernel_user_comm.c hash.c
kernel_user_comm.c hash.c posix/rbtree.c
libcfs_CFLAGS := $(EXTRA_KCFLAGS)
libcfs_LDFLAGS := $(EXTRA_KLDFLAGS)
......
/*
Red Black Trees
(C) 1999 Andrea Arcangeli <andrea@suse.de>
(C) 2002 David Woodhouse <dwmw2@infradead.org>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
linux/lib/rbtree.c
*/
#include <libcfs/libcfs.h>
/*
 * Left rotation around node: node's right child takes node's place, node
 * becomes that child's left child, and the child's former left subtree is
 * re-hung as node's right subtree. Updates root when node was the root.
 */
static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *pivot = node->rb_right;
	struct rb_node *above = rb_parent(node);
	struct rb_node *inner = pivot->rb_left;

	/* hand the pivot's left subtree over to node */
	node->rb_right = inner;
	if (inner != NULL)
		rb_set_parent(inner, node);

	/* lift the pivot above node */
	pivot->rb_left = node;
	rb_set_parent(pivot, above);

	/* repoint whatever referenced node (its parent or the root) */
	if (!above)
		root->rb_node = pivot;
	else if (above->rb_left == node)
		above->rb_left = pivot;
	else
		above->rb_right = pivot;

	rb_set_parent(node, pivot);
}
/*
 * Right rotation around node: node's left child takes node's place, node
 * becomes that child's right child, and the child's former right subtree
 * is re-hung as node's left subtree. Updates root when node was the root.
 */
static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *pivot = node->rb_left;
	struct rb_node *above = rb_parent(node);
	struct rb_node *inner = pivot->rb_right;

	/* hand the pivot's right subtree over to node */
	node->rb_left = inner;
	if (inner != NULL)
		rb_set_parent(inner, node);

	/* lift the pivot above node */
	pivot->rb_right = node;
	rb_set_parent(pivot, above);

	/* repoint whatever referenced node (its parent or the root) */
	if (!above)
		root->rb_node = pivot;
	else if (above->rb_right == node)
		above->rb_right = pivot;
	else
		above->rb_left = pivot;

	rb_set_parent(node, pivot);
}
/*
 * Restore the red-black colouring after a node has been attached.
 *
 * Walks upward from node for as long as its parent exists and is red.
 * In each step the "uncle" (the grandparent's other child) decides the
 * action: a red uncle is handled by recolouring and continuing from the
 * grandparent; otherwise one or two rotations finish the job. The root
 * is forced black at the end.
 */
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
struct rb_node *parent, *gparent;
while ((parent = rb_parent(node)) != NULL && rb_is_red(parent)) {
gparent = rb_parent(parent);
/* parent is the left child of the grandparent */
if (parent == gparent->rb_left) {
register struct rb_node *uncle = gparent->rb_right;
/* red uncle: push blackness down from the grandparent, go up */
if (uncle && rb_is_red(uncle)) {
rb_set_black(uncle);
rb_set_black(parent);
rb_set_red(gparent);
node = gparent;
continue;
}
/* node is an inner (right) child: rotate so it becomes outer */
if (parent->rb_right == node) {
register struct rb_node *tmp;
__rb_rotate_left(parent, root);
/* after the rotation the roles of node and parent are swapped */
tmp = parent;
parent = node;
node = tmp;
}
/* outer child: recolour and rotate the grandparent down */
rb_set_black(parent);
rb_set_red(gparent);
__rb_rotate_right(gparent, root);
} else {
/* mirror image: parent is the right child of the grandparent */
register struct rb_node *uncle = gparent->rb_left;
if (uncle && rb_is_red(uncle)) {
rb_set_black(uncle);
rb_set_black(parent);
rb_set_red(gparent);
node = gparent;
continue;
}
if (parent->rb_left == node) {
register struct rb_node *tmp;
__rb_rotate_right(parent, root);
tmp = parent;
parent = node;
node = tmp;
}
rb_set_black(parent);
rb_set_red(gparent);
__rb_rotate_left(gparent, root);
}
}
/* the root is always black */
rb_set_black(root->rb_node);
}
/*
 * Rebalance after rb_erase() removed a black node.
 *
 * node   - child that took the removed node's place (may be NULL).
 * parent - parent of that position.
 * root   - tree being fixed.
 *
 * Loops while node is NULL or black and is not the root. `ptr` is the
 * sibling of node (parent's other child). On exit, a non-NULL node is
 * coloured black.
 */
static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
struct rb_root *root)
{
struct rb_node *ptr;
while ((!node || rb_is_black(node)) && node != root->rb_node) {
/* node is the left child; sibling is on the right */
if (parent->rb_left == node) {
ptr = parent->rb_right;
/* red sibling: rotate so that the sibling becomes black */
if (rb_is_red(ptr)) {
rb_set_black(ptr);
rb_set_red(parent);
__rb_rotate_left(parent, root);
ptr = parent->rb_right;
}
/* both of the sibling's children are black (or absent):
 * recolour the sibling red and continue one level up */
if ((!ptr->rb_left || rb_is_black(ptr->rb_left)) &&
(!ptr->rb_right || rb_is_black(ptr->rb_right))) {
rb_set_red(ptr);
node = parent;
parent = rb_parent(node);
} else {
/* sibling's far (right) child is black: rotate the
 * sibling so the red child ends up on the far side */
if (!ptr->rb_right ||
rb_is_black(ptr->rb_right)) {
rb_set_black(ptr->rb_left);
rb_set_red(ptr);
__rb_rotate_right(ptr, root);
ptr = parent->rb_right;
}
/* final rotation: sibling inherits parent's colour */
rb_set_color(ptr, rb_color(parent));
rb_set_black(parent);
rb_set_black(ptr->rb_right);
__rb_rotate_left(parent, root);
/* done; point node at the root so it is blackened below */
node = root->rb_node;
break;
}
} else {
/* mirror image: node is the right child, sibling on the left */
ptr = parent->rb_left;
if (rb_is_red(ptr)) {
rb_set_black(ptr);
rb_set_red(parent);
__rb_rotate_right(parent, root);
ptr = parent->rb_left;
}
if ((!ptr->rb_left || rb_is_black(ptr->rb_left)) &&
(!ptr->rb_right || rb_is_black(ptr->rb_right))) {
rb_set_red(ptr);
node = parent;
parent = rb_parent(node);
} else {
if (!ptr->rb_left ||
rb_is_black(ptr->rb_left)) {
rb_set_black(ptr->rb_right);
rb_set_red(ptr);
__rb_rotate_left(ptr, root);
ptr = parent->rb_left;
}
rb_set_color(ptr, rb_color(parent));
rb_set_black(parent);
rb_set_black(ptr->rb_left);
__rb_rotate_right(parent, root);
node = root->rb_node;
break;
}
}
}
if (node)
rb_set_black(node);
}
/*
 * Unlink node from the tree rooted at root.
 *
 * A node with at most one child is replaced by that child. A node with
 * two children is replaced by its in-order successor (the leftmost node
 * of its right subtree), which takes over the removed node's links and
 * colour. If the node that was physically removed from its position was
 * black, __rb_erase_color() is called to rebalance.
 */
void rb_erase(struct rb_node *node, struct rb_root *root)
{
struct rb_node *child, *parent;
int color;
if (!node->rb_left)
child = node->rb_right;
else if (!node->rb_right)
child = node->rb_left;
else {
/* two children: find the successor of the node being erased */
struct rb_node *old = node, *left;
node = node->rb_right;
while ((left = node->rb_left) != NULL)
node = left;
/* make old's parent (or the root) point at the successor */
if (rb_parent(old)) {
if (rb_parent(old)->rb_left == old)
rb_parent(old)->rb_left = node;
else
rb_parent(old)->rb_right = node;
} else
root->rb_node = node;
/* remember the successor's own child, parent and colour: that is
 * the position actually vacated in the tree */
child = node->rb_right;
parent = rb_parent(node);
color = rb_color(node);
if (parent == old) {
/* successor is old's direct right child; it keeps its subtree */
parent = node;
} else {
/* detach the successor from its parent and adopt old's
 * right subtree */
if (child)
rb_set_parent(child, parent);
parent->rb_left = child;
node->rb_right = old->rb_right;
rb_set_parent(old->rb_right, node);
}
/* successor takes over old's parent, colour and left subtree */
node->rb_parent_color = old->rb_parent_color;
node->rb_left = old->rb_left;
rb_set_parent(old->rb_left, node);
goto color;
}
/* zero or one child: splice the child into node's place */
parent = rb_parent(node);
color = rb_color(node);
if (child)
rb_set_parent(child, parent);
if (parent) {
if (parent->rb_left == node)
parent->rb_left = child;
else
parent->rb_right = child;
} else
root->rb_node = child;
color:
/* only the removal of a black node requires rebalancing */
if (color == RB_BLACK)
__rb_erase_color(child, parent, root);
}
/*
 * Return the first node of the tree in sort order, i.e. the leftmost
 * node, or NULL when the tree is empty.
 */
struct rb_node *rb_first(const struct rb_root *root)
{
	struct rb_node *cur = root->rb_node;

	if (cur != NULL) {
		/* keep descending to the left until there is nowhere to go */
		while (cur->rb_left != NULL)
			cur = cur->rb_left;
	}
	return cur;
}
/*
 * Return the last node of the tree in sort order, i.e. the rightmost
 * node, or NULL when the tree is empty.
 */
struct rb_node *rb_last(const struct rb_root *root)
{
	struct rb_node *cur = root->rb_node;

	if (cur != NULL) {
		/* keep descending to the right until there is nowhere to go */
		while (cur->rb_right != NULL)
			cur = cur->rb_right;
	}
	return cur;
}
/*
 * Return the in-order successor of node, or NULL when node is the last
 * node or is not linked into a tree (its parent pointer refers to itself).
 */
struct rb_node *rb_next(const struct rb_node *node)
{
	struct rb_node *up;

	/* a self-parented node is not in any tree */
	if (rb_parent(node) == node)
		return NULL;

	/* With a right subtree, the successor is its leftmost node. */
	if (node->rb_right != NULL) {
		const struct rb_node *cur = node->rb_right;

		while (cur->rb_left != NULL)
			cur = cur->rb_left;
		return (struct rb_node *)cur;
	}

	/*
	 * Without a right subtree the successor lies above us: climb while
	 * we arrive from a right child; the first ancestor reached from its
	 * left child is the answer (NULL if we run out of ancestors).
	 */
	for (up = rb_parent(node);
	     up != NULL && node == up->rb_right;
	     up = rb_parent(node))
		node = up;

	return up;
}
/*
 * Return the in-order predecessor of node, or NULL when node is the first
 * node or is not linked into a tree (its parent pointer refers to itself).
 */
struct rb_node *rb_prev(const struct rb_node *node)
{
	struct rb_node *up;

	/* a self-parented node is not in any tree */
	if (rb_parent(node) == node)
		return NULL;

	/* With a left subtree, the predecessor is its rightmost node. */
	if (node->rb_left != NULL) {
		const struct rb_node *cur = node->rb_left;

		while (cur->rb_right != NULL)
			cur = cur->rb_right;
		return (struct rb_node *)cur;
	}

	/*
	 * Without a left subtree, climb while we arrive from a left child;
	 * the first ancestor reached from its right child is the answer.
	 */
	for (up = rb_parent(node);
	     up != NULL && node == up->rb_left;
	     up = rb_parent(node))
		node = up;

	return up;
}
......@@ -1054,6 +1054,15 @@ struct cl_page_operations {
*/
int (*cpo_cancel)(const struct lu_env *env,
const struct cl_page_slice *slice);
/**
* Write out a page on behalf of the kernel. Currently this is only
* called by ll_writepage().
*
* \see cl_page_flush()
*/
int (*cpo_flush)(const struct lu_env *env,
const struct cl_page_slice *slice,
struct cl_io *io);
/** @} transfer */
};
......@@ -1960,11 +1969,6 @@ enum cl_io_state {
CIS_FINI
};
/**
 * Priority value taken by the submit entry points whose signatures carry
 * an `enum cl_req_priority priority` parameter.
 * NOTE(review): the surrounding hunk header shrinks this region from 11
 * to 6 lines, which suggests this enum is being removed by the change --
 * confirm before relying on it.
 */
enum cl_req_priority {
CRP_NORMAL,
CRP_CANCEL
};
/**
* IO state private for a layer.
*
......@@ -2082,8 +2086,7 @@ struct cl_io_operations {
int (*cio_submit)(const struct lu_env *env,
const struct cl_io_slice *slice,
enum cl_req_type crt,
struct cl_2queue *queue,
enum cl_req_priority priority);
struct cl_2queue *queue);
} req_op[CRT_NR];
/**
* Read missing page.
......@@ -2245,6 +2248,18 @@ enum cl_io_lock_dmd {
CILR_PEEK
};
/**
 * How an fsync IO treats dirty pages; this is the type of the fi_mode
 * field of the fsync IO parameters.
 */
enum cl_fsync_mode {
/** start writeback, but do not wait for it to finish */
CL_FSYNC_NONE = 0,
/** start writeback and wait for it to finish */
CL_FSYNC_LOCAL = 1,
/** discard all dirty pages in a specific file range */
CL_FSYNC_DISCARD = 2,
/** start writeback and make sure the data has reached storage before
 * returning. An OST_SYNC RPC must be issued and finished */
CL_FSYNC_ALL = 3
};
struct cl_io_rw_common {
loff_t crw_pos;
size_t crw_count;
......@@ -2291,6 +2306,7 @@ struct cl_io {
struct cl_wr_io {
struct cl_io_rw_common wr;
int wr_append;
int wr_sync;
} ci_wr;
struct cl_io_rw_common ci_rw;
struct cl_setattr_io {
......@@ -2318,6 +2334,9 @@ struct cl_io {
struct obd_capa *fi_capa;
/** file system level fid */
struct lu_fid *fi_fid;
enum cl_fsync_mode fi_mode;
/* how many pages were written/discarded */
unsigned int fi_nr_written;
} ci_fsync;
} u;
struct cl_2queue ci_queue;
......@@ -2769,6 +2788,8 @@ int cl_page_cache_add (const struct lu_env *env, struct cl_io *io,
void cl_page_clip (const struct lu_env *env, struct cl_page *pg,
int from, int to);
int cl_page_cancel (const struct lu_env *env, struct cl_page *page);
int cl_page_flush (const struct lu_env *env, struct cl_io *io,
struct cl_page *pg);
/** @} transfer */
......@@ -2815,9 +2836,19 @@ struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
const struct cl_lock_descr *need,
const char *scope, const void *source);
struct cl_lock *cl_lock_at_page(const struct lu_env *env, struct cl_object *obj,
struct cl_page *page, struct cl_lock *except,
int pending, int canceld);
struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
struct cl_object *obj, pgoff_t index,
struct cl_lock *except, int pending,
int canceld);
/**
 * Page-based convenience wrapper around cl_lock_at_pgoff(): performs the
 * lookup at the page's index (page->cp_index) and forwards every other
 * argument unchanged, returning whatever cl_lock_at_pgoff() returns.
 */
static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env,
struct cl_object *obj,
struct cl_page *page,
struct cl_lock *except,
int pending, int canceld)
{
return cl_lock_at_pgoff(env, obj, page->cp_index, except,
pending, canceld);
}
const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
const struct lu_device_type *dtype);
......@@ -2899,8 +2930,7 @@ int cl_lock_mutex_try (const struct lu_env *env, struct cl_lock *lock);
void cl_lock_mutex_put (const struct lu_env *env, struct cl_lock *lock);
int cl_lock_is_mutexed (struct cl_lock *lock);
int cl_lock_nr_mutexed (const struct lu_env *env);
int cl_lock_page_out (const struct lu_env *env, struct cl_lock *lock,
int discard);
int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock);
int cl_lock_ext_match (const struct cl_lock_descr *has,
const struct cl_lock_descr *need);
int cl_lock_descr_match(const struct cl_lock_descr *has,
......@@ -2958,11 +2988,10 @@ int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
int cl_io_commit_write (const struct lu_env *env, struct cl_io *io,
struct cl_page *page, unsigned from, unsigned to);
int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io,
enum cl_req_type iot, struct cl_2queue *queue,
enum cl_req_priority priority);
enum cl_req_type iot, struct cl_2queue *queue);
int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io,
enum cl_req_type iot, struct cl_2queue *queue,
enum cl_req_priority priority, long timeout);
enum cl_req_type iot, struct cl_2queue *queue,
long timeout);
void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io,
size_t nob);
int cl_io_cancel (const struct lu_env *env, struct cl_io *io,
......@@ -2977,6 +3006,16 @@ static inline int cl_io_is_append(const struct cl_io *io)
return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append;
}
/**
 * True (1) iff \a io is a write (CIT_WRITE) whose wr_sync flag is set;
 * 0 otherwise.
 */
static inline int cl_io_is_sync_write(const struct cl_io *io)
{
	if (io->ci_type != CIT_WRITE)
		return 0;

	return io->u.ci_wr.wr_sync != 0;
}
/**
 * True (1) iff \a io is a page fault (CIT_FAULT) whose ft_mkwrite flag is
 * set; 0 otherwise.
 */
static inline int cl_io_is_mkwrite(const struct cl_io *io)
{
	if (io->ci_type != CIT_FAULT)
		return 0;

	return io->u.ci_fault.ft_mkwrite != 0;
}
/**
* True, iff \a io is a truncate(2).
*/
......
......@@ -54,6 +54,7 @@
# include <linux/lustre_intent.h>
#endif
#define CLIENT_OBD_LIST_LOCK_DEBUG 1
typedef struct {
cfs_spinlock_t lock;
......
......@@ -244,7 +244,7 @@ union ptlrpc_async_args {
* least big enough for that.
*/
void *pointer_arg[11];
__u64 space[6];
__u64 space[7];
};
struct ptlrpc_request_set;
......
......@@ -453,10 +453,22 @@ struct client_obd {
long cl_dirty_transit; /* dirty synchronous */
long cl_avail_grant; /* bytes of credit for ost */
long cl_lost_grant; /* lost credits (trunc) */
cfs_list_t cl_cache_waiters; /* waiting for cache/grant */