From 2ce82af177059e9fe56f1a48a38b2b9338b74b4c Mon Sep 17 00:00:00 2001
From: pravins <pravins>
Date: Thu, 10 Apr 2008 05:17:55 +0000
Subject: [PATCH] b=14340 i=yury.umanets i=adilger

This patch extends the lu_dirent->lde_hash element to 64 bits, which is
required for the DMU-based MDD, but I am keeping the MEA_MAGIC_HASH_SEGMENT
hash within 32 bits.
---
 lustre/include/dt_object.h         |  4 +-
 lustre/include/lu_object.h         |  2 +-
 lustre/include/lustre/lustre_idl.h |  8 ++--
 lustre/include/obd.h               |  5 ++-
 lustre/liblustre/dir.c             |  8 ++--
 lustre/llite/dir.c                 | 22 +++++------
 lustre/lmv/lmv_obd.c               | 63 ++++++++++++++++++++----------
 lustre/mdd/mdd_object.c            | 10 ++---
 lustre/obdclass/mea.c              |  8 ++--
 lustre/osd/osd_handler.c           |  4 +-
 10 files changed, 78 insertions(+), 56 deletions(-)

diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h
index 563adc16ef..3d2d9204e9 100644
--- a/lustre/include/dt_object.h
+++ b/lustre/include/dt_object.h
@@ -381,10 +381,10 @@ struct dt_index_operations {
                                       const struct dt_it *di);
                 struct dt_rec *(*rec)(const struct lu_env *env,
                                       const struct dt_it *di);
-                __u32 (*store)(const struct lu_env *env,
+                __u64 (*store)(const struct lu_env *env,
                                const struct dt_it *di);
                 int (*load)(const struct lu_env *env,
-                            const struct dt_it *di, __u32 hash);
+                            const struct dt_it *di, __u64 hash);
         } dio_it;
 };

diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h
index 5c64d90595..ef3645dab1 100644
--- a/lustre/include/lu_object.h
+++ b/lustre/include/lu_object.h
@@ -836,7 +836,7 @@ static inline __u32 lu_object_attr(const struct lu_object *o)

 struct lu_rdpg {
         /* input params, should be filled out by mdt */
-        __u32           rp_hash;       /* hash */
+        __u64           rp_hash;       /* hash */
         int             rp_count;      /* count in bytes */
         int             rp_npages;     /* number of pages */
         struct page   **rp_pages;      /* pointers to pages */

diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h
index 65e1f9179f..f8061b920e 100644
--- a/lustre/include/lustre/lustre_idl.h
+++ b/lustre/include/lustre/lustre_idl.h
@@ -351,15 +351,15 @@ static inline int lu_fid_eq(const struct lu_fid *f0,
  */
 struct lu_dirent {
         struct lu_fid lde_fid;
-        __u32         lde_hash;
+        __u64         lde_hash;
         __u16         lde_reclen;
         __u16         lde_namelen;
         char          lde_name[0];
 };

 struct lu_dirpage {
-        __u32            ldp_hash_start;
-        __u32            ldp_hash_end;
+        __u64            ldp_hash_start;
+        __u64            ldp_hash_end;
         __u16            ldp_flags;
         __u32            ldp_pad0;
         struct lu_dirent ldp_entries[0];
@@ -398,7 +398,7 @@ static inline int lu_dirent_size(struct lu_dirent *ent)
         return le16_to_cpu(ent->lde_reclen);
 }

-#define DIR_END_OFF            0xfffffffeUL
+#define DIR_END_OFF            0xfffffffffffffffeULL

 struct lustre_handle {
         __u64 cookie;

diff --git a/lustre/include/obd.h b/lustre/include/obd.h
index 2f4f7713aa..15d9377943 100644
--- a/lustre/include/obd.h
+++ b/lustre/include/obd.h
@@ -1256,8 +1256,9 @@ enum {
 #define MEA_MAGIC_ALL_CHARS      0xb222a11c
 #define MEA_MAGIC_HASH_SEGMENT   0xb222a11b

-#define MAX_HASH_SIZE            0x7fffffffUL
-#define MAX_HASH_HIGHEST_BIT     0x10000000
+#define MAX_HASH_SIZE_32         0x7fffffffUL
+#define MAX_HASH_SIZE            0x7fffffffffffffffULL
+#define MAX_HASH_HIGHEST_BIT     0x1000000000000000

 struct lustre_md {
         struct mdt_body *body;

diff --git a/lustre/liblustre/dir.c b/lustre/liblustre/dir.c
index a297c9be62..72b5157322 100644
--- a/lustre/liblustre/dir.c
+++ b/lustre/liblustre/dir.c
@@ -238,8 +238,8 @@ ssize_t llu_iop_filldirentries(struct inode *dir, _SYSIO_OFF_T *basep,
                 * If page is empty (end of directoryis reached),
                 * use this value.
                 */
-                __u32 hash = DIR_END_OFF;
-                __u32 next;
+                __u64 hash = DIR_END_OFF;
+                __u64 next;

                 dp = page->addr;
                 for (ent = lu_dirent_start(dp); ent != NULL && !done;
@@ -249,7 +249,7 @@ ssize_t llu_iop_filldirentries(struct inode *dir, _SYSIO_OFF_T *basep,
                         struct lu_fid fid;
                         ino_t ino;

-                        hash = le32_to_cpu(ent->lde_hash);
+                        hash = le64_to_cpu(ent->lde_hash);
                         namelen = le16_to_cpu(ent->lde_namelen);

                         if (hash < pos)
@@ -274,7 +274,7 @@ ssize_t llu_iop_filldirentries(struct inode *dir, _SYSIO_OFF_T *basep,
                                        (loff_t)hash, ino, DT_UNKNOWN, &filled);
                 }
-                next = le32_to_cpu(dp->ldp_hash_end);
+                next = le64_to_cpu(dp->ldp_hash_end);
                 OBD_PAGE_FREE(page);
                 if (!done) {
                         pos = next;

diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c
index c230e029a0..74f8abca9b 100644
--- a/lustre/llite/dir.c
+++ b/lustre/llite/dir.c
@@ -201,7 +201,7 @@ static inline void ll_put_page(struct page *page)
  * Find, kmap and return page that contains given hash.
  */
 static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash,
-                                       __u32 *start, __u32 *end)
+                                       __u64 *start, __u64 *end)
 {
         struct address_space *mapping = dir->i_mapping;
         /*
@@ -232,8 +232,8 @@ static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash,
         wait_on_page(page);
         if (PageUptodate(page)) {
                 dp = kmap(page);
-                *start = le32_to_cpu(dp->ldp_hash_start);
-                *end = le32_to_cpu(dp->ldp_hash_end);
+                *start = le64_to_cpu(dp->ldp_hash_start);
+                *end = le64_to_cpu(dp->ldp_hash_end);
                 LASSERT(*start <= hash);
                 if (hash > *end || (*end != *start && hash == *end)) {
                         kunmap(page);
@@ -265,8 +265,8 @@ static struct page *ll_get_dir_page(struct inode *dir, __u32 hash, int exact,
         struct page *page;
         ldlm_mode_t mode;
         int rc;
-        __u32 start;
-        __u32 end;
+        __u64 start;
+        __u64 end;

         mode = LCK_PR;
         rc = md_lock_match(ll_i2sbi(dir)->ll_md_exp, LDLM_FL_BLOCK_GRANTED,
@@ -354,8 +354,8 @@ static struct page *ll_get_dir_page(struct inode *dir, __u32 hash, int exact,

 hash_collision:
         dp = page_address(page);
-        start = le32_to_cpu(dp->ldp_hash_start);
-        end = le32_to_cpu(dp->ldp_hash_end);
+        start = le64_to_cpu(dp->ldp_hash_start);
+        end = le64_to_cpu(dp->ldp_hash_end);
         if (end == start) {
                 LASSERT(start == hash);
                 CWARN("Page-wide hash collision: %#lx\n", (unsigned long)end);
@@ -415,8 +415,8 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                 * If page is empty (end of directoryis reached),
                 * use this value.
                 */
-                __u32 hash = DIR_END_OFF;
-                __u32 next;
+                __u64 hash = DIR_END_OFF;
+                __u64 next;

                 dp = page_address(page);
                 for (ent = lu_dirent_start(dp); ent != NULL && !done;
@@ -430,7 +430,7 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                         * XXX: implement correct swabbing here.
                         */
-                        hash = le32_to_cpu(ent->lde_hash);
+                        hash = le64_to_cpu(ent->lde_hash);
                         namelen = le16_to_cpu(ent->lde_namelen);

                         if (hash < pos)
@@ -454,7 +454,7 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir)
                         done = filldir(cookie, name, namelen,
                                        (loff_t)hash, ino, DT_UNKNOWN);
                 }
-                next = le32_to_cpu(dp->ldp_hash_end);
+                next = le64_to_cpu(dp->ldp_hash_end);
                 ll_put_page(page);
                 if (!done) {
                         pos = next;

diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c
index 34e539ca2c..501592ce03 100644
--- a/lustre/lmv/lmv_obd.c
+++ b/lustre/lmv/lmv_obd.c
@@ -2182,15 +2182,15 @@ int lmv_blocking_ast(struct ldlm_lock *lock,
         RETURN(0);
 }

-static void lmv_hash_adjust(__u32 *hash, __u32 hash_adj)
+static void lmv_hash_adjust(__u64 *hash, __u64 hash_adj)
 {
-        __u32 val;
+        __u64 val;

-        val = le32_to_cpu(*hash);
+        val = le64_to_cpu(*hash);
         if (val < hash_adj)
                 val += MAX_HASH_SIZE;
         if (val != DIR_END_OFF)
-                *hash = cpu_to_le32(val - hash_adj);
+                *hash = cpu_to_le64(val - hash_adj);
 }

 static __u32 lmv_node_rank(struct obd_export *exp, const struct lu_fid *fid)
@@ -2219,42 +2219,63 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid,
         struct obd_export *tgt_exp;
         struct lu_fid rid = *fid;
         struct lmv_obj *obj;
-        __u32 offset0;
-        __u32 offset;
-        __u32 hash_adj = 0;
+        __u64 offset;
+        __u64 hash_adj = 0;
         __u32 rank = 0;
-        __u32 seg_size = 0;
+        __u64 seg_size = 0;
+        __u64 tgt_tmp = 0;
         int tgt = 0;
         int tgt0 = 0;
         int rc;
         int nr = 0;
         ENTRY;

-        offset0 = offset = offset64;
-        /*
-         * Check that offset is representable by 32bit number.
-         */
-        LASSERT((__u64)offset == offset64);
+        offset = offset64;

         rc = lmv_check_connect(obd);
         if (rc)
                 RETURN(rc);

-        CDEBUG(D_INFO, "READPAGE at %x from "DFID"\n", offset, PFID(&rid));
+        CDEBUG(D_INFO, "READPAGE at %llx from "DFID"\n", offset, PFID(&rid));

         obj = lmv_obj_grab(obd, fid);
         if (obj) {
+
+                /*
+                 * This case handles directory lookup in the clustered metadata
+                 * case (i.e. a split directory located on multiple MD servers):
+                 * each server keeps directory entries for a certain range of hashes.
+                 * E.g. suppose we have N servers and the hash range is 0 to MAX_HASH.
+                 * The first server keeps records with hashes [0 ... MAX_HASH / N - 1],
+                 * the second one with hashes [MAX_HASH / N ... 2 * MAX_HASH / N],
+                 * and so on.
+                 * readdir could simply read entries from servers 0 to N in order,
+                 * but that would not scale well, since all clients would request
+                 * the directory from the servers in the same order.
+                 * The following algorithm optimizes this:
+                 * instead of doing readdir in the order 1, 2, ..., N, a client with
+                 * rank R does readdir in the order R, R + 1, ..., N, 1, ..., R - 1
+                 * (every client has a rank R).
+                 * But ll_readdir() expects offsets in the range [0 to MAX_HASH/N),
+                 * while a client asking MDS{R} for the directory gets pages with
+                 * offsets [R*MAX_HASH/N ... (R + 1)*MAX_HASH/N]; therefore we apply
+                 * hash_adj to the hash values that we get.
+                 */
                 struct lmv_inode *loi;

                 lmv_obj_lock(obj);

                 nr = obj->lo_objcount;
                 LASSERT(nr > 0);
-                seg_size = MAX_HASH_SIZE / nr;
+                seg_size = MAX_HASH_SIZE;
+                do_div(seg_size, nr);
                 loi = obj->lo_inodes;
                 rank = lmv_node_rank(lmv_get_export(lmv, loi[0].li_mds), fid) % nr;
-                tgt0 = (offset / seg_size) % nr;
+                tgt_tmp = offset;
+                do_div(tgt_tmp, seg_size);
+                tgt0 = do_div(tgt_tmp, nr);
                 tgt = (tgt0 + rank) % nr;

                 if (tgt < tgt0)
@@ -2270,10 +2291,10 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid,

                 hash_adj += rank * seg_size;

-                CDEBUG(D_INFO, "hash_adj: %x %x %x/%x -> %x/%x\n",
+                CDEBUG(D_INFO, "hash_adj: %x %llx %llx/%x -> %llx/%x\n",
                        rank, hash_adj, offset, tgt0, offset + hash_adj, tgt);

-                offset = (offset + hash_adj) % MAX_HASH_SIZE;
+                offset = (offset + hash_adj) & MAX_HASH_SIZE;
                 rid = obj->lo_inodes[tgt].li_fid;
                 tgt_exp = lmv_get_export(lmv, loi[tgt].li_mds);

@@ -2296,7 +2317,7 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid,
                 lmv_hash_adjust(&dp->ldp_hash_start, hash_adj);
                 lmv_hash_adjust(&dp->ldp_hash_end, hash_adj);

-                LASSERT(cpu_to_le32(dp->ldp_hash_start) <= offset0);
+                LASSERT(cpu_to_le32(dp->ldp_hash_start) <= offset64);

                 for (ent = lu_dirent_start(dp); ent != NULL;
                      ent = lu_dirent_next(ent))
@@ -2309,9 +2330,9 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid,
                 if (end == DIR_END_OFF) {
                         dp->ldp_hash_end = cpu_to_le32(seg_size * (tgt0 + 1));

-                        CDEBUG(D_INFO, ""DFID" reset end %x tgt %d\n",
+                        CDEBUG(D_INFO, ""DFID" reset end %llx tgt %d\n",
                                PFID(&rid),
-                                le32_to_cpu(dp->ldp_hash_end), tgt);
+                                le64_to_cpu(dp->ldp_hash_end), tgt);
                 }
         }
         cfs_kunmap(page);

diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c
index 6400e41858..2cc5974744 100644
--- a/lustre/mdd/mdd_object.c
+++ b/lustre/mdd/mdd_object.c
@@ -1315,7 +1315,7 @@ static int mdd_readpage_sanity_check(const struct lu_env *env,

 static int mdd_dir_page_build(const struct lu_env *env, int first,
                               void *area, int nob, struct dt_it_ops *iops,
-                              struct dt_it *it, __u32 *start, __u32 *end,
+                              struct dt_it *it, __u64 *start, __u64 *end,
                               struct lu_dirent **last)
 {
         struct lu_fid *fid = &mdd_env_info(env)->mti_fid2;
@@ -1338,7 +1338,7 @@ static int mdd_dir_page_build(const struct lu_env *env, int first,
                 char *name;
                 int len;
                 int recsize;
-                __u32 hash;
+                __u64 hash;

                 name = (char *)iops->key(env, it);
                 len = iops->key_size(env, it);
@@ -1352,7 +1352,7 @@ static int mdd_dir_page_build(const struct lu_env *env, int first,
                 hash = iops->store(env, it);
                 *end = hash;

-                CDEBUG(D_INFO, "%p %p %d "DFID": %#8.8x (%d) \"%*.*s\"\n",
+                CDEBUG(D_INFO, "%p %p %d "DFID": "LPU64" (%d) \"%*.*s\"\n",
                        name, ent, nob, PFID(fid), hash, len, len, len, name);

                 if (nob >= recsize) {
@@ -1394,8 +1394,8 @@ static int __mdd_readpage(const struct lu_env *env, struct mdd_object *obj,
         int i;
         int rc;
         int nob;
-        __u32 hash_start;
-        __u32 hash_end;
+        __u64 hash_start;
+        __u64 hash_end;

         LASSERT(rdpg->rp_pages != NULL);
         LASSERT(next->do_index_ops != NULL);

diff --git a/lustre/obdclass/mea.c b/lustre/obdclass/mea.c
index 15f15d895c..a7bd3e4315 100644
--- a/lustre/obdclass/mea.c
+++ b/lustre/obdclass/mea.c
@@ -117,8 +117,8 @@ static __u32 hash_build(const char *name, int namelen)
 {
         __u32 hash;

-        hash = (hash_build0(name, namelen) << 1) & MAX_HASH_SIZE;
-        if (hash > MAX_HASH_SIZE - HASH_GRAY_AREA)
+        hash = (hash_build0(name, namelen) << 1) & MAX_HASH_SIZE_32;
+        if (hash > MAX_HASH_SIZE_32 - HASH_GRAY_AREA)
                 hash &= HASH_GRAY_AREA - 1;
         return hash;
 }

@@ -127,9 +127,9 @@ static int mea_hash_segment(int count,
                             const char *name, int namelen)
 {
         __u32 hash;

-        LASSERT(IS_PO2(MAX_HASH_SIZE + 1));
+        LASSERT(IS_PO2(MAX_HASH_SIZE_32 + 1));

-        hash = hash_build(name, namelen) / (MAX_HASH_SIZE / count);
+        hash = hash_build(name, namelen) / (MAX_HASH_SIZE_32 / count);
         LASSERTF(hash < count, "hash %x count %d \n", hash, count);
         return hash;

diff --git a/lustre/osd/osd_handler.c b/lustre/osd/osd_handler.c
index 3c59117646..68f2e3e758 100644
--- a/lustre/osd/osd_handler.c
+++ b/lustre/osd/osd_handler.c
@@ -1909,7 +1909,7 @@ static struct dt_rec *osd_it_rec(const struct lu_env *env,
         return (struct dt_rec *)iam_it_rec_get(&it->oi_it);
 }

-static __u32 osd_it_store(const struct lu_env *env, const struct dt_it *di)
+static __u64 osd_it_store(const struct lu_env *env, const struct dt_it *di)
 {
         struct osd_it *it = (struct osd_it *)di;

@@ -1917,7 +1917,7 @@ static __u32 osd_it_store(const struct lu_env *env, const struct dt_it *di)
 }

 static int osd_it_load(const struct lu_env *env,
-                       const struct dt_it *di, __u32 hash)
+                       const struct dt_it *di, __u64 hash)
 {
         struct osd_it *it = (struct osd_it *)di;

--
GitLab
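
For readers following the new comment in lmv_readpage(), here is a small standalone C sketch of the segment/rank arithmetic it describes. It is illustrative only and not part of the patch: the EXAMPLE_* macro and example_* helpers are invented names, plain 64-bit division stands in for the kernel's do_div(), and the wraparound correction behind the "if (tgt < tgt0)" branch as well as the DIR_END_OFF special case are omitted.

#include <stdint.h>
#include <stdio.h>

/* Mirrors MAX_HASH_SIZE from the patch (2^63 - 1). */
#define EXAMPLE_MAX_HASH 0x7fffffffffffffffULL

/*
 * Mirror of lmv_hash_adjust() from the patch, minus the DIR_END_OFF check:
 * map a hash returned by server 'tgt' back into the window ll_readdir()
 * expects on this client.
 */
static uint64_t example_hash_adjust(uint64_t val, uint64_t hash_adj)
{
        if (val < hash_adj)
                val += EXAMPLE_MAX_HASH;
        return val - hash_adj;
}

/*
 * Pick the server holding the directory segment that contains 'offset' and
 * compute the hash adjustment, following the seg_size / tgt0 / rank
 * arithmetic visible in the patched lmv_readpage().
 */
static int example_pick_target(uint64_t offset, int nr, int rank,
                               uint64_t *hash_adj, uint64_t *fwd_offset)
{
        uint64_t seg_size = EXAMPLE_MAX_HASH / (uint64_t)nr; /* seg_size = MAX_HASH_SIZE / nr */
        int tgt0 = (int)((offset / seg_size) % (uint64_t)nr); /* segment the offset falls in */
        int tgt = (tgt0 + rank) % nr;                         /* rotate the scan by client rank */

        *hash_adj = (uint64_t)rank * seg_size;                 /* hash_adj += rank * seg_size */
        *fwd_offset = (offset + *hash_adj) & EXAMPLE_MAX_HASH; /* offset forwarded to MDS 'tgt' */
        return tgt;
}

int main(void)
{
        uint64_t hash_adj, fwd;
        int tgt = example_pick_target(0x1234ULL, 4, 2, &hash_adj, &fwd);

        printf("tgt %d fwd %#llx back %#llx\n", tgt,
               (unsigned long long)fwd,
               (unsigned long long)example_hash_adjust(fwd, hash_adj));
        return 0;
}

With nr = 4 and rank = 2, an offset of 0x1234 lands in segment 0, is redirected to server 2 shifted up by two segments, and example_hash_adjust() maps the returned hashes back to 0x1234 in the client-visible window.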