diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h index 4fa2f66f0b3d9a49aca754b31356933ee14ffdf9..84c6bd0039632247d09a951560655f8f1cc0ea83 100644 --- a/lnet/include/lnet/api.h +++ b/lnet/include/lnet/api.h @@ -78,6 +78,7 @@ int LNetNIFini(void); int LNetGetId(unsigned int index, struct lnet_process_id *id); int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order); lnet_nid_t LNetPrimaryNID(lnet_nid_t nid); +bool LNetIsPeerLocal(lnet_nid_t nid); /** @} lnet_addr */ diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index e912e58eaca519328ef8667f23d2142d8db17da1..aac37c093aad3e8f872752c99a9293c8b6ab247a 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -2933,6 +2933,35 @@ void LNetDebugPeer(struct lnet_process_id id) } EXPORT_SYMBOL(LNetDebugPeer); +/** + * Determine if the specified peer \a nid is on the local node. + * + * \param nid peer nid to check against ni_nid of every NI on every local net + * + * \retval true If \a nid is the NID of an NI on a net in the_lnet.ln_nets. + * \retval false If no NI on any net in the_lnet.ln_nets has that NID. + */ +bool LNetIsPeerLocal(lnet_nid_t nid) +{ + struct lnet_net *net; + struct lnet_ni *ni; + int cpt; + + cpt = lnet_net_lock_current(); + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (ni->ni_nid == nid) { + lnet_net_unlock(cpt); + return true; + } + } + } + lnet_net_unlock(cpt); + + return false; +} +EXPORT_SYMBOL(LNetIsPeerLocal); + /** * Retrieve the struct lnet_process_id ID of LNet interface at \a index. * Note that all interfaces share a same PID, as requested by LNetNIInit(). 
diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h index c79cc0263a606a89f25f06df8f30f779afd5ef4d..436139fa128c30385e3b40c4e5ea818e56f63c31 100644 --- a/lustre/include/dt_object.h +++ b/lustre/include/dt_object.h @@ -1060,6 +1060,13 @@ struct dt_object_operations { const struct lu_buf *buf, struct thandle *th); }; +enum dt_bufs_type { + DT_BUFS_TYPE_READ = 0x0000, /* no bits set: read I/O */ + DT_BUFS_TYPE_WRITE = 0x0001, /* write I/O */ + DT_BUFS_TYPE_READAHEAD = 0x0002, /* prefetch, e.g. from ladvise */ + DT_BUFS_TYPE_LOCAL = 0x0004, /* peer is on the local node */ +}; + /** * Per-dt-object operations on "file body" - unstructure raw data. */ @@ -1177,7 +1184,7 @@ struct dt_body_operations { loff_t pos, ssize_t len, struct niobuf_local *lb, - int rw); + enum dt_bufs_type rw); /** * Release reference granted by ->dbo_bufs_get(). @@ -2379,7 +2386,7 @@ static inline int dt_ref_del(const struct lu_env *env, static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, struct niobuf_remote *rnb, - struct niobuf_local *lnb, int rw) + struct niobuf_local *lnb, enum dt_bufs_type rw) { LASSERT(d); LASSERT(d->do_body_ops); diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 8c918afc3d170c821574c1f58a51265c39d73e59..2e342ee4d3b12fe0a5801f58ac2d8a6dd8dc9d7a 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -2015,6 +2015,30 @@ int ptlrpc_connection_init(void); void ptlrpc_connection_fini(void); extern lnet_pid_t ptl_get_pid(void); +/* + * Check if the peer connection is on the local node (NULL \a conn is not). + * We need to use GFP_NOFS for requests from a local client to avoid recursing + * into the filesystem and waiting on a page sent in the request being served. + * + * Use __GFP_HIGHMEM so that the pages can use all of the available memory + * on 32-bit machines. 
Use more aggressive GFP_HIGHUSER flags from non-local + * clients to be able to generate more memory pressure on the OSS and allow + * inactive pages to be reclaimed, since it doesn't have any other processes + * or allocations that generate memory reclaim pressure. + * + * See b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details. + */ +static inline bool ptlrpc_connection_is_local(struct ptlrpc_connection *conn) +{ + if (!conn) + return false; + /* same NID on both ends: the connection loops back to this node */ + if (conn->c_peer.nid == conn->c_self) + return true; + /* otherwise ask LNet whether the peer NID is one of the local NIDs */ + return LNetIsPeerLocal(conn->c_peer.nid); +} + /* ptlrpc/niobuf.c */ /** * Actual interfacing with LNet to put/get/register/unregister stuff diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index 2476f9e1f599382e8bb4a0f7a29ee5a5b80865d0..aecaa8edc0f830b5504ea888bbdc854d935f51dc 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -2116,13 +2116,13 @@ out: static int ofd_ladvise_prefetch(const struct lu_env *env, struct ofd_object *fo, struct niobuf_local *lnb, - __u64 start, __u64 end) + __u64 start, __u64 end, enum dt_bufs_type dbt) { - struct ofd_thread_info *info = ofd_info(env); - pgoff_t start_index, end_index, pages; - struct niobuf_remote rnb; - unsigned long nr_local; - int rc = 0; + struct ofd_thread_info *info = ofd_info(env); + pgoff_t start_index, end_index, pages; + struct niobuf_remote rnb; + unsigned long nr_local; + int rc = 0; if (end <= start) RETURN(-EINVAL); @@ -2150,7 +2150,7 @@ static int ofd_ladvise_prefetch(const struct lu_env *env, PTLRPC_MAX_BRW_PAGES; rnb.rnb_offset = start_index << PAGE_SHIFT; rnb.rnb_len = nr_local << PAGE_SHIFT; - rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, 0); + rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, dbt); if (unlikely(rc < 0)) break; nr_local = rc; @@ -2188,7 +2188,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) struct ptlrpc_thread *svc_thread = req->rq_svc_thread; const struct lu_env *env = svc_thread->t_env; struct tgt_thread_big_cache *tbc = 
svc_thread->t_data; - int rc = 0; + enum dt_bufs_type dbt = DT_BUFS_TYPE_READAHEAD; struct lu_ladvise *ladvise; int num_advise; struct ladvise_hdr *ladvise_hdr; @@ -2199,6 +2199,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) struct dt_object *dob; __u64 start; __u64 end; + int rc = 0; ENTRY; CFS_FAIL_TIMEOUT(OBD_FAIL_OST_LADVISE_PAUSE, cfs_fail_val); @@ -2247,6 +2248,9 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) LASSERT(fo != NULL); dob = ofd_object_child(fo); + if (ptlrpc_connection_is_local(exp->exp_connection)) + dbt |= DT_BUFS_TYPE_LOCAL; + for (i = 0; i < num_advise; i++, ladvise++) { start = ladvise->lla_start; end = ladvise->lla_end; @@ -2274,7 +2278,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi) req->rq_status = ofd_ladvise_prefetch(env, fo, tbc->local, - start, end); + start, end, dbt); tgt_extent_unlock(&lockh, LCK_PR); break; case LU_LADVISE_DONTNEED: diff --git a/lustre/ofd/ofd_io.c b/lustre/ofd/ofd_io.c index 5ccdca550394c19e124004260266a45e94e0a4de..53136558a5f634a78ddfd5f055866485e34e4ead 100644 --- a/lustre/ofd/ofd_io.c +++ b/lustre/ofd/ofd_io.c @@ -453,8 +453,9 @@ static int ofd_preprw_read(const struct lu_env *env, struct obd_export *exp, struct niobuf_remote *rnb, int *nr_local, struct niobuf_local *lnb, char *jobid) { - struct ofd_object *fo; - int i, j, rc, tot_bytes = 0; + struct ofd_object *fo; + int i, j, rc, tot_bytes = 0; + enum dt_bufs_type dbt = DT_BUFS_TYPE_READ; ENTRY; LASSERT(env != NULL); @@ -474,10 +475,12 @@ static int ofd_preprw_read(const struct lu_env *env, struct obd_export *exp, GOTO(unlock, rc); } - *nr_local = 0; - for (i = 0, j = 0; i < niocount; i++) { + if (ptlrpc_connection_is_local(exp->exp_connection)) + dbt |= DT_BUFS_TYPE_LOCAL; + + for (*nr_local = 0, i = 0, j = 0; i < niocount; i++) { rc = dt_bufs_get(env, ofd_object_child(fo), rnb + i, - lnb + j, 0); + lnb + j, dbt); if (unlikely(rc < 0)) GOTO(buf_put, rc); LASSERT(rc <= PTLRPC_MAX_BRW_PAGES); @@ -538,8 +541,9 @@ 
static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp, struct niobuf_remote *rnb, int *nr_local, struct niobuf_local *lnb, char *jobid) { - struct ofd_object *fo; - int i, j, k, rc = 0, tot_bytes = 0; + struct ofd_object *fo; + int i, j, k, rc = 0, tot_bytes = 0; + enum dt_bufs_type dbt = DT_BUFS_TYPE_WRITE; ENTRY; LASSERT(env != NULL); @@ -628,11 +632,13 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp, * space back if possible */ tgt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt); + if (ptlrpc_connection_is_local(exp->exp_connection)) + dbt |= DT_BUFS_TYPE_LOCAL; + /* parse remote buffers to local buffers and prepare the latter */ - *nr_local = 0; - for (i = 0, j = 0; i < obj->ioo_bufcnt; i++) { + for (*nr_local = 0, i = 0, j = 0; i < obj->ioo_bufcnt; i++) { rc = dt_bufs_get(env, ofd_object_child(fo), - rnb + i, lnb + j, 1); + rnb + i, lnb + j, dbt); if (unlikely(rc < 0)) GOTO(err, rc); LASSERT(rc <= PTLRPC_MAX_BRW_PAGES); diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index c3545a40643bcc1c5a6b488ccc2cfac1e0c2dec2..56e5231a61603ac8e5732645d0afae7916e833d3 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -421,16 +421,18 @@ static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages, RETURN(0); } -static struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw) +static struct page *osd_get_page(struct dt_object *dt, loff_t offset, + gfp_t gfp_mask) { - struct inode *inode = osd_dt_obj(dt)->oo_inode; - struct osd_device *d = osd_obj2dev(osd_dt_obj(dt)); - struct page *page; + struct inode *inode = osd_dt_obj(dt)->oo_inode; + struct osd_device *d = osd_obj2dev(osd_dt_obj(dt)); + struct page *page; LASSERT(inode); page = find_or_create_page(inode->i_mapping, offset >> PAGE_SHIFT, - GFP_NOFS | __GFP_HIGHMEM); + gfp_mask); + if (unlikely(page == NULL)) lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1); @@ -504,7 +506,7 @@ 
static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, * \param pos byte offset of IO start * \param len number of bytes of IO * \param lnb array of extents undergoing IO - * \param rw read or write operation? + * \param rw read or write operation, and other flags * \param capa capabilities * * \retval pages (zero or more) loaded successfully @@ -512,17 +514,22 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt, */ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, loff_t pos, ssize_t len, struct niobuf_local *lnb, - int rw) + enum dt_bufs_type rw) { - struct osd_object *obj = osd_dt_obj(dt); + struct osd_object *obj = osd_dt_obj(dt); int npages, i, rc = 0; + gfp_t gfp_mask; LASSERT(obj->oo_inode); osd_map_remote_to_local(pos, len, &npages, lnb); + /* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */ + gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? (GFP_NOFS | __GFP_HIGHMEM) : + GFP_HIGHUSER; for (i = 0; i < npages; i++, lnb++) { - lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset, rw); + lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset, + gfp_mask); if (lnb->lnb_page == NULL) GOTO(cleanup, rc = -ENOMEM); diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index 082d7ddef3fb8c0d7e8113c53c9155a9e418b7bf..41e6ee7d48afe525dd16a628c00fcc42dc8affc8 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -313,7 +313,7 @@ static inline struct page *kmem_to_page(void *addr) * \retval negative error number of failure */ static int osd_bufs_get_read(const struct lu_env *env, struct osd_object *obj, - loff_t off, ssize_t len, struct niobuf_local *lnb) + loff_t off, ssize_t len, struct niobuf_local *lnb) { struct osd_device *osd = osd_obj2dev(obj); unsigned long start = cfs_time_current(); @@ -420,7 +420,7 @@ static inline arc_buf_t *osd_request_arcbuf(dnode_t *dn, size_t bs) } static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj, - loff_t off, ssize_t 
len, struct niobuf_local *lnb) + loff_t off, ssize_t len, struct niobuf_local *lnb) { struct osd_device *osd = osd_obj2dev(obj); int plen, off_in_block, sz_in_block; @@ -525,7 +525,7 @@ out_err: static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, loff_t offset, ssize_t len, struct niobuf_local *lnb, - int rw) + enum dt_bufs_type rw) { struct osd_object *obj = osd_dt_obj(dt); int rc; @@ -533,10 +533,10 @@ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt, LASSERT(dt_object_exists(dt)); LASSERT(obj->oo_dn); - if (rw == 0) - rc = osd_bufs_get_read(env, obj, offset, len, lnb); - else + if (rw & DT_BUFS_TYPE_WRITE) rc = osd_bufs_get_write(env, obj, offset, len, lnb); + else + rc = osd_bufs_get_read(env, obj, offset, len, lnb); return rc; } diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index 5ceaaf1d5f666fd346a2d42be89379f34aa241eb..24c37f2321d87d1f86438a361dab06eb39afcd05 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -2183,7 +2183,7 @@ int tgt_brw_write(struct tgt_session_info *tsi) RETURN(err_serious(-EPROTO)); if ((remote_nb[0].rnb_flags & OBD_BRW_MEMALLOC) && - (exp->exp_connection->c_peer.nid == exp->exp_connection->c_self)) + ptlrpc_connection_is_local(exp->exp_connection)) memory_pressure_set(); req_capsule_set_size(&req->rq_pill, &RMF_RCS, RCL_SERVER,