From a4c7545f6e77229a3eabe537eb9ed161ff3c88ee Mon Sep 17 00:00:00 2001
From: Andreas Dilger <andreas.dilger@intel.com>
Date: Fri, 30 Jun 2017 14:37:07 -0600
Subject: [PATCH] LU-9728 osd: use GFP_HIGHUSER for non-local IO

When the obdfilter code was split into separate OFD and OSD modules,
the bulk IO page allocation was implemented to use GFP_NOFS to avoid
allocations recursing into the filesystem and causing deadlocks.

However, such a deadlock is only possible if the RPC is coming from a
local client, as we might end up waiting on a page sent in the request
we're serving. Local RPCs keep using GFP_NOFS, with __GFP_HIGHMEM added
so that the pages can use all of the available memory on the OSS on
32-bit machines.

It is possible to use more aggressive GFP_HIGHUSER flags for non-local
clients to be able to generate more memory pressure on the OSS and
allow inactive pages to be reclaimed, since the OSS doesn't have any
other processes or allocations that generate memory reclaim pressure.

See also b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details.

The patch also implements an LNet function to determine if a client NID
is local or not.  This becomes more complex in the LNet Multi-Rail world
and it is really LNet's job to handle NIDs, not that of Lustre.

Lustre-change: https://review.whamcloud.com/27908
Lustre-commit: b0ab95d6133e783acacc6329c025d17fb282775e

Signed-off-by: Andreas Dilger <andreas.dilger@intel.com>
Change-Id: I2806c9c5c2fe269669eafdafaf2310924c3ebbe5
Reviewed-by: Dmitry Eremin <dmitry.eremin@intel.com>
Reviewed-by: Patrick Farrell <paf@cray.com>
Signed-off-by: Minh Diep <minh.diep@intel.com>
Reviewed-on: https://review.whamcloud.com/28318
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: John L. Hammond <john.hammond@intel.com>
---
 lnet/include/lnet/api.h     |  1 +
 lnet/lnet/api-ni.c          | 29 +++++++++++++++++++++++++++++
 lustre/include/dt_object.h  | 11 +++++++++--
 lustre/include/lustre_net.h | 24 ++++++++++++++++++++++++
 lustre/ofd/ofd_dev.c        | 22 +++++++++++++---------
 lustre/ofd/ofd_io.c         | 26 ++++++++++++++++----------
 lustre/osd-ldiskfs/osd_io.c | 25 ++++++++++++++++---------
 lustre/osd-zfs/osd_io.c     | 12 ++++++------
 lustre/target/tgt_handler.c |  2 +-
 9 files changed, 115 insertions(+), 37 deletions(-)

diff --git a/lnet/include/lnet/api.h b/lnet/include/lnet/api.h
index 4fa2f66f0b..84c6bd0039 100644
--- a/lnet/include/lnet/api.h
+++ b/lnet/include/lnet/api.h
@@ -78,6 +78,7 @@ int LNetNIFini(void);
 int LNetGetId(unsigned int index, struct lnet_process_id *id);
 int LNetDist(lnet_nid_t nid, lnet_nid_t *srcnid, __u32 *order);
 lnet_nid_t LNetPrimaryNID(lnet_nid_t nid);
+bool LNetIsPeerLocal(lnet_nid_t nid);
 
 /** @} lnet_addr */
 
diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c
index e912e58eac..aac37c093a 100644
--- a/lnet/lnet/api-ni.c
+++ b/lnet/lnet/api-ni.c
@@ -2933,6 +2933,35 @@ void LNetDebugPeer(struct lnet_process_id id)
 }
 EXPORT_SYMBOL(LNetDebugPeer);
 
+/**
+ * Check whether \a nid is the NID of one of this node's own interfaces.
+ *
+ * \param nid	NID of the peer being examined
+ *
+ * \retval true		an NI on one of the_lnet.ln_nets carries \a nid
+ * \retval false	no NI on any of the_lnet.ln_nets carries \a nid
+ */
+bool LNetIsPeerLocal(lnet_nid_t nid)
+{
+	struct lnet_net *net;
+	struct lnet_ni *ni;
+	bool local = false;
+	int cpt = lnet_net_lock_current();
+
+	list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+		list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+			if (ni->ni_nid != nid)
+				continue;
+			local = true;
+			goto unlock;
+		}
+	}
+unlock:
+	lnet_net_unlock(cpt);
+	return local;
+}
+EXPORT_SYMBOL(LNetIsPeerLocal);
+
 /**
  * Retrieve the struct lnet_process_id ID of LNet interface at \a index.
  * Note that all interfaces share a same PID, as requested by LNetNIInit().
diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h
index c79cc0263a..436139fa12 100644
--- a/lustre/include/dt_object.h
+++ b/lustre/include/dt_object.h
@@ -1060,6 +1060,13 @@ struct dt_object_operations {
 				const struct lu_buf *buf, struct thandle *th);
 };
 
+enum dt_bufs_type {
+	DT_BUFS_TYPE_READ	= 0x0000,	/* read; no flag bits set */
+	DT_BUFS_TYPE_WRITE	= 0x0001,	/* buffers are for a write */
+	DT_BUFS_TYPE_READAHEAD	= 0x0002,	/* ladvise prefetch read */
+	DT_BUFS_TYPE_LOCAL	= 0x0004,	/* peer is on the local node */
+};
+
 /**
  * Per-dt-object operations on "file body" - unstructure raw data.
  */
@@ -1177,7 +1184,7 @@ struct dt_body_operations {
 			    loff_t pos,
 			    ssize_t len,
 			    struct niobuf_local *lb,
-			    int rw);
+			    enum dt_bufs_type rw);
 
 	/**
 	 * Release reference granted by ->dbo_bufs_get().
@@ -2379,7 +2386,7 @@ static inline int dt_ref_del(const struct lu_env *env,
 
 static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d,
 			      struct niobuf_remote *rnb,
-			      struct niobuf_local *lnb, int rw)
+			      struct niobuf_local *lnb, enum dt_bufs_type rw)
 {
 	LASSERT(d);
 	LASSERT(d->do_body_ops);
diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h
index 8c918afc3d..2e342ee4d3 100644
--- a/lustre/include/lustre_net.h
+++ b/lustre/include/lustre_net.h
@@ -2015,6 +2015,30 @@ int ptlrpc_connection_init(void);
 void ptlrpc_connection_fini(void);
 extern lnet_pid_t ptl_get_pid(void);
 
+/*
+ * Check if the peer of \a conn is on the local node: false for a NULL
+ * \a conn, true if the peer NID equals c_self, else ask LNetIsPeerLocal().
+ *
+ * Requests from a local client need GFP_NOFS to avoid recursing into the
+ * filesystem, as we might end up waiting on a page sent in the request
+ * we're serving; __GFP_HIGHMEM lets those pages use all of the available
+ * memory on 32-bit machines.  Non-local clients can use the more aggressive
+ * GFP_HIGHUSER flags to generate more memory pressure on the OSS and allow
+ * inactive pages to be reclaimed, since the OSS doesn't have any other
+ * processes or allocations that generate memory reclaim pressure.
+ * See b=17576 (bdf50dc9) and b=19529 (3dcf18d3) for details.
+ */
+static inline bool ptlrpc_connection_is_local(struct ptlrpc_connection *conn)
+{
+	if (!conn)
+		return false;
+
+	if (conn->c_peer.nid == conn->c_self)
+		return true;
+
+	return LNetIsPeerLocal(conn->c_peer.nid);
+}
+
 /* ptlrpc/niobuf.c */
 /**
  * Actual interfacing with LNet to put/get/register/unregister stuff
diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c
index 2476f9e1f5..aecaa8edc0 100644
--- a/lustre/ofd/ofd_dev.c
+++ b/lustre/ofd/ofd_dev.c
@@ -2116,13 +2116,13 @@ out:
 static int ofd_ladvise_prefetch(const struct lu_env *env,
 				struct ofd_object *fo,
 				struct niobuf_local *lnb,
-				__u64 start, __u64 end)
+				__u64 start, __u64 end, enum dt_bufs_type dbt)
 {
-	struct ofd_thread_info	*info = ofd_info(env);
-	pgoff_t			 start_index, end_index, pages;
-	struct niobuf_remote	 rnb;
-	unsigned long		 nr_local;
-	int			 rc = 0;
+	struct ofd_thread_info *info = ofd_info(env);
+	pgoff_t start_index, end_index, pages;
+	struct niobuf_remote rnb;
+	unsigned long nr_local;
+	int rc = 0;
 
 	if (end <= start)
 		RETURN(-EINVAL);
@@ -2150,7 +2150,7 @@ static int ofd_ladvise_prefetch(const struct lu_env *env,
 			PTLRPC_MAX_BRW_PAGES;
 		rnb.rnb_offset = start_index << PAGE_SHIFT;
 		rnb.rnb_len = nr_local << PAGE_SHIFT;
-		rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, 0);
+		rc = dt_bufs_get(env, ofd_object_child(fo), &rnb, lnb, dbt);
 		if (unlikely(rc < 0))
 			break;
 		nr_local = rc;
@@ -2188,7 +2188,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi)
 	struct ptlrpc_thread *svc_thread = req->rq_svc_thread;
 	const struct lu_env *env = svc_thread->t_env;
 	struct tgt_thread_big_cache *tbc = svc_thread->t_data;
-	int rc = 0;
+	enum dt_bufs_type dbt = DT_BUFS_TYPE_READAHEAD;
 	struct lu_ladvise *ladvise;
 	int num_advise;
 	struct ladvise_hdr *ladvise_hdr;
@@ -2199,6 +2199,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi)
 	struct dt_object *dob;
 	__u64 start;
 	__u64 end;
+	int rc = 0;
 	ENTRY;
 
 	CFS_FAIL_TIMEOUT(OBD_FAIL_OST_LADVISE_PAUSE, cfs_fail_val);
@@ -2247,6 +2248,9 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi)
 	LASSERT(fo != NULL);
 	dob = ofd_object_child(fo);
 
+	if (ptlrpc_connection_is_local(exp->exp_connection))
+		dbt |= DT_BUFS_TYPE_LOCAL;
+
 	for (i = 0; i < num_advise; i++, ladvise++) {
 		start = ladvise->lla_start;
 		end = ladvise->lla_end;
@@ -2274,7 +2278,7 @@ static int ofd_ladvise_hdl(struct tgt_session_info *tsi)
 
 			req->rq_status = ofd_ladvise_prefetch(env, fo,
 							      tbc->local,
-							      start, end);
+							      start, end, dbt);
 			tgt_extent_unlock(&lockh, LCK_PR);
 			break;
 		case LU_LADVISE_DONTNEED:
diff --git a/lustre/ofd/ofd_io.c b/lustre/ofd/ofd_io.c
index 5ccdca5503..53136558a5 100644
--- a/lustre/ofd/ofd_io.c
+++ b/lustre/ofd/ofd_io.c
@@ -453,8 +453,9 @@ static int ofd_preprw_read(const struct lu_env *env, struct obd_export *exp,
 			   struct niobuf_remote *rnb, int *nr_local,
 			   struct niobuf_local *lnb, char *jobid)
 {
-	struct ofd_object	*fo;
-	int			 i, j, rc, tot_bytes = 0;
+	struct ofd_object *fo;
+	int i, j, rc, tot_bytes = 0;
+	enum dt_bufs_type dbt = DT_BUFS_TYPE_READ;
 
 	ENTRY;
 	LASSERT(env != NULL);
@@ -474,10 +475,12 @@ static int ofd_preprw_read(const struct lu_env *env, struct obd_export *exp,
 			GOTO(unlock, rc);
 	}
 
-	*nr_local = 0;
-	for (i = 0, j = 0; i < niocount; i++) {
+	if (ptlrpc_connection_is_local(exp->exp_connection))
+		dbt |= DT_BUFS_TYPE_LOCAL;
+
+	for (*nr_local = 0, i = 0, j = 0; i < niocount; i++) {
 		rc = dt_bufs_get(env, ofd_object_child(fo), rnb + i,
-				 lnb + j, 0);
+				 lnb + j, dbt);
 		if (unlikely(rc < 0))
 			GOTO(buf_put, rc);
 		LASSERT(rc <= PTLRPC_MAX_BRW_PAGES);
@@ -538,8 +541,9 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp,
 			    struct niobuf_remote *rnb, int *nr_local,
 			    struct niobuf_local *lnb, char *jobid)
 {
-	struct ofd_object	*fo;
-	int			 i, j, k, rc = 0, tot_bytes = 0;
+	struct ofd_object *fo;
+	int i, j, k, rc = 0, tot_bytes = 0;
+	enum dt_bufs_type dbt = DT_BUFS_TYPE_WRITE;
 
 	ENTRY;
 	LASSERT(env != NULL);
@@ -628,11 +632,13 @@ static int ofd_preprw_write(const struct lu_env *env, struct obd_export *exp,
 	 * space back if possible */
 	tgt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt);
 
+	if (ptlrpc_connection_is_local(exp->exp_connection))
+		dbt |= DT_BUFS_TYPE_LOCAL;
+
 	/* parse remote buffers to local buffers and prepare the latter */
-	*nr_local = 0;
-	for (i = 0, j = 0; i < obj->ioo_bufcnt; i++) {
+	for (*nr_local = 0, i = 0, j = 0; i < obj->ioo_bufcnt; i++) {
 		rc = dt_bufs_get(env, ofd_object_child(fo),
-				 rnb + i, lnb + j, 1);
+				 rnb + i, lnb + j, dbt);
 		if (unlikely(rc < 0))
 			GOTO(err, rc);
 		LASSERT(rc <= PTLRPC_MAX_BRW_PAGES);
diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c
index c3545a4064..56e5231a61 100644
--- a/lustre/osd-ldiskfs/osd_io.c
+++ b/lustre/osd-ldiskfs/osd_io.c
@@ -421,16 +421,18 @@ static int osd_map_remote_to_local(loff_t offset, ssize_t len, int *nrpages,
         RETURN(0);
 }
 
-static struct page *osd_get_page(struct dt_object *dt, loff_t offset, int rw)
+static struct page *osd_get_page(struct dt_object *dt, loff_t offset,
+				 gfp_t gfp_mask)
 {
-        struct inode      *inode = osd_dt_obj(dt)->oo_inode;
-        struct osd_device *d = osd_obj2dev(osd_dt_obj(dt));
-        struct page       *page;
+	struct inode *inode = osd_dt_obj(dt)->oo_inode;
+	struct osd_device *d = osd_obj2dev(osd_dt_obj(dt));
+	struct page *page;
 
         LASSERT(inode);
 
 	page = find_or_create_page(inode->i_mapping, offset >> PAGE_SHIFT,
-                                   GFP_NOFS | __GFP_HIGHMEM);
+				   gfp_mask);
+
         if (unlikely(page == NULL))
                 lprocfs_counter_add(d->od_stats, LPROC_OSD_NO_PAGE, 1);
 
@@ -504,7 +506,7 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
  * \param pos		byte offset of IO start
  * \param len		number of bytes of IO
  * \param lnb		array of extents undergoing IO
- * \param rw		read or write operation?
+ * \param rw		read or write operation, and other flags
  * \param capa		capabilities
  *
  * \retval pages	(zero or more) loaded successfully
@@ -512,17 +514,22 @@ static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
  */
 static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
 			loff_t pos, ssize_t len, struct niobuf_local *lnb,
-			int rw)
+			enum dt_bufs_type rw)
 {
-	struct osd_object   *obj    = osd_dt_obj(dt);
+	struct osd_object *obj = osd_dt_obj(dt);
 	int npages, i, rc = 0;
+	gfp_t gfp_mask;
 
 	LASSERT(obj->oo_inode);
 
 	osd_map_remote_to_local(pos, len, &npages, lnb);
 
+	/* this could also try less hard for DT_BUFS_TYPE_READAHEAD pages */
+	gfp_mask = rw & DT_BUFS_TYPE_LOCAL ? (GFP_NOFS | __GFP_HIGHMEM) :
+					     GFP_HIGHUSER;
 	for (i = 0; i < npages; i++, lnb++) {
-		lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset, rw);
+		lnb->lnb_page = osd_get_page(dt, lnb->lnb_file_offset,
+					     gfp_mask);
 		if (lnb->lnb_page == NULL)
 			GOTO(cleanup, rc = -ENOMEM);
 
diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c
index 082d7ddef3..41e6ee7d48 100644
--- a/lustre/osd-zfs/osd_io.c
+++ b/lustre/osd-zfs/osd_io.c
@@ -313,7 +313,7 @@ static inline struct page *kmem_to_page(void *addr)
  * \retval		negative error number of failure
  */
 static int osd_bufs_get_read(const struct lu_env *env, struct osd_object *obj,
-				loff_t off, ssize_t len, struct niobuf_local *lnb)
+			     loff_t off, ssize_t len, struct niobuf_local *lnb)
 {
 	struct osd_device *osd = osd_obj2dev(obj);
 	unsigned long	   start = cfs_time_current();
@@ -420,7 +420,7 @@ static inline arc_buf_t *osd_request_arcbuf(dnode_t *dn, size_t bs)
 }
 
 static int osd_bufs_get_write(const struct lu_env *env, struct osd_object *obj,
-				loff_t off, ssize_t len, struct niobuf_local *lnb)
+			      loff_t off, ssize_t len, struct niobuf_local *lnb)
 {
 	struct osd_device *osd = osd_obj2dev(obj);
 	int                plen, off_in_block, sz_in_block;
@@ -525,7 +525,7 @@ out_err:
 
 static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
 			loff_t offset, ssize_t len, struct niobuf_local *lnb,
-			int rw)
+			enum dt_bufs_type rw)
 {
 	struct osd_object *obj  = osd_dt_obj(dt);
 	int                rc;
@@ -533,10 +533,10 @@ static int osd_bufs_get(const struct lu_env *env, struct dt_object *dt,
 	LASSERT(dt_object_exists(dt));
 	LASSERT(obj->oo_dn);
 
-	if (rw == 0)
-		rc = osd_bufs_get_read(env, obj, offset, len, lnb);
-	else
+	if (rw & DT_BUFS_TYPE_WRITE)
 		rc = osd_bufs_get_write(env, obj, offset, len, lnb);
+	else
+		rc = osd_bufs_get_read(env, obj, offset, len, lnb);
 
 	return rc;
 }
diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c
index 5ceaaf1d5f..24c37f2321 100644
--- a/lustre/target/tgt_handler.c
+++ b/lustre/target/tgt_handler.c
@@ -2183,7 +2183,7 @@ int tgt_brw_write(struct tgt_session_info *tsi)
 		RETURN(err_serious(-EPROTO));
 
 	if ((remote_nb[0].rnb_flags & OBD_BRW_MEMALLOC) &&
-	    (exp->exp_connection->c_peer.nid == exp->exp_connection->c_self))
+	    ptlrpc_connection_is_local(exp->exp_connection))
 		memory_pressure_set();
 
 	req_capsule_set_size(&req->rq_pill, &RMF_RCS, RCL_SERVER,
-- 
GitLab