From 61aa09e1ed8463ccda1f5d83d2c5aff8080a6116 Mon Sep 17 00:00:00 2001
From: Olaf Weber <olaf@sgi.com>
Date: Fri, 27 Jan 2017 16:13:53 +0100
Subject: [PATCH] LU-9119 socklnd: propagate errors on send failure

When an attempt to send a message fails, for example because no
connection could be established with the remote address, socklnd
drops the message. For a PUT or REPLY message with non-zero
payload, ksocknal_tx_done() calls lnet_finalize() with -EIO
as the error code. But for an ACK or GET message there is no
payload, and lnet_finalize() is called with 0 (no error) as the
error code. This leaves upper layers to rely on other means to
determine that sending the message did actually fail, and that
(for example) no REPLY will ever answer a failed GET.

Add an error code parameter to ksocknal_tx_done().

In ksocknal_txlist_done() change the 0/1 'error' indicator to be
an actual error code that is passed on to ksocknal_tx_done().
Update the callers of ksocknal_txlist_done() to pass in the error
code if they have encountered an error.

Test-Parameters: trivial
Signed-off-by: Olaf Weber <olaf@sgi.com>
Change-Id: I66b897a31e537e70dcc2622ffdfcc6e96fa93193
Reviewed-on: https://review.whamcloud.com/26691
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
---
 lnet/klnds/socklnd/socklnd.c    | 11 ++++--
 lnet/klnds/socklnd/socklnd.h    |  4 +--
 lnet/klnds/socklnd/socklnd_cb.c | 61 ++++++++++++++++-----------------
 3 files changed, 41 insertions(+), 35 deletions(-)

diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c
index 99f97816ef..d0e073ca5a 100644
--- a/lnet/klnds/socklnd/socklnd.c
+++ b/lnet/klnds/socklnd/socklnd.c
@@ -618,7 +618,7 @@ ksocknal_del_peer(struct lnet_ni *ni, struct lnet_process_id id, __u32 ip)
 
 	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
-	ksocknal_txlist_done(ni, &zombies, 1);
+	ksocknal_txlist_done(ni, &zombies, -ENETDOWN);
 
 	return rc;
 }
@@ -1030,6 +1030,7 @@ ksocknal_create_conn(struct lnet_ni *ni, ksock_route_t *route,
         ksock_tx_t        *tx;
         ksock_tx_t        *txtmp;
         int                rc;
+	int                rc2;
         int                active;
         char              *warn = NULL;
 
@@ -1384,7 +1385,13 @@ failed_2:
 		write_unlock_bh(global_lock);
         }
 
-        ksocknal_txlist_done(ni, &zombies, 1);
+	/*
+	 * If we get here without an error code, just use -EALREADY.
+	 * Depending on how we got here, the error may be positive
+	 * or negative. Normalize the value for ksocknal_txlist_done().
+	 */
+	rc2 = (rc == 0 ? -EALREADY : (rc > 0 ? -rc : rc));
+	ksocknal_txlist_done(ni, &zombies, rc2);
         ksocknal_peer_decref(peer_ni);
 
 failed_1:
diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h
index e3634c0344..544690b953 100644
--- a/lnet/klnds/socklnd/socklnd.h
+++ b/lnet/klnds/socklnd/socklnd.h
@@ -564,14 +564,14 @@ ksocknal_tx_addref (ksock_tx_t *tx)
 }
 
 extern void ksocknal_tx_prep (ksock_conn_t *, ksock_tx_t *tx);
-extern void ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx);
+extern void ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int error);
 
 static inline void
 ksocknal_tx_decref (ksock_tx_t *tx)
 {
 	LASSERT (atomic_read(&tx->tx_refcount) > 0);
 	if (atomic_dec_and_test(&tx->tx_refcount))
-		ksocknal_tx_done(NULL, tx);
+		ksocknal_tx_done(NULL, tx, 0);
 }
 
 static inline void
diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c
index 06bfe3e098..2da283add8 100644
--- a/lnet/klnds/socklnd/socklnd_cb.c
+++ b/lnet/klnds/socklnd/socklnd_cb.c
@@ -389,25 +389,24 @@ ksocknal_receive (ksock_conn_t *conn)
 }
 
 void
-ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx)
+ksocknal_tx_done(struct lnet_ni *ni, ksock_tx_t *tx, int rc)
 {
 	struct lnet_msg *lnetmsg = tx->tx_lnetmsg;
-        int          rc = (tx->tx_resid == 0 && !tx->tx_zc_aborted) ? 0 : -EIO;
         ENTRY;
 
-        LASSERT(ni != NULL || tx->tx_conn != NULL);
+	LASSERT(ni != NULL || tx->tx_conn != NULL);
 
-        if (tx->tx_conn != NULL)
-                ksocknal_conn_decref(tx->tx_conn);
+	if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted))
+		rc = -EIO;
 
-        if (ni == NULL && tx->tx_conn != NULL)
-                ni = tx->tx_conn->ksnc_peer->ksnp_ni;
+	if (tx->tx_conn != NULL)
+		ksocknal_conn_decref(tx->tx_conn);
 
 	ksocknal_free_tx(tx);
 	if (lnetmsg != NULL) /* KSOCK_MSG_NOOP go without lnetmsg */
 		lnet_finalize(lnetmsg, rc);
 
-        EXIT;
+	EXIT;
 }
 
 void
@@ -418,21 +417,21 @@ ksocknal_txlist_done(struct lnet_ni *ni, struct list_head *txlist, int error)
 	while (!list_empty(txlist)) {
 		tx = list_entry(txlist->next, ksock_tx_t, tx_list);
 
-                if (error && tx->tx_lnetmsg != NULL) {
-                        CNETERR("Deleting packet type %d len %d %s->%s\n",
-                                le32_to_cpu (tx->tx_lnetmsg->msg_hdr.type),
-                                le32_to_cpu (tx->tx_lnetmsg->msg_hdr.payload_length),
-                                libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
-                                libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
-                } else if (error) {
-                        CNETERR("Deleting noop packet\n");
-                }
+		if (error && tx->tx_lnetmsg != NULL) {
+			CNETERR("Deleting packet type %d len %d %s->%s\n",
+				le32_to_cpu(tx->tx_lnetmsg->msg_hdr.type),
+				le32_to_cpu(tx->tx_lnetmsg->msg_hdr.payload_length),
+				libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.src_nid)),
+				libcfs_nid2str(le64_to_cpu(tx->tx_lnetmsg->msg_hdr.dest_nid)));
+		} else if (error) {
+			CNETERR("Deleting noop packet\n");
+		}
 
 		list_del(&tx->tx_list);
 
-		LASSERT (atomic_read(&tx->tx_refcount) == 1);
-                ksocknal_tx_done (ni, tx);
-        }
+		LASSERT(atomic_read(&tx->tx_refcount) == 1);
+		ksocknal_tx_done(ni, tx, error);
+	}
 }
 
 static void
@@ -2000,9 +1999,9 @@ ksocknal_connect (ksock_route_t *route)
 
 	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
-        ksocknal_peer_failed(peer_ni);
-        ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, 1);
-        return 0;
+	ksocknal_peer_failed(peer_ni);
+	ksocknal_txlist_done(peer_ni->ksnp_ni, &zombies, rc);
+	return 0;
 }
 
 /*
@@ -2332,26 +2331,26 @@ ksocknal_find_timed_out_conn (ksock_peer_ni_t *peer_ni)
 static inline void
 ksocknal_flush_stale_txs(ksock_peer_ni_t *peer_ni)
 {
-        ksock_tx_t        *tx;
-	struct list_head        stale_txs = LIST_HEAD_INIT(stale_txs);
+	ksock_tx_t	  *tx;
+	struct list_head	stale_txs = LIST_HEAD_INIT(stale_txs);
 
 	write_lock_bh(&ksocknal_data.ksnd_global_lock);
 
 	while (!list_empty(&peer_ni->ksnp_tx_queue)) {
 		tx = list_entry(peer_ni->ksnp_tx_queue.next,
-                                     ksock_tx_t, tx_list);
+				     ksock_tx_t, tx_list);
 
-                if (!cfs_time_aftereq(cfs_time_current(),
-                                      tx->tx_deadline))
-                        break;
+		if (!cfs_time_aftereq(cfs_time_current(),
+				      tx->tx_deadline))
+			break;
 
 		list_del(&tx->tx_list);
 		list_add_tail(&tx->tx_list, &stale_txs);
-        }
+	}
 
 	write_unlock_bh(&ksocknal_data.ksnd_global_lock);
 
-        ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, 1);
+	ksocknal_txlist_done(peer_ni->ksnp_ni, &stale_txs, -ETIMEDOUT);
 }
 
 static int
-- 
GitLab