From 3a092d5b13bc0a86a8ffede1ce47f4a92c18fe2b Mon Sep 17 00:00:00 2001 From: isaac <isaac> Date: Fri, 14 Mar 2008 18:18:06 +0000 Subject: [PATCH] b=14425 i=liangzhen, i=maxim - fixed a deadlock in o2iblnd/ptllnd credit flow. --- lnet/ChangeLog | 6 +++ lnet/klnds/o2iblnd/o2iblnd.c | 11 +++-- lnet/klnds/o2iblnd/o2iblnd.h | 31 ++++++++++-- lnet/klnds/o2iblnd/o2iblnd_cb.c | 63 ++++++++++++++---------- lnet/klnds/ptllnd/ptllnd.c | 6 +++ lnet/klnds/ptllnd/ptllnd.h | 7 ++- lnet/klnds/ptllnd/ptllnd_cb.c | 2 +- lnet/klnds/ptllnd/ptllnd_peer.c | 82 +++++++++++++++++++++---------- lnet/klnds/ptllnd/ptllnd_rx_buf.c | 20 ++++++-- 9 files changed, 164 insertions(+), 64 deletions(-) diff --git a/lnet/ChangeLog b/lnet/ChangeLog index 983b21b47e..6aff8fefa0 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -11,6 +11,12 @@ tbd Sun Microsystems, Inc. gmlnd - GM 2.1.22 and later, mxlnd - MX 1.2.1 or later, ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x + +Severity : major +Bugzilla : 14425 +Description: o2iblnd/ptllnd credit deadlock in a routed config. +Details : NOOPs now have their own send queue with the last send credit reserved for them, and the credit a NOOP consumes is returned on receipt.
+ Severity : normal Bugzilla : 14956 Description: High load after starting lnet diff --git a/lnet/klnds/o2iblnd/o2iblnd.c b/lnet/klnds/o2iblnd/o2iblnd.c index e5369ff6f6..b8a994a412 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.c +++ b/lnet/klnds/o2iblnd/o2iblnd.c @@ -598,6 +598,10 @@ kiblnd_debug_conn (kib_conn_t *conn) list_for_each(tmp, &conn->ibc_early_rxs) kiblnd_debug_rx(list_entry(tmp, kib_rx_t, rx_list)); + CDEBUG(D_CONSOLE, " tx_noops:\n"); + list_for_each(tmp, &conn->ibc_tx_noops) + kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); + CDEBUG(D_CONSOLE, " tx_queue_nocred:\n"); list_for_each(tmp, &conn->ibc_tx_queue_nocred) kiblnd_debug_tx(list_entry(tmp, kib_tx_t, tx_list)); @@ -666,6 +670,7 @@ kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, int state) conn->ibc_cmid = cmid; INIT_LIST_HEAD(&conn->ibc_early_rxs); + INIT_LIST_HEAD(&conn->ibc_tx_noops); INIT_LIST_HEAD(&conn->ibc_tx_queue); INIT_LIST_HEAD(&conn->ibc_tx_queue_rsrvd); INIT_LIST_HEAD(&conn->ibc_tx_queue_nocred); @@ -741,9 +746,8 @@ kiblnd_create_conn (kib_peer_t *peer, struct rdma_cm_id *cmid, int state) memset(init_qp_attr, 0, sizeof(*init_qp_attr)); init_qp_attr->event_handler = kiblnd_qp_event; init_qp_attr->qp_context = conn; - init_qp_attr->cap.max_send_wr = (*kiblnd_tunables.kib_concurrent_sends) * - (1 + IBLND_MAX_RDMA_FRAGS); - init_qp_attr->cap.max_recv_wr = IBLND_RX_MSGS; + init_qp_attr->cap.max_send_wr = IBLND_SEND_WRS; + init_qp_attr->cap.max_recv_wr = IBLND_RECV_WRS; init_qp_attr->cap.max_send_sge = 1; init_qp_attr->cap.max_recv_sge = 1; init_qp_attr->sq_sig_type = IB_SIGNAL_REQ_WR; @@ -850,6 +854,7 @@ kiblnd_destroy_conn (kib_conn_t *conn) LASSERT (!in_interrupt()); LASSERT (atomic_read(&conn->ibc_refcount) == 0); LASSERT (list_empty(&conn->ibc_early_rxs)); + LASSERT (list_empty(&conn->ibc_tx_noops)); LASSERT (list_empty(&conn->ibc_tx_queue)); LASSERT (list_empty(&conn->ibc_tx_queue_rsrvd)); LASSERT (list_empty(&conn->ibc_tx_queue_nocred)); diff --git 
a/lnet/klnds/o2iblnd/o2iblnd.h b/lnet/klnds/o2iblnd/o2iblnd.h index 2ada24f18d..f7926c7e34 100644 --- a/lnet/klnds/o2iblnd/o2iblnd.h +++ b/lnet/klnds/o2iblnd/o2iblnd.h @@ -98,13 +98,15 @@ typedef int gfp_t; #define IBLND_TX_MSG_PAGES() ((IBLND_TX_MSG_BYTES() + PAGE_SIZE - 1)/PAGE_SIZE) /* RX messages (per connection) */ -#define IBLND_RX_MSGS (IBLND_MSG_QUEUE_SIZE*2) +#define IBLND_RX_MSGS (IBLND_MSG_QUEUE_SIZE * 2) #define IBLND_RX_MSG_BYTES (IBLND_RX_MSGS * IBLND_MSG_SIZE) #define IBLND_RX_MSG_PAGES ((IBLND_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) -#define IBLND_CQ_ENTRIES() (IBLND_RX_MSGS + \ - (*kiblnd_tunables.kib_concurrent_sends) * \ +/* WRs and CQEs (per connection) */ +#define IBLND_RECV_WRS IBLND_RX_MSGS +#define IBLND_SEND_WRS ((*kiblnd_tunables.kib_concurrent_sends) * \ (1 + IBLND_MAX_RDMA_FRAGS)) +#define IBLND_CQ_ENTRIES() (IBLND_RECV_WRS + IBLND_SEND_WRS) typedef struct { @@ -393,6 +395,7 @@ typedef struct kib_conn int ibc_ready:1; /* CQ callback fired */ unsigned long ibc_last_send; /* time of last send */ struct list_head ibc_early_rxs; /* rxs completed before ESTABLISHED */ + struct list_head ibc_tx_noops; /* IBLND_MSG_NOOPs */ struct list_head ibc_tx_queue; /* sends that need a credit */ struct list_head ibc_tx_queue_nocred;/* sends that don't need a credit */ struct list_head ibc_tx_queue_rsrvd; /* sends that need to reserve an ACK/DONE msg */ @@ -507,6 +510,28 @@ kiblnd_send_keepalive(kib_conn_t *conn) *kiblnd_tunables.kib_keepalive*HZ); } +static inline int +kiblnd_send_noop(kib_conn_t *conn) +{ + LASSERT (conn->ibc_state >= IBLND_CONN_ESTABLISHED); + + if (conn->ibc_outstanding_credits < IBLND_CREDIT_HIGHWATER && + !kiblnd_send_keepalive(conn)) + return 0; /* No need to send NOOP */ + + if (!list_empty(&conn->ibc_tx_noops) || /* NOOP already queued */ + !list_empty(&conn->ibc_tx_queue_nocred) || /* can be piggybacked */ + conn->ibc_credits == 0) /* no credit */ + return 0; + + if (conn->ibc_credits == 1 && /* last credit reserved for */ + 
conn->ibc_outstanding_credits == 0) /* giving back credits */ + return 0; + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&conn->ibc_tx_queue) || conn->ibc_credits == 1); +} + static inline void kiblnd_abort_receives(kib_conn_t *conn) { diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index e4504ae858..3b7d6c6498 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -376,6 +376,10 @@ kiblnd_handle_rx (kib_rx_t *rx) conn->ibc_credits += credits; + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ibm_type == IBLND_MSG_NOOP) + conn->ibc_outstanding_credits++; + spin_unlock(&conn->ibc_lock); kiblnd_check_sends(conn); } @@ -389,7 +393,10 @@ kiblnd_handle_rx (kib_rx_t *rx) break; case IBLND_MSG_NOOP: - post_credit = IBLND_POSTRX_PEER_CREDIT; + if (credits != 0) /* credit already posted */ + post_credit = IBLND_POSTRX_NO_CREDIT; + else /* a keepalive NOOP */ + post_credit = IBLND_POSTRX_PEER_CREDIT; break; case IBLND_MSG_IMMEDIATE: @@ -887,10 +894,7 @@ kiblnd_check_sends (kib_conn_t *conn) conn->ibc_reserved_credits--; } - if (list_empty(&conn->ibc_tx_queue) && - list_empty(&conn->ibc_tx_queue_nocred) && - (conn->ibc_outstanding_credits >= IBLND_CREDIT_HIGHWATER || - kiblnd_send_keepalive(conn))) { + if (kiblnd_send_noop(conn)) { spin_unlock(&conn->ibc_lock); tx = kiblnd_get_idle_tx(ni); @@ -904,13 +908,17 @@ kiblnd_check_sends (kib_conn_t *conn) } for (;;) { - if (!list_empty (&conn->ibc_tx_queue_nocred)) { - tx = list_entry (conn->ibc_tx_queue_nocred.next, - kib_tx_t, tx_list); + if (!list_empty(&conn->ibc_tx_queue_nocred)) { + tx = list_entry(conn->ibc_tx_queue_nocred.next, + kib_tx_t, tx_list); consume_cred = 0; - } else if (!list_empty (&conn->ibc_tx_queue)) { - tx = list_entry (conn->ibc_tx_queue.next, - kib_tx_t, tx_list); + } else if (!list_empty(&conn->ibc_tx_noops)) { + tx = list_entry(conn->ibc_tx_noops.next, + kib_tx_t, tx_list); + 
consume_cred = 1; + } else if (!list_empty(&conn->ibc_tx_queue)) { + tx = list_entry(conn->ibc_tx_queue.next, + kib_tx_t, tx_list); consume_cred = 1; } else { /* nothing to send right now */ @@ -939,27 +947,25 @@ kiblnd_check_sends (kib_conn_t *conn) if (conn->ibc_credits == 0) { /* no credits */ CDEBUG(D_NET, "%s: no credits\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); - break; + break; /* NB ibc_tx_queue_nocred checked */ } - if (conn->ibc_credits == 1 && /* last credit reserved for */ - conn->ibc_outstanding_credits == 0) { /* giving back credits */ + /* Last credit reserved for NOOP */ + if (conn->ibc_credits == 1 && + tx->tx_msg->ibm_type != IBLND_MSG_NOOP) { CDEBUG(D_NET, "%s: not using last credit\n", libcfs_nid2str(conn->ibc_peer->ibp_nid)); - break; + break; /* NB ibc_tx_noops checked */ } } - list_del (&tx->tx_list); + list_del(&tx->tx_list); tx->tx_queued = 0; /* NB don't drop ibc_lock before bumping tx_sending */ if (tx->tx_msg->ibm_type == IBLND_MSG_NOOP && - (!list_empty(&conn->ibc_tx_queue) || - !list_empty(&conn->ibc_tx_queue_nocred) || - (conn->ibc_outstanding_credits < IBLND_CREDIT_HIGHWATER && - !kiblnd_send_keepalive(conn)))) { + !kiblnd_send_noop(conn)) { /* redundant NOOP */ spin_unlock(&conn->ibc_lock); kiblnd_tx_done(ni, tx); @@ -1304,6 +1310,9 @@ kiblnd_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) break; case IBLND_MSG_NOOP: + q = &conn->ibc_tx_noops; + break; + case IBLND_MSG_IMMEDIATE: q = &conn->ibc_tx_queue; break; @@ -1906,6 +1915,7 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error) return; /* already being handled */ if (error == 0 && + list_empty(&conn->ibc_tx_noops) && list_empty(&conn->ibc_tx_queue) && list_empty(&conn->ibc_tx_queue_rsrvd) && list_empty(&conn->ibc_tx_queue_nocred) && @@ -1913,9 +1923,10 @@ kiblnd_close_conn_locked (kib_conn_t *conn, int error) CDEBUG(D_NET, "closing conn to %s\n", libcfs_nid2str(peer->ibp_nid)); } else { - CDEBUG(D_NETERROR, "Closing conn to %s: error %d%s%s%s%s\n", + CDEBUG(D_NETERROR, 
"Closing conn to %s: error %d%s%s%s%s%s\n", libcfs_nid2str(peer->ibp_nid), error, list_empty(&conn->ibc_tx_queue) ? "" : "(sending)", + list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)", list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)", list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)", list_empty(&conn->ibc_active_txs) ? "" : "(waiting)"); @@ -2030,6 +2041,7 @@ kiblnd_finalise_conn (kib_conn_t *conn) /* Complete all tx descs not waiting for sends to complete. * NB we should be safe from RDMA now that the QP has changed state */ + kiblnd_abort_txs(conn, &conn->ibc_tx_noops); kiblnd_abort_txs(conn, &conn->ibc_tx_queue); kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd); kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred); @@ -2334,8 +2346,8 @@ kiblnd_passive_connect (struct rdma_cm_id *cmid, void *priv, int priv_nob) /* conn now "owns" cmid, so I return success from here on to ensure the * CM callback doesn't destroy cmid. */ - conn->ibc_incarnation = reqmsg->ibm_srcstamp; - conn->ibc_credits = IBLND_MSG_QUEUE_SIZE; + conn->ibc_incarnation = reqmsg->ibm_srcstamp; + conn->ibc_credits = IBLND_MSG_QUEUE_SIZE; conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE; LASSERT (conn->ibc_credits + conn->ibc_reserved_credits <= IBLND_RX_MSGS); @@ -2544,8 +2556,8 @@ kiblnd_check_connreply (kib_conn_t *conn, void *priv, int priv_nob) goto failed; } - conn->ibc_incarnation = msg->ibm_srcstamp; - conn->ibc_credits = IBLND_MSG_QUEUE_SIZE; + conn->ibc_incarnation = msg->ibm_srcstamp; + conn->ibc_credits = IBLND_MSG_QUEUE_SIZE; conn->ibc_reserved_credits = IBLND_MSG_QUEUE_SIZE; LASSERT (conn->ibc_credits + conn->ibc_reserved_credits <= IBLND_RX_MSGS); @@ -2808,6 +2820,7 @@ int kiblnd_conn_timed_out (kib_conn_t *conn) { return kiblnd_check_txs(conn, &conn->ibc_tx_queue) || + kiblnd_check_txs(conn, &conn->ibc_tx_noops) || kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd) || kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred) || kiblnd_check_txs(conn, 
&conn->ibc_active_txs); diff --git a/lnet/klnds/ptllnd/ptllnd.c b/lnet/klnds/ptllnd/ptllnd.c index f020fac693..0a556877ca 100755 --- a/lnet/klnds/ptllnd/ptllnd.c +++ b/lnet/klnds/ptllnd/ptllnd.c @@ -475,6 +475,12 @@ kptllnd_startup (lnet_ni_t *ni) return -EINVAL; } + /* kptl_msg_t::ptlm_credits is only a __u8 */ + if (*kptllnd_tunables.kptl_peercredits > 255) { + CERROR("kptl_peercredits must be <= 255\n"); + return -EINVAL; + } + *kptllnd_tunables.kptl_max_msg_size &= ~7; if (*kptllnd_tunables.kptl_max_msg_size < PTLLND_MIN_BUFFER_SIZE) *kptllnd_tunables.kptl_max_msg_size = PTLLND_MIN_BUFFER_SIZE; diff --git a/lnet/klnds/ptllnd/ptllnd.h b/lnet/klnds/ptllnd/ptllnd.h index 2e6e8a49d2..b1d436015d 100755 --- a/lnet/klnds/ptllnd/ptllnd.h +++ b/lnet/klnds/ptllnd/ptllnd.h @@ -136,6 +136,10 @@ typedef struct kptl_rx /* receive message */ char rx_space[0]; /* copy of incoming request */ } kptl_rx_t; +#define PTLLND_POSTRX_DONT_POST 0 /* don't post */ +#define PTLLND_POSTRX_NO_CREDIT 1 /* post: no credits */ +#define PTLLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */ + typedef struct kptl_rx_buffer_pool { spinlock_t rxbp_lock; @@ -217,6 +221,7 @@ struct kptl_peer atomic_t peer_refcount; /* The current refrences */ enum kptllnd_peer_state peer_state; spinlock_t peer_lock; /* serialize */ + struct list_head peer_noops; /* PTLLND_MSG_TYPE_NOOP txs */ struct list_head peer_sendq; /* txs waiting for mh handles */ struct list_head peer_activeq; /* txs awaiting completion */ lnet_process_id_t peer_id; /* Peer's LNET id */ @@ -401,8 +406,8 @@ kptllnd_rx_buffer_decref(kptl_rx_buffer_t *rxb) /* * RX SUPPORT FUNCTIONS */ -void kptllnd_rx_done(kptl_rx_t *rx); void kptllnd_rx_parse(kptl_rx_t *rx); +void kptllnd_rx_done(kptl_rx_t *rx, int post_credit); /* * PEER SUPPORT FUNCTIONS diff --git a/lnet/klnds/ptllnd/ptllnd_cb.c b/lnet/klnds/ptllnd/ptllnd_cb.c index ed3bb958b9..1903fc68b3 100644 --- a/lnet/klnds/ptllnd/ptllnd_cb.c +++ b/lnet/klnds/ptllnd/ptllnd_cb.c @@ -598,7 
+598,7 @@ kptllnd_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, /* * We're done with the RX */ - kptllnd_rx_done(rx); + kptllnd_rx_done(rx, PTLLND_POSTRX_PEER_CREDIT); return rc; } diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c index 98e174fd32..f4e67f4bd7 100644 --- a/lnet/klnds/ptllnd/ptllnd_peer.c +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -158,6 +158,7 @@ kptllnd_peer_allocate (lnet_process_id_t lpid, ptl_process_id_t ppid) memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + INIT_LIST_HEAD (&peer->peer_noops); INIT_LIST_HEAD (&peer->peer_sendq); INIT_LIST_HEAD (&peer->peer_activeq); spin_lock_init (&peer->peer_lock); @@ -205,6 +206,7 @@ kptllnd_peer_destroy (kptl_peer_t *peer) LASSERT (atomic_read(&peer->peer_refcount) == 0); LASSERT (peer->peer_state == PEER_STATE_ALLOCATED || peer->peer_state == PEER_STATE_ZOMBIE); + LASSERT (list_empty(&peer->peer_noops)); LASSERT (list_empty(&peer->peer_sendq)); LASSERT (list_empty(&peer->peer_activeq)); @@ -245,6 +247,7 @@ kptllnd_peer_cancel_txs(kptl_peer_t *peer, struct list_head *txs) spin_lock_irqsave(&peer->peer_lock, flags); + kptllnd_cancel_txlist(&peer->peer_noops, txs); kptllnd_cancel_txlist(&peer->peer_sendq, txs); kptllnd_cancel_txlist(&peer->peer_activeq, txs); @@ -519,7 +522,9 @@ kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag) tx->tx_msg_mdh = msg_mdh; /* Ensure HELLO is sent first */ - if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) + if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP) + list_add(&tx->tx_list, &peer->peer_noops); + else if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) list_add(&tx->tx_list, &peer->peer_sendq); else list_add_tail(&tx->tx_list, &peer->peer_sendq); @@ -527,12 +532,26 @@ kptllnd_post_tx(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag) spin_unlock_irqrestore(&peer->peer_lock, flags); } +static inline int +kptllnd_peer_send_noop (kptl_peer_t *peer) +{ + if (!peer->peer_sent_hello || + peer->peer_credits == 0 || 
+ !list_empty(&peer->peer_noops) || + peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER) + return 0; + + /* No tx to piggyback NOOP onto or no credit to send a tx */ + return (list_empty(&peer->peer_sendq) || peer->peer_credits == 1); +} + void kptllnd_peer_check_sends (kptl_peer_t *peer) { ptl_handle_me_t meh; kptl_tx_t *tx; int rc; + int msg_type; unsigned long flags; LASSERT(!in_interrupt()); @@ -541,10 +560,7 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) peer->peer_retry_noop = 0; - if (list_empty(&peer->peer_sendq) && - peer->peer_outstanding_credits >= PTLLND_CREDIT_HIGHWATER && - peer->peer_credits != 0) { - + if (kptllnd_peer_send_noop(peer)) { /* post a NOOP to return credits */ spin_unlock_irqrestore(&peer->peer_lock, flags); @@ -561,8 +577,18 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) peer->peer_retry_noop = (tx == NULL); } - while (!list_empty(&peer->peer_sendq)) { - tx = list_entry (peer->peer_sendq.next, kptl_tx_t, tx_list); + for (;;) { + if (!list_empty(&peer->peer_noops)) { + LASSERT (peer->peer_sent_hello); + tx = list_entry(peer->peer_noops.next, + kptl_tx_t, tx_list); + } else if (!list_empty(&peer->peer_sendq)) { + tx = list_entry(peer->peer_sendq.next, + kptl_tx_t, tx_list); + } else { + /* nothing to send right now */ + break; + } LASSERT (tx->tx_active); LASSERT (!PtlHandleIsEqual(tx->tx_msg_mdh, PTL_INVALID_HANDLE)); @@ -575,32 +601,37 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) *kptllnd_tunables.kptl_peercredits); LASSERT (peer->peer_credits >= 0); - /* Ensure HELLO is sent first */ - if (!peer->peer_sent_hello) { - if (tx->tx_msg->ptlm_type != PTLLND_MSG_TYPE_HELLO) - break; - peer->peer_sent_hello = 1; - } + msg_type = tx->tx_msg->ptlm_type; + + /* Ensure HELLO is sent first */ + if (!peer->peer_sent_hello) { + LASSERT (list_empty(&peer->peer_noops)); + if (msg_type != PTLLND_MSG_TYPE_HELLO) + break; + peer->peer_sent_hello = 1; + } if (peer->peer_credits == 0) { - CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %p\n", 
+ CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: no credits for %s[%p]\n", libcfs_id2str(peer->peer_id), peer->peer_credits, peer->peer_outstanding_credits, - peer->peer_sent_credits, tx); + peer->peer_sent_credits, + kptllnd_msgtype2str(msg_type), tx); break; } - /* Don't use the last credit unless I've got credits to - * return */ + /* Last/Initial credit reserved for NOOP/HELLO */ if (peer->peer_credits == 1 && - peer->peer_outstanding_credits == 0) { + msg_type != PTLLND_MSG_TYPE_HELLO && + msg_type != PTLLND_MSG_TYPE_NOOP) { CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: " - "not using last credit for %p\n", + "not using last credit for %s[%p]\n", libcfs_id2str(peer->peer_id), peer->peer_credits, peer->peer_outstanding_credits, - peer->peer_sent_credits, tx); + peer->peer_sent_credits, + kptllnd_msgtype2str(msg_type), tx); break; } @@ -608,10 +639,8 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) /* Discard any NOOP I queued if I'm not at the high-water mark * any more or more messages have been queued */ - if (tx->tx_msg->ptlm_type == PTLLND_MSG_TYPE_NOOP && - (!list_empty(&peer->peer_sendq) || - peer->peer_outstanding_credits < PTLLND_CREDIT_HIGHWATER)) { - + if (msg_type == PTLLND_MSG_TYPE_NOOP && + !kptllnd_peer_send_noop(peer)) { tx->tx_active = 0; spin_unlock_irqrestore(&peer->peer_lock, flags); @@ -636,7 +665,7 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) tx->tx_msg->ptlm_u.rdma.kptlrm_matchbits = peer->peer_next_matchbits++; } - + peer->peer_sent_credits += peer->peer_outstanding_credits; peer->peer_outstanding_credits = 0; peer->peer_credits--; @@ -644,8 +673,7 @@ kptllnd_peer_check_sends (kptl_peer_t *peer) CDEBUG(D_NETTRACE, "%s[%d/%d+%d]: %s tx=%p nob=%d cred=%d\n", libcfs_id2str(peer->peer_id), peer->peer_credits, peer->peer_outstanding_credits, peer->peer_sent_credits, - kptllnd_msgtype2str(tx->tx_msg->ptlm_type), - tx, tx->tx_msg->ptlm_nob, + kptllnd_msgtype2str(msg_type), tx, tx->tx_msg->ptlm_nob, tx->tx_msg->ptlm_credits); list_add_tail(&tx->tx_list, 
&peer->peer_activeq); diff --git a/lnet/klnds/ptllnd/ptllnd_rx_buf.c b/lnet/klnds/ptllnd/ptllnd_rx_buf.c index 847e265e28..356660c0a2 100644 --- a/lnet/klnds/ptllnd/ptllnd_rx_buf.c +++ b/lnet/klnds/ptllnd/ptllnd_rx_buf.c @@ -331,12 +331,15 @@ kptllnd_rx_alloc(void) } void -kptllnd_rx_done(kptl_rx_t *rx) +kptllnd_rx_done(kptl_rx_t *rx, int post_credit) { kptl_rx_buffer_t *rxb = rx->rx_rxb; kptl_peer_t *peer = rx->rx_peer; unsigned long flags; + LASSERT (post_credit == PTLLND_POSTRX_NO_CREDIT || + post_credit == PTLLND_POSTRX_PEER_CREDIT); + CDEBUG(D_NET, "rx=%p rxb %p peer %p\n", rx, rxb, peer); if (rxb != NULL) @@ -346,7 +349,9 @@ kptllnd_rx_done(kptl_rx_t *rx) /* Update credits (after I've decref-ed the buffer) */ spin_lock_irqsave(&peer->peer_lock, flags); - peer->peer_outstanding_credits++; + if (post_credit == PTLLND_POSTRX_PEER_CREDIT) + peer->peer_outstanding_credits++; + LASSERT (peer->peer_outstanding_credits + peer->peer_sent_credits <= *kptllnd_tunables.kptl_peercredits); @@ -515,6 +520,7 @@ void kptllnd_rx_parse(kptl_rx_t *rx) { kptl_msg_t *msg = rx->rx_msg; + int post_credit = PTLLND_POSTRX_PEER_CREDIT; kptl_peer_t *peer; int rc; unsigned long flags; @@ -642,7 +648,7 @@ kptllnd_rx_parse(kptl_rx_t *rx) int c = peer->peer_credits; int oc = peer->peer_outstanding_credits; int sc = peer->peer_sent_credits; - + spin_unlock_irqrestore(&peer->peer_lock, flags); CERROR("%s: buffer overrun [%d/%d+%d]\n", @@ -655,6 +661,12 @@ kptllnd_rx_parse(kptl_rx_t *rx) * buffers after the startup handshake. 
*/ peer->peer_credits += msg->ptlm_credits; + /* This ensures the credit taken by NOOP can be returned */ + if (msg->ptlm_type == PTLLND_MSG_TYPE_NOOP) { + peer->peer_outstanding_credits++; + post_credit = PTLLND_POSTRX_NO_CREDIT; + } + spin_unlock_irqrestore(&peer->peer_lock, flags); /* See if something can go out now that credits have come in */ @@ -723,5 +735,5 @@ kptllnd_rx_parse(kptl_rx_t *rx) if (rx->rx_peer == NULL) /* drop ref on peer */ kptllnd_peer_decref(peer); /* unless rx_done will */ rx_done: - kptllnd_rx_done(rx); + kptllnd_rx_done(rx, post_credit); } -- GitLab