diff --git a/lnet/ChangeLog b/lnet/ChangeLog index a5aeb9125d1ff0c4fd227be8c59cb076243a1793..0bbc28ceaf00d3b27ba44a97ada54e2b870daadc 100644 --- a/lnet/ChangeLog +++ b/lnet/ChangeLog @@ -13,6 +13,15 @@ TBD Cluster File Systems, Inc. <info@clusterfs.com> ptllnd - Portals 3.3 / UNICOS/lc 1.5.x, 2.0.x * bug fixes +Severity : major +Frequency : after Ptllnd timeouts and portals congestion +Bugzilla : 11659 +Description: Credit overflows +Details : This was a bug in ptllnd connection establishment. The fix + implements better peer stamps to disambiguate connection + establishment and ensure both peers enter the credit flow + state machine consistently. + Severity : major Frequency : rare Bugzilla : 11394 diff --git a/lnet/klnds/ptllnd/ptllnd.c b/lnet/klnds/ptllnd/ptllnd.c index a82babeea83fc3845fd114486879e796d4a6f3fc..5723c8aaec07e2bec939c5c5608b8159c6e39545 100755 --- a/lnet/klnds/ptllnd/ptllnd.c +++ b/lnet/klnds/ptllnd/ptllnd.c @@ -198,7 +198,7 @@ kptllnd_msg_pack(kptl_msg_t *msg, kptl_peer_t *peer) /* msg->ptlm_nob Filled in kptllnd_init_msg() */ msg->ptlm_cksum = 0; msg->ptlm_srcnid = kptllnd_data.kptl_ni->ni_nid; - msg->ptlm_srcstamp = kptllnd_data.kptl_incarnation; + msg->ptlm_srcstamp = peer->peer_myincarnation; msg->ptlm_dstnid = peer->peer_id.nid; msg->ptlm_dststamp = peer->peer_incarnation; msg->ptlm_srcpid = the_lnet.ln_pid; @@ -527,9 +527,9 @@ kptllnd_startup (lnet_ni_t *ni) kptllnd_ptlid2str(kptllnd_data.kptl_portals_id), libcfs_nid2str(ni->ni_nid)); - /* - * Initialized the incarnation - */ + /* Initialized the incarnation - it must be for-all-time unique, even + * accounting for the fact that we increment it when we disconnect a + * peer that's using it */ do_gettimeofday(&tv); kptllnd_data.kptl_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; diff --git a/lnet/klnds/ptllnd/ptllnd.h b/lnet/klnds/ptllnd/ptllnd.h index 4072a41671a6c2cd58e80e250a318d05f4282777..598c4b811c53150b7e0694db43b1f7039c5f9273 100755 --- a/lnet/klnds/ptllnd/ptllnd.h +++ b/lnet/klnds/ptllnd/ptllnd.h @@ -217,9 +217,11 @@ struct kptl_peer lnet_process_id_t peer_id; /* Peer's LNET id */ ptl_process_id_t peer_ptlid; /* Peer's portals id */ __u64 peer_incarnation; /* peer's incarnation */ + __u64 peer_myincarnation; /* my incarnation at HELLO */ int peer_sent_hello; /* have I sent HELLO? */ int peer_credits; /* number of send credits */ - int peer_outstanding_credits;/* number of peer credits */ + int peer_outstanding_credits;/* number of peer credits to return */ + int peer_active_rxs; /* # rx-es being handled */ int peer_error; /* errno on closing this peer */ cfs_time_t peer_last_alive; /* when (in jiffies) I was last alive */ __u64 peer_next_matchbits; /* Next value to register RDMA from peer */ diff --git a/lnet/klnds/ptllnd/ptllnd_peer.c b/lnet/klnds/ptllnd/ptllnd_peer.c index f8999e0b14f2e816de929f71b02223ec3b0b160f..0f9e7e046bc756a2972d96a8868624470704f677 100644 --- a/lnet/klnds/ptllnd/ptllnd_peer.c +++ b/lnet/klnds/ptllnd/ptllnd_peer.c @@ -169,11 +169,14 @@ kptllnd_peer_allocate (lnet_process_id_t lpid, ptl_process_id_t ppid) peer->peer_credits = 1; /* enough for HELLO */ peer->peer_next_matchbits = PTL_RESERVED_MATCHBITS; peer->peer_outstanding_credits = *kptllnd_tunables.kptl_peercredits - 1; + peer->peer_active_rxs = 0; atomic_set(&peer->peer_refcount, 1); /* 1 ref for caller */ write_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags); + peer->peer_myincarnation = kptllnd_data.kptl_incarnation; + /* Only increase # peers under lock, to guarantee we dont grow it * during shutdown */ if (kptllnd_data.kptl_shutdown) { @@ -198,6 +201,7 @@ kptllnd_peer_destroy (kptl_peer_t *peer) LASSERT (!in_interrupt()); LASSERT (atomic_read(&peer->peer_refcount) == 0); + LASSERT (peer->peer_active_rxs == 0); LASSERT (peer->peer_state == PEER_STATE_ALLOCATED || peer->peer_state == PEER_STATE_ZOMBIE); LASSERT (list_empty(&peer->peer_sendq)); @@ -350,6 +354,11 @@ kptllnd_peer_close_locked(kptl_peer_t *peer, int why) case PEER_STATE_WAITING_HELLO: case PEER_STATE_ACTIVE: + /* Ensure new peers see a new incarnation of me */ + LASSERT(peer->peer_myincarnation <= kptllnd_data.kptl_incarnation); + if (peer->peer_myincarnation == kptllnd_data.kptl_incarnation) + kptllnd_data.kptl_incarnation++; + /* Removing from peer table */ kptllnd_data.kptl_n_active_peers--; LASSERT (kptllnd_data.kptl_n_active_peers >= 0); @@ -946,15 +955,42 @@ kptllnd_peer_handle_hello (ptl_process_id_t initiator, /* Completing HELLO handshake */ LASSERT(peer->peer_incarnation == 0); + if (msg->ptlm_dststamp != 0 && + msg->ptlm_dststamp != peer->peer_myincarnation) { + write_unlock_irqrestore(g_lock, flags); + + CERROR("Ignoring HELLO from %s: unexpected " + "dststamp "LPX64" ("LPX64" wanted)\n", + libcfs_id2str(lpid), + msg->ptlm_dststamp, + peer->peer_myincarnation); + kptllnd_peer_decref(peer); + return NULL; + } + + /* Concurrent initiation or response to my HELLO */ peer->peer_state = PEER_STATE_ACTIVE; peer->peer_incarnation = msg->ptlm_srcstamp; peer->peer_next_matchbits = safe_matchbits; - + write_unlock_irqrestore(g_lock, flags); return peer; } - /* remove old incarnation of this peer */ + if (msg->ptlm_dststamp != 0 && + msg->ptlm_dststamp <= peer->peer_myincarnation) { + write_unlock_irqrestore(g_lock, flags); + + CERROR("Ignoring stale HELLO from %s: " + "dststamp "LPX64" (current "LPX64")\n", + libcfs_id2str(lpid), + msg->ptlm_dststamp, + peer->peer_myincarnation); + kptllnd_peer_decref(peer); + return NULL; + } + + /* Brand new connection attempt: remove old incarnation */ kptllnd_peer_close_locked(peer, 0); } diff --git a/lnet/klnds/ptllnd/ptllnd_rx_buf.c b/lnet/klnds/ptllnd/ptllnd_rx_buf.c index b83ab518344cb16f8c9a3471e943e6b56405708a..ad0f05d9460cfb3fdae47b6151a640b5395dcdf2 100644 --- a/lnet/klnds/ptllnd/ptllnd_rx_buf.c +++ b/lnet/klnds/ptllnd/ptllnd_rx_buf.c @@ -344,6 +344,9 @@ kptllnd_rx_done(kptl_rx_t *rx) /* Update credits (after I've decref-ed the buffer) */ spin_lock_irqsave(&peer->peer_lock, flags); + peer->peer_active_rxs--; + LASSERT (peer->peer_active_rxs >= 0); + peer->peer_outstanding_credits++; LASSERT (peer->peer_outstanding_credits <= *kptllnd_tunables.kptl_peercredits); @@ -581,11 +584,8 @@ kptllnd_rx_parse(kptl_rx_t *rx) if (msg->ptlm_type == PTLLND_MSG_TYPE_HELLO) { peer = kptllnd_peer_handle_hello(rx->rx_initiator, msg); - if (peer == NULL) { - CWARN("No peer for %s\n", - kptllnd_ptlid2str(rx->rx_initiator)); + if (peer == NULL) goto rx_done; - } } else { peer = kptllnd_id2peer(srcid); if (peer == NULL) { @@ -596,61 +596,74 @@ kptllnd_rx_parse(kptl_rx_t *rx) goto rx_done; } - /* Ignore anything else while I'm waiting for HELLO */ - if (peer->peer_state == PEER_STATE_WAITING_HELLO) { + /* Ignore anything apart from HELLO while I'm waiting for it and + * any messages for a previous incarnation of the connection */ + if (peer->peer_state == PEER_STATE_WAITING_HELLO || + msg->ptlm_dststamp < peer->peer_myincarnation) { kptllnd_peer_decref(peer); goto rx_done; } + + if (msg->ptlm_srcstamp != peer->peer_incarnation) { + CERROR("%s: Unexpected srcstamp "LPX64" " + "("LPX64" expected)\n", + libcfs_id2str(peer->peer_id), + msg->ptlm_srcstamp, + peer->peer_incarnation); + rc = -EPROTO; + goto failed; + } + + if (msg->ptlm_dststamp != peer->peer_myincarnation) { + CERROR("%s: Unexpected dststamp "LPX64" " + "("LPX64" expected)\n", + libcfs_id2str(peer->peer_id), msg->ptlm_dststamp, + peer->peer_myincarnation); + rc = -EPROTO; + goto failed; + } } LASSERT (msg->ptlm_srcnid == peer->peer_id.nid && msg->ptlm_srcpid == peer->peer_id.pid); - if (msg->ptlm_srcstamp != peer->peer_incarnation) { - CERROR("Stale rx from %s srcstamp "LPX64" expected "LPX64"\n", + spin_lock_irqsave(&peer->peer_lock, flags); + + if (peer->peer_active_rxs == *kptllnd_tunables.kptl_peercredits) { + spin_unlock_irqrestore(&peer->peer_lock, flags); + + CERROR("Message overflow from %s: handling %d already\n", libcfs_id2str(peer->peer_id), - msg->ptlm_srcstamp, - peer->peer_incarnation); + *kptllnd_tunables.kptl_peercredits); rc = -EPROTO; goto failed; } + + if (msg->ptlm_credits != 0 && + peer->peer_credits + msg->ptlm_credits > + *kptllnd_tunables.kptl_peercredits) { + credits = peer->peer_credits; + spin_unlock_irqrestore(&peer->peer_lock, flags); - if (msg->ptlm_dststamp != kptllnd_data.kptl_incarnation && - (msg->ptlm_type != PTLLND_MSG_TYPE_HELLO || /* HELLO sends a */ - msg->ptlm_dststamp != 0)) { /* zero dststamp */ - CERROR("Stale rx from %s dststamp "LPX64" expected "LPX64"\n", - libcfs_id2str(peer->peer_id), msg->ptlm_dststamp, - kptllnd_data.kptl_incarnation); + CERROR("Credit overflow from %s: %d + %d > %d\n", + libcfs_id2str(peer->peer_id), + credits, msg->ptlm_credits, + *kptllnd_tunables.kptl_peercredits); rc = -EPROTO; goto failed; } - if (msg->ptlm_credits != 0) { - spin_lock_irqsave(&peer->peer_lock, flags); - - if (peer->peer_credits + msg->ptlm_credits > - *kptllnd_tunables.kptl_peercredits) { - credits = peer->peer_credits; - spin_unlock_irqrestore(&peer->peer_lock, flags); - - CERROR("Credit overflow from %s: %d + %d > %d\n", - libcfs_id2str(peer->peer_id), - credits, msg->ptlm_credits, - *kptllnd_tunables.kptl_peercredits); - rc = -EPROTO; - goto failed; - } - - peer->peer_credits += msg->ptlm_credits; + /* ptllnd-level protocol correct: account credits */ + peer->peer_credits += msg->ptlm_credits; + peer->peer_active_rxs++; - spin_unlock_irqrestore(&peer->peer_lock, flags); + spin_unlock_irqrestore(&peer->peer_lock, flags); + /* See if something can go out now that credits have come in */ + if (msg->ptlm_credits != 0) kptllnd_peer_check_sends(peer); - } - /* ptllnd-level protocol correct - rx takes my ref on peer and increments - * peer_outstanding_credits when it completes */ - rx->rx_peer = peer; + rx->rx_peer = peer; /* do buffer accounting on rxdone */ kptllnd_peer_alive(peer); switch (msg->ptlm_type) {