Skip to content
Snippets Groups Projects
Commit 43f00580 authored by Amir Shehata, committed by Oleg Drokin
Browse files

LU-9549 lnet: prevent assert on ln_state


lnet_peer_primary_nid() is called from lnet_parse(). It checks
ln_state outside the net lock, causing a race condition
during shutdown where the code expects the state to be
running, but it is already stopping or shut down.

Fixed the issue by renaming lnet_peer_primary_nid() to
lnet_peer_primary_nid_locked(). This function is now called
when lnet_net_lock is held in lnet_parse().

In lnet_create_reply_msg() we already have access to the
msg_txpeer, so we look up the primary_nid directly.

Signed-off-by: Amir Shehata <amir.shehata@intel.com>
Change-Id: I0518cdbec95b38bd8690517320b601676ae259f0
Reviewed-on: https://review.whamcloud.com/27262


Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Doug Oucharek <doug.s.oucharek@intel.com>
Reviewed-by: Sonia Sharma <sonia.sharma@intel.com>
Reviewed-by: Olaf Weber <olaf.weber@hpe.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
parent 01d7ddd0
No related branches found
No related tags found
No related merge requests found
...@@ -805,7 +805,7 @@ struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, int cpt); ...@@ -805,7 +805,7 @@ struct lnet_peer_ni *lnet_nid2peerni_locked(lnet_nid_t nid, int cpt);
struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt); struct lnet_peer_ni *lnet_nid2peerni_ex(lnet_nid_t nid, int cpt);
struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid); struct lnet_peer_ni *lnet_find_peer_ni_locked(lnet_nid_t nid);
void lnet_peer_net_added(struct lnet_net *net); void lnet_peer_net_added(struct lnet_net *net);
lnet_nid_t lnet_peer_primary_nid(lnet_nid_t nid); lnet_nid_t lnet_peer_primary_nid_locked(lnet_nid_t nid);
void lnet_peer_tables_cleanup(struct lnet_net *net); void lnet_peer_tables_cleanup(struct lnet_net *net);
void lnet_peer_uninit(void); void lnet_peer_uninit(void);
int lnet_peer_tables_create(void); int lnet_peer_tables_create(void);
......
...@@ -2524,8 +2524,6 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, ...@@ -2524,8 +2524,6 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
msg->msg_hdr.dest_pid = dest_pid; msg->msg_hdr.dest_pid = dest_pid;
msg->msg_hdr.payload_length = payload_length; msg->msg_hdr.payload_length = payload_length;
} }
/* Multi-Rail: Primary NID of source. */
msg->msg_initiator = lnet_peer_primary_nid(src_nid);
lnet_net_lock(cpt); lnet_net_lock(cpt);
lpni = lnet_nid2peerni_locked(from_nid, cpt); lpni = lnet_nid2peerni_locked(from_nid, cpt);
...@@ -2544,6 +2542,8 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, ...@@ -2544,6 +2542,8 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid,
msg->msg_rxpeer = lpni; msg->msg_rxpeer = lpni;
msg->msg_rxni = ni; msg->msg_rxni = ni;
lnet_ni_addref_locked(ni, cpt); lnet_ni_addref_locked(ni, cpt);
/* Multi-Rail: Primary NID of source. */
msg->msg_initiator = lnet_peer_primary_nid_locked(src_nid);
if (lnet_isrouter(msg->msg_rxpeer)) { if (lnet_isrouter(msg->msg_rxpeer)) {
lnet_peer_set_alive(msg->msg_rxpeer); lnet_peer_set_alive(msg->msg_rxpeer);
...@@ -2845,8 +2845,7 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg) ...@@ -2845,8 +2845,7 @@ lnet_create_reply_msg(struct lnet_ni *ni, struct lnet_msg *getmsg)
libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd); libcfs_nid2str(ni->ni_nid), libcfs_id2str(peer_id), getmd);
/* setup information for lnet_build_msg_event */ /* setup information for lnet_build_msg_event */
msg->msg_initiator = lnet_peer_primary_nid(peer_id.nid); msg->msg_initiator = getmsg->msg_txpeer->lpni_peer_net->lpn_peer->lp_primary_nid;
/* Cheaper: msg->msg_initiator = getmsg->msg_txpeer->lp_nid; */
msg->msg_from = peer_id.nid; msg->msg_from = peer_id.nid;
msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */ msg->msg_type = LNET_MSG_GET; /* flag this msg as an "optimized" GET */
msg->msg_hdr.src_nid = peer_id.nid; msg->msg_hdr.src_nid = peer_id.nid;
......
...@@ -587,19 +587,16 @@ lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni) ...@@ -587,19 +587,16 @@ lnet_peer_is_ni_pref_locked(struct lnet_peer_ni *lpni, struct lnet_ni *ni)
} }
lnet_nid_t lnet_nid_t
lnet_peer_primary_nid(lnet_nid_t nid) lnet_peer_primary_nid_locked(lnet_nid_t nid)
{ {
struct lnet_peer_ni *lpni; struct lnet_peer_ni *lpni;
lnet_nid_t primary_nid = nid; lnet_nid_t primary_nid = nid;
int cpt;
cpt = lnet_net_lock_current();
lpni = lnet_find_peer_ni_locked(nid); lpni = lnet_find_peer_ni_locked(nid);
if (lpni) { if (lpni) {
primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid; primary_nid = lpni->lpni_peer_net->lpn_peer->lp_primary_nid;
lnet_peer_ni_decref_locked(lpni); lnet_peer_ni_decref_locked(lpni);
} }
lnet_net_unlock(cpt);
return primary_nid; return primary_nid;
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment