diff --git a/lnet/autoconf/lustre-lnet.m4 b/lnet/autoconf/lustre-lnet.m4 index 41f42b5f3e84c2836d72c8c57a1e6deb8380b548..a20d639fbf8fbfe29bbcf9679133b3471a0d4576 100644 --- a/lnet/autoconf/lustre-lnet.m4 +++ b/lnet/autoconf/lustre-lnet.m4 @@ -186,6 +186,39 @@ AC_SUBST(IIBCPPFLAGS) AC_SUBST(IIBNAL) ]) +# +# LP_CONFIG_VIB +# +# check for Voltaire infiniband support +# +AC_DEFUN([LP_CONFIG_VIB], +[AC_MSG_CHECKING([if Voltaire IB kernel headers are present]) +VIBCPPFLAGS="-I/usr/local/include/ibhost-kdevel -DCPU_BE=0 -DCPU_LE=1 -DGSI_PASS_PORT_NUM" +EXTRA_KCFLAGS_save="$EXTRA_KCFLAGS" +EXTRA_KCFLAGS="$EXTRA_KCFLAGS $VIBCPPFLAGS" +LB_LINUX_TRY_COMPILE([ + #include <linux/list.h> + #include <vverbs.h> +],[ + vv_hca_h_t kib_hca; + vv_return_t retval; + + retval = vv_hca_open("ANY_HCA", NULL, &kib_hca); + + return retval == vv_return_ok ? 0 : 1; +],[ + AC_MSG_RESULT([yes]) + VIBNAL="vibnal" +],[ + AC_MSG_RESULT([no]) + VIBNAL="" + VIBCPPFLAGS="" +]) +EXTRA_KCFLAGS="$EXTRA_KCFLAGS_save" +AC_SUBST(VIBCPPFLAGS) +AC_SUBST(VIBNAL) +]) + # # LP_CONFIG_RANAL # @@ -336,6 +369,7 @@ if test $linux25 = 'no' ; then LP_CONFIG_OPENIB fi LP_CONFIG_IIB +LP_CONFIG_VIB LP_CONFIG_RANAL LP_STRUCT_PAGE_LIST @@ -474,6 +508,7 @@ AC_DEFUN([LP_CONDITIONALS], AM_CONDITIONAL(BUILD_GMNAL, test x$GMNAL = "xgmnal") AM_CONDITIONAL(BUILD_OPENIBNAL, test x$OPENIBNAL = "xopenibnal") AM_CONDITIONAL(BUILD_IIBNAL, test x$IIBNAL = "xiibnal") +AM_CONDITIONAL(BUILD_VIBNAL, test x$VIBNAL = "xvibnal") AM_CONDITIONAL(BUILD_RANAL, test x$RANAL = "xranal") ]) @@ -496,12 +531,14 @@ portals/knals/Makefile portals/knals/autoMakefile portals/knals/gmnal/Makefile portals/knals/gmnal/autoMakefile +portals/knals/openibnal/Makefile +portals/knals/openibnal/autoMakefile portals/knals/iibnal/Makefile portals/knals/iibnal/autoMakefile +portals/knals/vibnal/Makefile +portals/knals/vibnal/autoMakefile portals/knals/lonal/Makefile portals/knals/lonal/autoMakefile -portals/knals/openibnal/Makefile -portals/knals/openibnal/autoMakefile portals/knals/qswnal/Makefile portals/knals/qswnal/autoMakefile portals/knals/ranal/Makefile diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index 85284ce9f4033e78eb7df5c1a78c3a3f6b569b55..e5905144279c8c6eada4257236c166ed43ff3dce 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -653,6 +653,7 @@ enum { IIBNAL = 8, LONAL = 9, RANAL = 10, + VIBNAL = 11, NAL_ENUM_END_MARKER }; diff --git a/lnet/klnds/Makefile.in b/lnet/klnds/Makefile.in index 7e2e601e762830060d0d7e46ac6f681a896f5f98..f494a302b0c1d7349c2eb19ccc29394919279df7 100644 --- a/lnet/klnds/Makefile.in +++ b/lnet/klnds/Makefile.in @@ -2,6 +2,7 @@ @BUILD_RANAL_TRUE@subdir-m += ranal @BUILD_OPENIBNAL_TRUE@subdir-m += openibnal @BUILD_IIBNAL_TRUE@subdir-m += iibnal +@BUILD_VIBNAL_TRUE@subdir-m += vibnal @BUILD_QSWNAL_TRUE@subdir-m += qswnal subdir-m += socknal subdir-m += lonal diff --git a/lnet/klnds/autoMakefile.am b/lnet/klnds/autoMakefile.am index 4638188104d346873c330ebc5d6bc75a1c92d4cd..d28e3655fa962365d8f0d374587c41ab8c2a4728 100644 --- a/lnet/klnds/autoMakefile.am +++ b/lnet/klnds/autoMakefile.am @@ -3,4 +3,4 @@ # This code is issued under the GNU General Public License. 
# See the file COPYING in this distribution -SUBDIRS = gmnal iibnal openibnal qswnal socknal lonal ranal +SUBDIRS = lonal socknal qswnal gmnal openibnal iibnal vibnal ranal diff --git a/lnet/klnds/iiblnd/iiblnd.c b/lnet/klnds/iiblnd/iiblnd.c index 09908c9f2c956fb61c25506aeaa3cec61805f5ee..e59d066c26ead8a042584f4854d2ebd3a811c5a4 100644 --- a/lnet/klnds/iiblnd/iiblnd.c +++ b/lnet/klnds/iiblnd/iiblnd.c @@ -1381,7 +1381,7 @@ kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kibnal_data.kib_init = IBNAL_INIT_DATA; /*****************************************************/ - process_id.pid = 0; + process_id.pid = requested_pid; process_id.nid = kibnal_data.kib_nid; rc = lib_init(&kibnal_lib, nal, process_id, @@ -1690,7 +1690,7 @@ kibnal_module_init (void) } /* Pure gateways want the NAL started up at module load time... */ - rc = PtlNIInit(IIBNAL, 0, NULL, NULL, &kibnal_ni); + rc = PtlNIInit(IIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); if (rc != PTL_OK && rc != PTL_IFACE_DUP) { ptl_unregister_nal(IIBNAL); return (-ENODEV); diff --git a/lnet/klnds/iiblnd/iiblnd.h b/lnet/klnds/iiblnd/iiblnd.h index 324215838a5765a860995bca816ca1b6acf9baa9..4f0454291bccde4f3b6c112dfcbb1d7aa9eff9ba 100644 --- a/lnet/klnds/iiblnd/iiblnd.h +++ b/lnet/klnds/iiblnd/iiblnd.h @@ -247,7 +247,6 @@ typedef struct typedef struct { - __u32 rd_key; /* remote key */ __u32 rd_nob; /* # of bytes */ __u64 rd_addr; /* remote io vaddr */ } kib_rdma_desc_t __attribute__((packed)); @@ -267,6 +266,7 @@ typedef struct ptl_hdr_t ibrm_hdr; /* portals header */ __u64 ibrm_cookie; /* opaque completion cookie */ __u32 ibrm_num_descs; /* how many descs */ + __u32 rd_key; /* remote key */ kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */ } kib_rdma_msg_t __attribute__((packed)); @@ -317,7 +317,7 @@ typedef struct kib_rx /* receive message */ __u64 rx_vaddr; /* pre-mapped buffer (hca vaddr) */ kib_msg_t *rx_msg; /* pre-mapped buffer (host vaddr) */ IB_WORK_REQ rx_wrq; - IB_LOCAL_DATASEGMENT rx_gl; /* and it's memory */ + IB_LOCAL_DATASEGMENT rx_gl; /* and its memory */ } kib_rx_t; typedef struct kib_tx /* transmit message */ @@ -370,7 +370,6 @@ typedef struct kib_connreq IB_PATH_RECORD cr_path; CM_REQUEST_INFO cr_cmreq; CM_CONN_INFO cr_discarded; - CM_REJECT_INFO cr_rej_info; } kib_connreq_t; typedef struct kib_conn diff --git a/lnet/klnds/iiblnd/iiblnd_cb.c b/lnet/klnds/iiblnd/iiblnd_cb.c index a827ba588e758b44f80735941ef188bbe45f87d6..16ed9379a04da677aa18a15191e3cf7ea70828bf 100644 --- a/lnet/klnds/iiblnd/iiblnd_cb.c +++ b/lnet/klnds/iiblnd/iiblnd_cb.c @@ -485,17 +485,20 @@ kibnal_rx_callback (IB_WORK_COMPLETION *wc) goto failed; } + if (flipped) { + __swab32(msg->ibm_u.rdma.rd_key); + } + for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) { kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i]; if (flipped) { - __swab32(desc->rd_key); __swab32(desc->rd_nob); __swab64(desc->rd_addr); } CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n", - desc->rd_key, desc->rd_addr, desc->rd_nob); + msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob); } break; @@ -628,9 +631,9 @@ kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset, desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs]; if (active) - desc->rd_key = kibnal_data.kib_md.md_lkey; + ibrm->rd_key = kibnal_data.kib_md.md_lkey; else - desc->rd_key = kibnal_data.kib_md.md_rkey; + ibrm->rd_key = kibnal_data.kib_md.md_rkey; desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */ desc->rd_addr = kibnal_page2phys(page) + page_offset + kibnal_data.kib_md.md_addr; @@ 
-845,7 +848,7 @@ kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access, tx->tx_mapped = KIB_TX_MAPPED; #endif } else { - CERROR ("Can't map phys: %d\n", rc); + CERROR ("Can't map phys: %d\n", frc); rc = -EFAULT; } @@ -1090,6 +1093,10 @@ kibnal_ca_callback (void *ca_arg, void *cq_arg) for(;;) { while (iibt_cq_poll(cq, &wc) == FSUCCESS) { + + /* We will need to rearm the CQ to avoid a potential race. */ + armed = 0; + if (kibnal_wreqid_is_rx(wc.WorkReqId)) kibnal_rx_callback(&wc); else @@ -1306,7 +1313,7 @@ kibnal_start_passive_rdma (int type, ptl_nid_t nid, ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; /* map_kiov alrady filled the rdma descs for the whole_mem case */ if (!kibnal_whole_mem()) { - ibmsg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_rkey; + ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey; ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; ibmsg->ibm_u.rdma.ibrm_num_descs = 1; @@ -1408,7 +1415,7 @@ kibnal_start_active_rdma (int type, int status, } if (!kibnal_whole_mem()) { - tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_key = tx->tx_md.md_lkey; + tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey; tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1; @@ -1439,7 +1446,7 @@ kibnal_start_active_rdma (int type, int status, ds->Address = ldesc->rd_addr; ds->Length = ldesc->rd_nob; - ds->Lkey = ldesc->rd_key; + ds->Lkey = tx->tx_msg->ibm_u.rdma.rd_key; memset(wrq, 0, sizeof(*wrq)); wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0); @@ -1453,7 +1460,7 @@ kibnal_start_active_rdma (int type, int status, wrq->Req.SendRC.Options.s.ImmediateData = 0; wrq->Req.SendRC.Options.s.Fence = 0; wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr; - wrq->Req.SendRC.RemoteDS.Rkey = rdesc->rd_key; + wrq->Req.SendRC.RemoteDS.Rkey = rxmsg->ibm_u.rdma.rd_key; /* only the last rdma post triggers tx completion */ if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1) @@ -2394,7 +2401,9 @@ kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg) PORTAL_ALLOC(rep, sizeof(*rep)); PORTAL_ALLOC(rcv, sizeof(*rcv)); if (rep == NULL || rcv == NULL) { - CERROR ("can't reply and receive buffers\n"); + if (rep) PORTAL_FREE(rep, sizeof(*rep)); + if (rcv) PORTAL_FREE(rcv, sizeof(*rcv)); + CERROR ("can't allocate reply and receive buffers\n"); GOTO(out, reason = RC_INSUFFICIENT_RESP_RES); } diff --git a/lnet/klnds/ralnd/ralnd.c b/lnet/klnds/ralnd/ralnd.c index 2873caa0a8c7787411b19458b7c9210350e7e7e7..02c33630e73108bf088ab9eae4939167f0bc71ed 100644 --- a/lnet/klnds/ralnd/ralnd.c +++ b/lnet/klnds/ralnd/ralnd.c @@ -696,7 +696,7 @@ kranal_active_conn_handshake(kra_peer_t *peer, kra_conn_t **connp) /* spread connections over all devices using both peer NIDs to ensure * all nids use all devices */ - idx = (peer->rap_nid + kranal_lib.libnal_ni.ni_pid.nid) + idx = peer->rap_nid + kranal_lib.libnal_ni.ni_pid.nid; dev = &kranal_data.kra_devices[idx % kranal_data.kra_ndevs]; rc = kranal_create_conn(&conn, dev); @@ -1550,7 +1550,7 @@ kranal_cmd(struct portals_cfg *pcfg, void * private) else { rc = 0; pcfg->pcfg_nid = conn->rac_peer->rap_nid; - pcfg->pcfg_id = 0; + pcfg->pcfg_id = conn->rac_device->rad_id; pcfg->pcfg_misc = 0; pcfg->pcfg_flags = 0; kranal_conn_decref(conn); diff --git a/lnet/klnds/viblnd/.cvsignore b/lnet/klnds/viblnd/.cvsignore new file mode 100644 index 0000000000000000000000000000000000000000..5ed596bbf5a8bc84d4ce3514700a939431df4da6 --- /dev/null +++ 
b/lnet/klnds/viblnd/.cvsignore @@ -0,0 +1,10 @@ +.deps +Makefile +.*.cmd +autoMakefile.in +autoMakefile +*.ko +*.mod.c +.*.flags +.tmp_versions +.depend diff --git a/lnet/klnds/viblnd/Makefile.in b/lnet/klnds/viblnd/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..fd7bb0575a2656e705a43a4da0b32509a3ecc357 --- /dev/null +++ b/lnet/klnds/viblnd/Makefile.in @@ -0,0 +1,6 @@ +MODULES := kvibnal +kvibnal-objs := vibnal.o vibnal_cb.o vibnal_sa.o + +EXTRA_POST_CFLAGS := @VIBCPPFLAGS@ + +@INCLUDE_RULES@ diff --git a/lnet/klnds/viblnd/Makefile.mk b/lnet/klnds/viblnd/Makefile.mk new file mode 100644 index 0000000000000000000000000000000000000000..d08633a82a5b3d08f89bc6f290eb098adf31a44f --- /dev/null +++ b/lnet/klnds/viblnd/Makefile.mk @@ -0,0 +1,10 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +include $(src)/../../Kernelenv + +obj-y += kvibnal.o +kvibnal-objs := vibnal.o vibnal_cb.o vibnal_sa.o + diff --git a/lnet/klnds/viblnd/autoMakefile.am b/lnet/klnds/viblnd/autoMakefile.am new file mode 100644 index 0000000000000000000000000000000000000000..eb654128a670e17ecfbb60ad17e80fbb40ab04a8 --- /dev/null +++ b/lnet/klnds/viblnd/autoMakefile.am @@ -0,0 +1,15 @@ +# Copyright (C) 2001 Cluster File Systems, Inc. +# +# This code is issued under the GNU General Public License. +# See the file COPYING in this distribution + +if MODULES +if !CRAY_PORTALS +if BUILD_VIBNAL +modulenet_DATA = kvibnal$(KMODEXT) +endif +endif +endif + +MOSTLYCLEANFILES = *.o *.ko *.mod.c +DIST_SOURCES = $(kvibnal-objs:%.o=%.c) vibnal.h diff --git a/lnet/klnds/viblnd/viblnd.c b/lnet/klnds/viblnd/viblnd.c new file mode 100644 index 0000000000000000000000000000000000000000..50e1149e48708f1155e4392026df7ef8f759fdc5 --- /dev/null +++ b/lnet/klnds/viblnd/viblnd.c @@ -0,0 +1,1693 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton <eric@bartonsoftware.com> + * Author: Frank Zago <fzago@systemfabricworks.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "vibnal.h" + +nal_t kibnal_api; +ptl_handle_ni_t kibnal_ni; +kib_tunables_t kibnal_tunables; + +kib_data_t kibnal_data = { + .kib_service_id = IBNAL_SERVICE_NUMBER, +}; + +#ifdef CONFIG_SYSCTL +#define IBNAL_SYSCTL 202 + +#define IBNAL_SYSCTL_TIMEOUT 1 + +static ctl_table kibnal_ctl_table[] = { + {IBNAL_SYSCTL_TIMEOUT, "timeout", + &kibnal_tunables.kib_io_timeout, sizeof (int), + 0644, NULL, &proc_dointvec}, + { 0 } +}; + +static ctl_table kibnal_top_ctl_table[] = { + {IBNAL_SYSCTL, "vibnal", NULL, 0, 0555, kibnal_ctl_table}, + { 0 } +}; +#endif + +#ifdef unused +void +print_service(IB_SERVICE_RECORD *service, char *tag, int rc) +{ + char name[32]; + + if (service == NULL) + { + CWARN("tag : %s\n" + "status : %d (NULL)\n", tag, rc); + return; + } + strncpy (name, service->ServiceName, sizeof(name)-1); + name[sizeof(name)-1] = 0; + + CWARN("tag : %s\n" + "status : %d\n" + "service id: "LPX64"\n" + "name : %s\n" + "NID : "LPX64"\n", tag, rc, + service->RID.ServiceID, name, + *kibnal_service_nid_field(service)); +} +#endif + +/* + * method is SUBN_ADM_SET, SUBN_ADM_GET, SUBN_ADM_DELETE. Tables not supported. + * nid is the nid to advertize/query/unadvertize + */ +static void fill_sa_request(struct sa_request *request, int method, ptl_nid_t nid) +{ + gsi_dtgrm_t *dtgrm = request->dtgrm_req; + sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad; + ib_service_record_v2_t *sr = (ib_service_record_v2_t *) mad->payload; + + memset(mad, 0, MAD_BLOCK_SIZE); + + request->mad = mad; + + dtgrm->rlid = kibnal_data.kib_port_attr.port_sma_address_info.sm_lid; + dtgrm->sl = kibnal_data.kib_port_attr.port_sma_address_info.service_level; + + mad->hdr.base_ver = MAD_IB_BASE_VERSION; + mad->hdr.class = MAD_CLASS_SUBN_ADM; + mad->hdr.class_ver = 2; + mad->hdr.m.ms.method = method; + mad->hdr.attrib_id = SA_SERVICE_RECORD; /* something(?) will swap that field */ + + /* Note: the transaction ID is set by the Voltaire stack if it is 0. */ + + /* TODO: change the 40 to sizeof(something) */ + mad->payload_len = cpu_to_be32(0x40 /*header size */ + + sizeof (ib_service_record_v2_t)); + + + mad->component_mask = cpu_to_be64( + (1ull << 0) | /* service_id */ + (1ull << 2) | /* service_pkey */ + (1ull << 6) | /* service_name */ + (1ull << 7) | /* service_data8[0] */ + (1ull << 8) | /* service_data8[1] */ + (1ull << 9) | /* service_data8[2] */ + (1ull << 10) | /* service_data8[3] */ + (1ull << 11) | /* service_data8[4] */ + (1ull << 12) | /* service_data8[5] */ + (1ull << 13) | /* service_data8[6] */ + (1ull << 14) /* service_data8[7] */ + ); + + sr->service_id = cpu_to_be64(kibnal_data.kib_service_id); + sr->service_pkey = cpu_to_be16(kibnal_data.kib_port_pkey); + + /* Set the service name and the data (bytes 0 to 7) in data8 */ + kibnal_set_service_keys(sr, nid); + + if (method == SUBN_ADM_SET) { + mad->component_mask |= cpu_to_be64( + (1ull << 1) | /* service_gid */ + (1ull << 4) /* service_lease */ + ); + + sr->service_gid = kibnal_data.kib_port_gid; + gid_swap(&sr->service_gid); + sr->service_lease = cpu_to_be32(0xffffffff); + } + + CDEBUG(D_NET, "SA request %02x for service id "LPX64" %s:"LPX64"\n", + mad->hdr.m.ms.method, + sr->service_id, + sr->service_name, + *kibnal_service_nid_field(sr)); +} + +/* Do an advertizement operation: + * SUBN_ADM_GET = 0x01 (i.e. query), + * SUBN_ADM_SET = 0x02 (i.e. advertize), + * SUBN_ADM_DELETE = 0x15 (i.e. un-advertize). + * If callback is NULL, the function is synchronous (and context is ignored). 
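+ *
+ * Example (synchronous use, as in kibnal_set_mynid() below):
+ *   rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_SET, NULL, NULL);
+ * blocks until the SA reply arrives and returns its status.  With a
+ * non-NULL callback the call returns immediately and the callback is
+ * responsible for freeing the SA request when it fires.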
+ */ +int kibnal_advertize_op(ptl_nid_t nid, int op, sa_request_cb_t callback, void *context) +{ + struct sa_request *request; + int ret; + + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + CDEBUG(D_NET, "kibnal_advertize_op: nid="LPX64", op=%d\n", nid, op); + + request = alloc_sa_request(); + if (request == NULL) { + CERROR("Cannot allocate a SA request"); + return -ENOMEM; + } + + fill_sa_request(request, op, nid); + + if (callback) { + request->callback = callback; + request->context = context; + } else { + init_completion(&request->signal); + } + + ret = vibnal_start_sa_request(request); + if (ret) { + CERROR("vibnal_send_sa failed: %d\n", ret); + free_sa_request(request); + } else { + if (callback) { + /* Return. The callback will have to free the SA request. */ + ret = 0; + } else { + wait_for_completion(&request->signal); + + ret = request->status; + + if (ret != 0) { + CERROR ("Error %d in advertising operation %d for NID "LPX64"\n", + ret, op, kibnal_data.kib_nid); + } + + free_sa_request(request); + } + } + + return ret; +} + +static int +kibnal_set_mynid(ptl_nid_t nid) +{ + struct timeval tv; + lib_ni_t *ni = &kibnal_lib.libnal_ni; + int rc; + vv_return_t retval; + + CDEBUG(D_IOCTL, "setting mynid to "LPX64" (old nid="LPX64")\n", + nid, ni->ni_pid.nid); + + do_gettimeofday(&tv); + + down (&kibnal_data.kib_nid_mutex); + + if (nid == kibnal_data.kib_nid) { + /* no change of NID */ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + CDEBUG(D_NET, "NID "LPX64"("LPX64")\n", + kibnal_data.kib_nid, nid); + + /* Unsubscribes the current NID */ + if (kibnal_data.kib_nid != PTL_NID_ANY) { + + rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL); + + if (rc) { + CERROR("Error %d unadvertising NID "LPX64"\n", + rc, kibnal_data.kib_nid); + } + } + + kibnal_data.kib_nid = ni->ni_pid.nid = nid; + kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec; + + /* Destroys the current endpoint, if any. */ + if (kibnal_data.kib_cep) { + retval = cm_cancel(kibnal_data.kib_cep); + if (retval) + CERROR ("Error %d stopping listener\n", retval); + + retval = cm_destroy_cep(kibnal_data.kib_cep); + if (retval) + CERROR ("Error %d destroying CEP\n", retval); + + kibnal_data.kib_cep = NULL; + } + + /* Delete all existing peers and their connections after new + * NID/incarnation set to ensure no old connections in our brave + * new world. */ + kibnal_del_peer (PTL_NID_ANY, 0); + + if (kibnal_data.kib_nid == PTL_NID_ANY) { + /* No new NID to install. The driver is shuting down. 
*/ + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + /* remove any previous advert (crashed node etc) */ + kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_DELETE, NULL, NULL); + + kibnal_data.kib_cep = cm_create_cep(cm_cep_transp_rc); + if (kibnal_data.kib_cep == NULL) { + CERROR ("Can't create CEP\n"); + rc = -ENOMEM; + } else { + cm_return_t cmret; + cm_listen_data_t info; + + CDEBUG(D_NET, "Created CEP %p for listening\n", kibnal_data.kib_cep); + + memset(&info, 0, sizeof(info)); + info.listen_addr.end_pt.sid = kibnal_data.kib_service_id; + + cmret = cm_listen(kibnal_data.kib_cep, &info, + kibnal_listen_callback, NULL); + if (cmret) { + CERROR ("cm_listen error: %d\n", cmret); + rc = -EINVAL; + } else { + rc = 0; + } + } + + if (rc == 0) { + rc = kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_SET, NULL, NULL); + if (rc == 0) { +#ifdef IBNAL_CHECK_ADVERT + kibnal_advertize_op(kibnal_data.kib_nid, SUBN_ADM_GET, NULL, NULL); +#endif + up (&kibnal_data.kib_nid_mutex); + return (0); + } + + retval = cm_cancel (kibnal_data.kib_cep); + if (retval) + CERROR("cm_cancel failed: %d\n", retval); + + retval = cm_destroy_cep (kibnal_data.kib_cep); + if (retval) + CERROR("cm_destroy_cep failed: %d\n", retval); + + /* remove any peers that sprung up while I failed to + * advertise myself */ + kibnal_del_peer (PTL_NID_ANY, 0); + } + + kibnal_data.kib_nid = PTL_NID_ANY; + up (&kibnal_data.kib_nid_mutex); + return (rc); +} + +kib_peer_t * +kibnal_create_peer (ptl_nid_t nid) +{ + kib_peer_t *peer; + + LASSERT (nid != PTL_NID_ANY); + + PORTAL_ALLOC(peer, sizeof (*peer)); + if (peer == NULL) { + CERROR("Canot allocate perr\n"); + return (NULL); + } + + memset(peer, 0, sizeof(*peer)); /* zero flags etc */ + + peer->ibp_nid = nid; + atomic_set (&peer->ibp_refcount, 1); /* 1 ref for caller */ + + INIT_LIST_HEAD (&peer->ibp_list); /* not in the peer table yet */ + INIT_LIST_HEAD (&peer->ibp_conns); + INIT_LIST_HEAD (&peer->ibp_tx_queue); + + peer->ibp_reconnect_time = jiffies; + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + + atomic_inc (&kibnal_data.kib_npeers); + return (peer); +} + +void +kibnal_destroy_peer (kib_peer_t *peer) +{ + + LASSERT (atomic_read (&peer->ibp_refcount) == 0); + LASSERT (peer->ibp_persistence == 0); + LASSERT (!kibnal_peer_active(peer)); + LASSERT (peer->ibp_connecting == 0); + LASSERT (list_empty (&peer->ibp_conns)); + LASSERT (list_empty (&peer->ibp_tx_queue)); + + PORTAL_FREE (peer, sizeof (*peer)); + + /* NB a peer's connections keep a reference on their peer until + * they are destroyed, so we can be assured that _all_ state to do + * with this peer has been cleaned up when its refcount drops to + * zero. 
*/ + atomic_dec (&kibnal_data.kib_npeers); +} + +/* the caller is responsible for accounting for the additional reference + * that this creates */ +kib_peer_t * +kibnal_find_peer_locked (ptl_nid_t nid) +{ + struct list_head *peer_list = kibnal_nid2peerlist (nid); + struct list_head *tmp; + kib_peer_t *peer; + + list_for_each (tmp, peer_list) { + + peer = list_entry (tmp, kib_peer_t, ibp_list); + + LASSERT (peer->ibp_persistence != 0 || /* persistent peer */ + peer->ibp_connecting != 0 || /* creating conns */ + !list_empty (&peer->ibp_conns)); /* active conn */ + + if (peer->ibp_nid != nid) + continue; + + CDEBUG(D_NET, "got peer [%p] -> "LPX64" (%d)\n", + peer, nid, atomic_read (&peer->ibp_refcount)); + return (peer); + } + return (NULL); +} + +kib_peer_t * +kibnal_get_peer (ptl_nid_t nid) +{ + kib_peer_t *peer; + + read_lock (&kibnal_data.kib_global_lock); + peer = kibnal_find_peer_locked (nid); + if (peer != NULL) /* +1 ref for caller? */ + kib_peer_addref(peer); + read_unlock (&kibnal_data.kib_global_lock); + + return (peer); +} + +void +kibnal_unlink_peer_locked (kib_peer_t *peer) +{ + LASSERT (peer->ibp_persistence == 0); + LASSERT (list_empty(&peer->ibp_conns)); + + LASSERT (kibnal_peer_active(peer)); + list_del_init (&peer->ibp_list); + /* lose peerlist's ref */ + kib_peer_decref(peer); +} + +static int +kibnal_get_peer_info (int index, ptl_nid_t *nidp, int *persistencep) +{ + kib_peer_t *peer; + struct list_head *ptmp; + int i; + + read_lock (&kibnal_data.kib_global_lock); + + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (index-- > 0) + continue; + + *nidp = peer->ibp_nid; + *persistencep = peer->ibp_persistence; + + read_unlock (&kibnal_data.kib_global_lock); + return (0); + } + } + + read_unlock (&kibnal_data.kib_global_lock); + return (-ENOENT); +} + +static int +kibnal_add_persistent_peer (ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + kib_peer_t *peer2; + + if (nid == PTL_NID_ANY) + return (-EINVAL); + + peer = kibnal_create_peer (nid); + if (peer == NULL) + return (-ENOMEM); + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + peer2 = kibnal_find_peer_locked (nid); + if (peer2 != NULL) { + kib_peer_decref (peer); + peer = peer2; + } else { + /* peer table takes existing ref on peer */ + list_add_tail (&peer->ibp_list, + kibnal_nid2peerlist (nid)); + } + + peer->ibp_persistence++; + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return (0); +} + +static void +kibnal_del_peer_locked (kib_peer_t *peer, int single_share) +{ + struct list_head *ctmp; + struct list_head *cnxt; + kib_conn_t *conn; + + if (!single_share) + peer->ibp_persistence = 0; + else if (peer->ibp_persistence > 0) + peer->ibp_persistence--; + + if (peer->ibp_persistence != 0) + return; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry(ctmp, kib_conn_t, ibc_list); + + kibnal_close_conn_locked (conn, 0); + } + + /* NB peer unlinks itself when last conn is closed */ +} + +int +kibnal_del_peer (ptl_nid_t nid, int single_share) +{ + unsigned long flags; + struct list_head *ptmp; + struct list_head *pnxt; + kib_peer_t *peer; + int lo; + int hi; + int i; + int rc = -ENOENT; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = kibnal_nid2peerlist(nid) - 
kibnal_data.kib_peers; + else { + lo = 0; + hi = kibnal_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (!(nid == PTL_NID_ANY || peer->ibp_nid == nid)) + continue; + + kibnal_del_peer_locked (peer, single_share); + rc = 0; /* matched something */ + + if (single_share) + goto out; + } + } + out: + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + return (rc); +} + +static kib_conn_t * +kibnal_get_conn_by_idx (int index) +{ + kib_peer_t *peer; + struct list_head *ptmp; + kib_conn_t *conn; + struct list_head *ctmp; + int i; + + read_lock (&kibnal_data.kib_global_lock); + + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + list_for_each (ptmp, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence > 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + list_for_each (ctmp, &peer->ibp_conns) { + if (index-- > 0) + continue; + + conn = list_entry (ctmp, kib_conn_t, ibc_list); + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + read_unlock (&kibnal_data.kib_global_lock); + return (conn); + } + } + } + + read_unlock (&kibnal_data.kib_global_lock); + return (NULL); +} + +kib_conn_t * +kibnal_create_conn (void) +{ + kib_conn_t *conn; + int i; + __u64 vaddr = 0; + __u64 vaddr_base; + int page_offset; + int ipage; + vv_qp_attr_t qp_attr; + vv_return_t retval; + int rc; + void *qp_context; + + PORTAL_ALLOC(conn, sizeof (*conn)); + if (conn == NULL) { + CERROR ("Can't allocate connection\n"); + return (NULL); + } + + /* zero flags, NULL pointers etc... */ + memset (conn, 0, sizeof (*conn)); + + INIT_LIST_HEAD (&conn->ibc_tx_queue); + INIT_LIST_HEAD (&conn->ibc_active_txs); + spin_lock_init (&conn->ibc_lock); + + atomic_inc (&kibnal_data.kib_nconns); + /* well not really, but I call destroy() on failure, which decrements */ + + PORTAL_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t)); + if (conn->ibc_rxs == NULL) { + CERROR("Cannot allocate RX buffers\n"); + goto failed; + } + memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t)); + + rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1); + if (rc != 0) + goto failed; + + vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr; + + for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) { + struct page *page = conn->ibc_rx_pages->ibp_pages[ipage]; + kib_rx_t *rx = &conn->ibc_rxs[i]; + + rx->rx_conn = conn; + rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + if (kibnal_whole_mem()) { + void *newaddr; + vv_mem_reg_h_t mem_h; + vv_r_key_t r_key; + + /* Voltaire stack already registers the whole + * memory, so use that API. */ + retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca, + rx->rx_msg, + IBNAL_MSG_SIZE, + &mem_h, + &rx->l_key, + &r_key); + if (retval) { + CERROR("vv_get_gen_mr_attrib failed: %d", retval); + /* TODO: free pages? 
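+                                 * (the failed: path below calls
+                                 * kibnal_destroy_conn(), which already frees
+                                 * ibc_rx_pages and ibc_rxs, so no extra
+                                 * cleanup appears to be needed here)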
*/ + goto failed; + } + } + + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES); + + page_offset += IBNAL_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBNAL_RX_MSG_PAGES); + } + } + + qp_attr = (vv_qp_attr_t) { + .create.qp_type = vv_qp_type_r_conn, + .create.cq_send_h = kibnal_data.kib_cq, + .create.cq_receive_h = kibnal_data.kib_cq, + .create.send_max_outstand_wr = IBNAL_TX_MAX_SG * + IBNAL_MSG_QUEUE_SIZE, + .create.receive_max_outstand_wr = IBNAL_MSG_QUEUE_SIZE, + .create.max_scatgat_per_send_wr = 1, + .create.max_scatgat_per_receive_wr = 1, + .create.signaling_type = vv_selectable_signaling, /* TODO: correct? */ + .create.pd_h = kibnal_data.kib_pd, + .create.recv_solicited_events = vv_signal_all, + }; + retval = vv_qp_create(kibnal_data.kib_hca, &qp_attr, NULL, + &conn->ibc_qp, &conn->ibc_qp_attrs); + if (retval != 0) { + CERROR ("Failed to create queue pair: %d\n", retval); + goto failed; + } + + /* Mark QP created */ + conn->ibc_state = IBNAL_CONN_INIT_QP; + + qp_attr = (vv_qp_attr_t) { + .modify.qp_modify_into_state = vv_qp_state_init, + .modify.vv_qp_attr_mask = VV_QP_AT_STATE | VV_QP_AT_PHY_PORT_NUM | VV_QP_AT_P_KEY_IX | VV_QP_AT_ACCESS_CON_F, + .modify.qp_type = vv_qp_type_r_conn, + + .modify.params.init.p_key_indx = 0, + .modify.params.init.phy_port_num = kibnal_data.kib_port, + .modify.params.init.access_control = vv_acc_r_mem_write | vv_acc_r_mem_read, + }; + retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs); + if (retval != 0) { + CERROR ("Failed to modify queue pair: %d\n", retval); + goto failed; + } + + retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs); + if (retval) { + CERROR ("Failed to query queue pair: %d\n", retval); + goto failed; + } + + /* 1 ref for caller */ + atomic_set (&conn->ibc_refcount, 1); + return (conn); + + failed: + kibnal_destroy_conn (conn); + return (NULL); +} + +void +kibnal_destroy_conn (kib_conn_t *conn) +{ + vv_return_t retval; + + CDEBUG (D_NET, "connection %p\n", conn); + + LASSERT (atomic_read (&conn->ibc_refcount) == 0); + LASSERT (list_empty(&conn->ibc_tx_queue)); + LASSERT (list_empty(&conn->ibc_active_txs)); + LASSERT (conn->ibc_nsends_posted == 0); + LASSERT (conn->ibc_connreq == NULL); + + switch (conn->ibc_state) { + case IBNAL_CONN_DISCONNECTED: + /* called after connection sequence initiated */ + /* fall through */ + + case IBNAL_CONN_INIT_QP: + /* _destroy includes an implicit Reset of the QP which + * discards posted work */ + retval = vv_qp_destroy(kibnal_data.kib_hca, conn->ibc_qp); + if (retval) + CERROR("Can't destroy QP: %d\n", retval); + /* fall through */ + + case IBNAL_CONN_INIT_NOTHING: + break; + + default: + LASSERT (0); + } + + if (conn->ibc_cep != NULL) { + retval = cm_destroy_cep(conn->ibc_cep); + if (retval) + CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep, + retval); + } + + if (conn->ibc_rx_pages != NULL) + kibnal_free_pages(conn->ibc_rx_pages); + + if (conn->ibc_rxs != NULL) + PORTAL_FREE(conn->ibc_rxs, + IBNAL_RX_MSGS * sizeof(kib_rx_t)); + + if (conn->ibc_peer != NULL) + kib_peer_decref(conn->ibc_peer); + + PORTAL_FREE(conn, sizeof (*conn)); + + atomic_dec(&kibnal_data.kib_nconns); + + if (atomic_read (&kibnal_data.kib_nconns) == 0 && + kibnal_data.kib_shutdown) { + /* I just nuked the last connection on shutdown; wake up + * everyone so they can exit. 
*/ + wake_up_all(&kibnal_data.kib_sched_waitq); + wake_up_all(&kibnal_data.kib_connd_waitq); + } +} + +void +kibnal_put_conn (kib_conn_t *conn) +{ + unsigned long flags; + + CDEBUG (D_NET, "putting conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + + LASSERT (atomic_read (&conn->ibc_refcount) > 0); + if (!atomic_dec_and_test (&conn->ibc_refcount)) + return; + + /* must disconnect before dropping the final ref */ + LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + + list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); +} + +static int +kibnal_close_peer_conns_locked (kib_peer_t *peer, int why) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + count++; + kibnal_close_conn_locked (conn, why); + } + + return (count); +} + +int +kibnal_close_stale_conns_locked (kib_peer_t *peer, __u64 incarnation) +{ + kib_conn_t *conn; + struct list_head *ctmp; + struct list_head *cnxt; + int count = 0; + + list_for_each_safe (ctmp, cnxt, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + if (conn->ibc_incarnation == incarnation) + continue; + + CDEBUG(D_NET, "Closing stale conn nid:"LPX64" incarnation:"LPX64"("LPX64")\n", + peer->ibp_nid, conn->ibc_incarnation, incarnation); + + count++; + kibnal_close_conn_locked (conn, -ESTALE); + } + + return (count); +} + +static int +kibnal_close_matching_conns (ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + struct list_head *ptmp; + struct list_head *pnxt; + int lo; + int hi; + int i; + int count = 0; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + if (nid != PTL_NID_ANY) + lo = hi = kibnal_nid2peerlist(nid) - kibnal_data.kib_peers; + else { + lo = 0; + hi = kibnal_data.kib_peer_hash_size - 1; + } + + for (i = lo; i <= hi; i++) { + list_for_each_safe (ptmp, pnxt, &kibnal_data.kib_peers[i]) { + + peer = list_entry (ptmp, kib_peer_t, ibp_list); + LASSERT (peer->ibp_persistence != 0 || + peer->ibp_connecting != 0 || + !list_empty (&peer->ibp_conns)); + + if (!(nid == PTL_NID_ANY || nid == peer->ibp_nid)) + continue; + + count += kibnal_close_peer_conns_locked (peer, 0); + } + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + /* wildcards always succeed */ + if (nid == PTL_NID_ANY) + return (0); + + return (count == 0 ? 
-ENOENT : 0); +} + +static int +kibnal_cmd(struct portals_cfg *pcfg, void * private) +{ + int rc = -EINVAL; + ENTRY; + + LASSERT (pcfg != NULL); + + switch(pcfg->pcfg_command) { + case NAL_CMD_GET_PEER: { + ptl_nid_t nid = 0; + int share_count = 0; + + rc = kibnal_get_peer_info(pcfg->pcfg_count, + &nid, &share_count); + pcfg->pcfg_nid = nid; + pcfg->pcfg_size = 0; + pcfg->pcfg_id = 0; + pcfg->pcfg_misc = 0; + pcfg->pcfg_count = 0; + pcfg->pcfg_wait = share_count; + break; + } + case NAL_CMD_ADD_PEER: { + rc = kibnal_add_persistent_peer (pcfg->pcfg_nid); + break; + } + case NAL_CMD_DEL_PEER: { + rc = kibnal_del_peer (pcfg->pcfg_nid, + /* flags == single_share */ + pcfg->pcfg_flags != 0); + break; + } + case NAL_CMD_GET_CONN: { + kib_conn_t *conn = kibnal_get_conn_by_idx (pcfg->pcfg_count); + + if (conn == NULL) + rc = -ENOENT; + else { + rc = 0; + pcfg->pcfg_nid = conn->ibc_peer->ibp_nid; + pcfg->pcfg_id = 0; + pcfg->pcfg_misc = 0; + pcfg->pcfg_flags = 0; + kibnal_put_conn (conn); + } + break; + } + case NAL_CMD_CLOSE_CONNECTION: { + rc = kibnal_close_matching_conns (pcfg->pcfg_nid); + break; + } + case NAL_CMD_REGISTER_MYNID: { + if (pcfg->pcfg_nid == PTL_NID_ANY) + rc = -EINVAL; + else + rc = kibnal_set_mynid (pcfg->pcfg_nid); + break; + } + } + + RETURN(rc); +} + +void +kibnal_free_pages (kib_pages_t *p) +{ + int npages = p->ibp_npages; + vv_return_t retval; + int i; + + if (p->ibp_mapped) { + retval = vv_mem_region_destroy(kibnal_data.kib_hca, p->ibp_handle); + if (retval != 0) + CERROR ("Deregister error: %d\n", retval); + } + + for (i = 0; i < npages; i++) + if (p->ibp_pages[i] != NULL) + __free_page(p->ibp_pages[i]); + + PORTAL_FREE (p, offsetof(kib_pages_t, ibp_pages[npages])); +} + +int +kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write) +{ + kib_pages_t *p; + vv_phy_list_t phys_pages; + vv_phy_buf_t *phys_buf; + int i; + vv_return_t retval; + + PORTAL_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages])); + if (p == NULL) { + CERROR ("Can't allocate buffer %d\n", npages); + return (-ENOMEM); + } + + memset (p, 0, offsetof(kib_pages_t, ibp_pages[npages])); + p->ibp_npages = npages; + + for (i = 0; i < npages; i++) { + p->ibp_pages[i] = alloc_page (GFP_KERNEL); + if (p->ibp_pages[i] == NULL) { + CERROR ("Can't allocate page %d of %d\n", i, npages); + kibnal_free_pages(p); + return (-ENOMEM); + } + } + + if (kibnal_whole_mem()) + goto out; + + PORTAL_ALLOC(phys_buf, npages * sizeof(vv_phy_buf_t)); + if (phys_buf == NULL) { + CERROR ("Can't allocate phys_buf for %d pages\n", npages); + /* XXX free ibp_pages? 
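+                 * (kibnal_free_pages() on the next line walks ibp_pages[]
+                 * and calls __free_page() on each entry, so they are
+                 * released)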
*/ + kibnal_free_pages(p); + return (-ENOMEM); + } + + phys_pages.number_of_buff = npages; + phys_pages.phy_list = phys_buf; + + /* if we were using the _contig_ registration variant we would have + * an array of PhysAddr/Length pairs, but the discontiguous variant + * just takes the PhysAddr */ + for (i = 0; i < npages; i++) { + phys_buf[i].start = kibnal_page2phys(p->ibp_pages[i]); + phys_buf[i].size = PAGE_SIZE; + } + + retval = vv_phy_mem_region_register(kibnal_data.kib_hca, + &phys_pages, + 0, /* requested vaddr */ + npages * PAGE_SIZE, + 0, /* offset */ + kibnal_data.kib_pd, + vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */ + &p->ibp_handle, &p->ibp_vaddr, + &p->ibp_lkey, &p->ibp_rkey); + + PORTAL_FREE(phys_buf, npages * sizeof(vv_phy_buf_t)); + + if (retval) { + CERROR ("Error %d mapping %d pages\n", retval, npages); + kibnal_free_pages(p); + return (-ENOMEM); + } + + CDEBUG(D_NET, "registered %d pages; handle: %x vaddr "LPX64" " + "lkey %x rkey %x\n", npages, p->ibp_handle, + p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey); + + p->ibp_mapped = 1; +out: + *pp = p; + return (0); +} + +static int +kibnal_setup_tx_descs (void) +{ + int ipage = 0; + int page_offset = 0; + __u64 vaddr; + __u64 vaddr_base; + struct page *page; + kib_tx_t *tx; + int i; + int rc; + + /* pre-mapped messages are not bigger than 1 page */ + LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE); + + /* No fancy arithmetic when we do the buffer calculations */ + LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0); + + rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES, + 0); + if (rc != 0) + return (rc); + + /* ignored for the whole_mem case */ + vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr; + + for (i = 0; i < IBNAL_TX_MSGS; i++) { + page = kibnal_data.kib_tx_pages->ibp_pages[ipage]; + tx = &kibnal_data.kib_tx_descs[i]; + + memset (tx, 0, sizeof(*tx)); /* zero flags etc */ + + tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) + + page_offset); + + if (kibnal_whole_mem()) { + void *newaddr; + vv_mem_reg_h_t mem_h; + vv_return_t retval; + + /* Voltaire stack already registers the whole + * memory, so use that API. */ + retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca, + tx->tx_msg, + IBNAL_MSG_SIZE, + &mem_h, + &tx->l_key, + &tx->r_key); + if (retval) { + CERROR("vv_get_gen_mr_attrib failed: %d", retval); + /* TODO: free pages? */ + /* TODO: return. 
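+                                 * (note: unlike the rx path in
+                                 * kibnal_create_conn(), this failure is not
+                                 * propagated, so the tx descriptor would be
+                                 * left with an unregistered l_key/r_key)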
*/ + } + } + + tx->tx_isnblk = (i >= IBNAL_NTX); + tx->tx_mapped = KIB_TX_UNMAPPED; + + CDEBUG(D_NET, "Tx[%d] %p->%p\n", i, tx, tx->tx_msg); + + if (tx->tx_isnblk) + list_add (&tx->tx_list, + &kibnal_data.kib_idle_nblk_txs); + else + list_add (&tx->tx_list, + &kibnal_data.kib_idle_txs); + + vaddr += IBNAL_MSG_SIZE; + LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES); + + page_offset += IBNAL_MSG_SIZE; + LASSERT (page_offset <= PAGE_SIZE); + + if (page_offset == PAGE_SIZE) { + page_offset = 0; + ipage++; + LASSERT (ipage <= IBNAL_TX_MSG_PAGES); + } + } + + return (0); +} + +static void +kibnal_api_shutdown (nal_t *nal) +{ + int i; + int rc; + vv_return_t retval; + + if (nal->nal_refct != 0) { + /* This module got the first ref */ + PORTAL_MODULE_UNUSE; + return; + } + + CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + + LASSERT(nal == &kibnal_api); + + switch (kibnal_data.kib_init) { + + case IBNAL_INIT_ALL: + /* stop calls to nal_cmd */ + libcfs_nal_cmd_unregister(VIBNAL); + /* No new peers */ + + /* resetting my NID to unadvertises me, removes my + * listener and nukes all current peers */ + kibnal_set_mynid (PTL_NID_ANY); + + /* Wait for all peer state to clean up (crazy) */ + i = 2; + while (atomic_read (&kibnal_data.kib_npeers) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "waiting for %d peers to disconnect (can take a few seconds)\n", + atomic_read (&kibnal_data.kib_npeers)); + set_current_state (TASK_UNINTERRUPTIBLE); + schedule_timeout (HZ); + } + /* fall through */ + + case IBNAL_INIT_CQ: + retval = vv_cq_destroy(kibnal_data.kib_hca, kibnal_data.kib_cq); + if (retval) + CERROR ("Destroy CQ error: %d\n", retval); + /* fall through */ + + case IBNAL_INIT_TXD: + kibnal_free_pages (kibnal_data.kib_tx_pages); + /* fall through */ + +#if IBNAL_FMR + case IBNAL_INIT_FMR: + rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool); + if (rc != 0) + CERROR ("Destroy FMR pool error: %d\n", rc); + /* fall through */ +#endif + case IBNAL_INIT_PD: +#if IBNAL_WHOLE_MEM==0 + retval = vv_pd_deallocate(kibnal_data.kib_hca, kibnal_data.kib_pd); + if (retval != 0) + CERROR ("Destroy PD error: %d\n", retval); +#endif + /* fall through */ + + case IBNAL_INIT_GSI: + retval = gsi_deregister_class(kibnal_data.gsi_handle); + if (retval != 0) + CERROR ("GSI deregister failed: %d\n", retval); + /* fall through */ + + case IBNAL_INIT_GSI_POOL: + gsi_dtgrm_pool_destroy(kibnal_data.gsi_pool_handle); + /* fall through */ + + case IBNAL_INIT_PORT: + /* XXX ??? 
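+                 * (apparently nothing to undo: reaching IBNAL_INIT_PORT
+                 * only queried port attributes and copied the GID/pkey)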
*/ + /* fall through */ + + case IBNAL_INIT_ASYNC: + retval = vv_dell_async_event_cb (kibnal_data.kib_hca, + kibnal_ca_async_callback); + if (retval) + CERROR("deregister asynchronous call back error: %d\n", retval); + + /* fall through */ + + case IBNAL_INIT_HCA: + retval = vv_hca_close(kibnal_data.kib_hca); + if (retval != 0) + CERROR ("Close HCA error: %d\n", retval); + /* fall through */ + + case IBNAL_INIT_LIB: + lib_fini(&kibnal_lib); + /* fall through */ + + case IBNAL_INIT_DATA: + /* Module refcount only gets to zero when all peers + * have been closed so all lists must be empty */ + LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0); + LASSERT (kibnal_data.kib_peers != NULL); + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) { + LASSERT (list_empty (&kibnal_data.kib_peers[i])); + } + LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0); + LASSERT (list_empty (&kibnal_data.kib_sched_rxq)); + LASSERT (list_empty (&kibnal_data.kib_sched_txq)); + LASSERT (list_empty (&kibnal_data.kib_connd_conns)); + LASSERT (list_empty (&kibnal_data.kib_connd_peers)); + + /* flag threads to terminate; wake and wait for them to die */ + kibnal_data.kib_shutdown = 1; + wake_up_all (&kibnal_data.kib_sched_waitq); + wake_up_all (&kibnal_data.kib_connd_waitq); + + i = 2; + while (atomic_read (&kibnal_data.kib_nthreads) != 0) { + i++; + CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */ + "Waiting for %d threads to terminate\n", + atomic_read (&kibnal_data.kib_nthreads)); + set_current_state (TASK_INTERRUPTIBLE); + schedule_timeout (HZ); + } + /* fall through */ + + case IBNAL_INIT_NOTHING: + break; + } + + if (kibnal_data.kib_tx_descs != NULL) + PORTAL_FREE (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + + if (kibnal_data.kib_peers != NULL) + PORTAL_FREE (kibnal_data.kib_peers, + sizeof (struct list_head) * + kibnal_data.kib_peer_hash_size); + + CDEBUG(D_MALLOC, "after NAL cleanup: kmem %d\n", + atomic_read (&portal_kmemory)); + printk(KERN_INFO "Lustre: Voltaire IB NAL unloaded (final mem %d)\n", + atomic_read(&portal_kmemory)); + + kibnal_data.kib_init = IBNAL_INIT_NOTHING; +} + +#define roundup_power(val, power) \ + ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) ) + +/* this isn't very portable or sturdy in the face of funny mem/bus configs */ +static __u64 max_phys_mem(void) +{ + struct sysinfo si; + __u64 ret; + + si_meminfo(&si); + ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit; + return roundup_power(ret, 128 * 1024 * 1024); +} +#undef roundup_power + +static int +kibnal_api_startup (nal_t *nal, ptl_pid_t requested_pid, + ptl_ni_limits_t *requested_limits, + ptl_ni_limits_t *actual_limits) +{ + ptl_process_id_t process_id; + int pkmem = atomic_read(&portal_kmemory); + int rc; + int i; + vv_request_event_record_t req_er; + vv_return_t retval; + + LASSERT (nal == &kibnal_api); + + if (nal->nal_refct != 0) { + if (actual_limits != NULL) + *actual_limits = kibnal_lib.libnal_ni.ni_actual_limits; + /* This module got the first ref */ + PORTAL_MODULE_USE; + return (PTL_OK); + } + + LASSERT (kibnal_data.kib_init == IBNAL_INIT_NOTHING); + + init_MUTEX (&kibnal_data.kib_nid_mutex); + kibnal_data.kib_nid = PTL_NID_ANY; + + rwlock_init(&kibnal_data.kib_global_lock); + + kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE; + PORTAL_ALLOC (kibnal_data.kib_peers, + sizeof (struct list_head) * kibnal_data.kib_peer_hash_size); + if (kibnal_data.kib_peers == NULL) { + goto failed; + } + for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) + 
INIT_LIST_HEAD(&kibnal_data.kib_peers[i]); + + spin_lock_init (&kibnal_data.kib_connd_lock); + INIT_LIST_HEAD (&kibnal_data.kib_connd_peers); + INIT_LIST_HEAD (&kibnal_data.kib_connd_conns); + init_waitqueue_head (&kibnal_data.kib_connd_waitq); + + spin_lock_init (&kibnal_data.kib_sched_lock); + INIT_LIST_HEAD (&kibnal_data.kib_sched_txq); + INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq); + init_waitqueue_head (&kibnal_data.kib_sched_waitq); + + spin_lock_init (&kibnal_data.kib_tx_lock); + INIT_LIST_HEAD (&kibnal_data.kib_idle_txs); + INIT_LIST_HEAD (&kibnal_data.kib_idle_nblk_txs); + init_waitqueue_head(&kibnal_data.kib_idle_tx_waitq); + + INIT_LIST_HEAD (&kibnal_data.gsi_pending); + init_MUTEX (&kibnal_data.gsi_mutex); + + PORTAL_ALLOC (kibnal_data.kib_tx_descs, + IBNAL_TX_MSGS * sizeof(kib_tx_t)); + if (kibnal_data.kib_tx_descs == NULL) { + CERROR ("Can't allocate tx descs\n"); + goto failed; + } + + /* lists/ptrs/locks initialised */ + kibnal_data.kib_init = IBNAL_INIT_DATA; + /*****************************************************/ + + process_id.pid = requested_pid; + process_id.nid = kibnal_data.kib_nid; + + rc = lib_init(&kibnal_lib, nal, process_id, + requested_limits, actual_limits); + if (rc != PTL_OK) { + CERROR("lib_init failed: error %d\n", rc); + goto failed; + } + + /* lib interface initialised */ + kibnal_data.kib_init = IBNAL_INIT_LIB; + /*****************************************************/ + + for (i = 0; i < IBNAL_N_SCHED; i++) { + rc = kibnal_thread_start (kibnal_scheduler, (void *)i); + if (rc != 0) { + CERROR("Can't spawn vibnal scheduler[%d]: %d\n", + i, rc); + goto failed; + } + } + + rc = kibnal_thread_start (kibnal_connd, NULL); + if (rc != 0) { + CERROR ("Can't spawn vibnal connd: %d\n", rc); + goto failed; + } + + /* TODO: apparently only one adapter is supported */ + retval = vv_hca_open("ANY_HCA", NULL, &kibnal_data.kib_hca); + if (retval) { + CERROR ("Can't open CA: %d\n", retval); + goto failed; + } + + /* Channel Adapter opened */ + kibnal_data.kib_init = IBNAL_INIT_HCA; + + /* register to get HCA's asynchronous events. */ + req_er.req_event_type = VV_ASYNC_EVENT_ALL_MASK; + retval = vv_set_async_event_cb (kibnal_data.kib_hca, + req_er, + kibnal_ca_async_callback); + + if (retval) { + CERROR ("Can't open CA: %d\n", retval); + goto failed; + } + + kibnal_data.kib_init = IBNAL_INIT_ASYNC; + + /*****************************************************/ + + retval = vv_hca_query(kibnal_data.kib_hca, + &kibnal_data.kib_hca_attrs); + if (retval) { + CERROR ("Can't size port attrs: %d\n", retval); + goto failed; + } + + kibnal_data.kib_port = -1; + + for (i = 0; i<kibnal_data.kib_hca_attrs.port_num; i++) { + + int port_num = i+1; + u_int32_t tbl_count; + vv_port_attrib_t *pattr = &kibnal_data.kib_port_attr; + + retval = vv_port_query(kibnal_data.kib_hca, port_num, pattr); + if (retval) { + CERROR("vv_port_query failed for port %d: %d\n", port_num, retval); + continue; + } + + switch (pattr->port_state) { + case vv_state_linkDoun: + CDEBUG(D_NET, "port[%d] Down\n", port_num); + continue; + case vv_state_linkInit: + CDEBUG(D_NET, "port[%d] Init\n", port_num); + continue; + case vv_state_linkArm: + CDEBUG(D_NET, "port[%d] Armed\n", port_num); + continue; + case vv_state_linkActive: + CDEBUG(D_NET, "port[%d] Active\n", port_num); + + /* Found a suitable port. Get its GUID and PKEY. 
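+                         * Only the first entry of each table is fetched
+                         * (tbl_count = 1) and the scan stops at the first
+                         * active port.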
*/ + kibnal_data.kib_port = port_num; + + tbl_count = 1; + retval = vv_get_port_gid_tbl(kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_gid); + if (retval) { + CERROR("vv_get_port_gid_tbl failed for port %d: %d\n", port_num, retval); + continue; + } + + tbl_count = 1; + retval = vv_get_port_partition_tbl (kibnal_data.kib_hca, port_num, &tbl_count, &kibnal_data.kib_port_pkey); + if (retval) { + CERROR("vv_get_port_partition_tbl failed for port %d: %d\n", port_num, retval); + continue; + } + + break; + case vv_state_linkActDefer: /* TODO: correct? */ + case vv_state_linkNoChange: + CERROR("Unexpected port[%d] state %d\n", + i, pattr->port_state); + continue; + } + break; + } + + if (kibnal_data.kib_port == -1) { + CERROR ("Can't find an active port\n"); + goto failed; + } + + CDEBUG(D_NET, "Using port %d - GID="LPX64":"LPX64"\n", + kibnal_data.kib_port, kibnal_data.kib_port_gid.scope.g.subnet, kibnal_data.kib_port_gid.scope.g.eui64); + CDEBUG(D_NET, "got guid "LPX64"\n", cpu_to_le64(kibnal_data.kib_port_gid.scope.g.eui64)); + + /* Active port found */ + kibnal_data.kib_init = IBNAL_INIT_PORT; + /*****************************************************/ + + /* Prepare things to be able to send/receive MADS */ + retval = gsi_dtgrm_pool_create(IBNAL_CONCURRENT_PEERS, &kibnal_data.gsi_pool_handle); + if (retval) { + CERROR("Could not create GSI pool: %d\n", retval); + goto failed; + } + kibnal_data.kib_init = IBNAL_INIT_GSI_POOL; + + retval = gsi_register_class(MAD_CLASS_SUBN_ADM, /* TODO: correct? */ + 2, /* version */ + "ANY_HCA", +#ifdef GSI_PASS_PORT_NUM + kibnal_data.kib_port, +#endif + 0, 0, + vibnal_mad_sent_cb, vibnal_mad_received_cb, + NULL, &kibnal_data.gsi_handle); + if (retval) { + CERROR("Cannot register GSI class: %d\n", retval); + goto failed; + } + + kibnal_data.kib_init = IBNAL_INIT_GSI; + /*****************************************************/ + +#if IBNAL_WHOLE_MEM==0 + retval = vv_pd_allocate(kibnal_data.kib_hca, &kibnal_data.kib_pd); +#else + retval = vv_get_gen_pd_h(kibnal_data.kib_hca, &kibnal_data.kib_pd); +#endif + if (retval) { + CERROR ("Can't create PD: %d\n", retval); + goto failed; + } + + /* flag PD initialised */ + kibnal_data.kib_init = IBNAL_INIT_PD; + /*****************************************************/ + +#if IBNAL_FMR + { + const int pool_size = IBNAL_NTX + IBNAL_NTX_NBLK; + struct ib_fmr_pool_param params = { + .max_pages_per_fmr = PTL_MTU/PAGE_SIZE, + .access = (IB_ACCESS_LOCAL_WRITE | + IB_ACCESS_REMOTE_WRITE | + IB_ACCESS_REMOTE_READ), + .pool_size = pool_size, + .dirty_watermark = (pool_size * 3)/4, + .flush_function = NULL, + .flush_arg = NULL, + .cache = 1, + }; + rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms, + &kibnal_data.kib_fmr_pool); + if (rc != 0) { + CERROR ("Can't create FMR pool size %d: %d\n", + pool_size, rc); + goto failed; + } + } + + /* flag FMR pool initialised */ + kibnal_data.kib_init = IBNAL_INIT_FMR; +#endif + + /*****************************************************/ + + rc = kibnal_setup_tx_descs(); + if (rc != 0) { + CERROR ("Can't register tx descs: %d\n", rc); + goto failed; + } + + /* flag TX descs initialised */ + kibnal_data.kib_init = IBNAL_INIT_TXD; + /*****************************************************/ + { + uint32_t nentries; + + retval = vv_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES, + kibnal_ca_callback, + NULL, /* context */ + &kibnal_data.kib_cq, &nentries); + if (retval) { + CERROR ("Can't create RX CQ: %d\n", retval); + goto failed; + } + + /* flag CQ initialised */ + kibnal_data.kib_init 
= IBNAL_INIT_CQ; + + if (nentries < IBNAL_CQ_ENTRIES) { + CERROR ("CQ only has %d entries, need %d\n", + nentries, IBNAL_CQ_ENTRIES); + goto failed; + } + + retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event); + if (retval != 0) { + CERROR ("Failed to re-arm completion queue: %d\n", rc); + goto failed; + } + } + + /*****************************************************/ + + rc = libcfs_nal_cmd_register(VIBNAL, &kibnal_cmd, NULL); + if (rc != 0) { + CERROR ("Can't initialise command interface (rc = %d)\n", rc); + goto failed; + } + + /* flag everything initialised */ + kibnal_data.kib_init = IBNAL_INIT_ALL; + /*****************************************************/ + + printk(KERN_INFO "Lustre: Voltaire IB NAL loaded " + "(initial mem %d)\n", pkmem); + + return (PTL_OK); + + failed: + CDEBUG(D_NET, "kibnal_api_startup failed\n"); + kibnal_api_shutdown (&kibnal_api); + return (PTL_FAIL); +} + +void __exit +kibnal_module_fini (void) +{ +#ifdef CONFIG_SYSCTL + if (kibnal_tunables.kib_sysctl != NULL) + unregister_sysctl_table (kibnal_tunables.kib_sysctl); +#endif + PtlNIFini(kibnal_ni); + + ptl_unregister_nal(VIBNAL); +} + +int __init +kibnal_module_init (void) +{ + int rc; + + if (sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len) { + CERROR("sizeof(kib_wire_connreq_t) > cm_REQ_priv_data_len\n"); + return -EINVAL; + } + + /* the following must be sizeof(int) for proc_dointvec() */ + if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) { + CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n"); + return -EINVAL; + } + + kibnal_api.nal_ni_init = kibnal_api_startup; + kibnal_api.nal_ni_fini = kibnal_api_shutdown; + + /* Initialise dynamic tunables to defaults once only */ + kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT; + + rc = ptl_register_nal(VIBNAL, &kibnal_api); + if (rc != PTL_OK) { + CERROR("Can't register IBNAL: %d\n", rc); + return (-ENOMEM); /* or something... */ + } + + /* Pure gateways want the NAL started up at module load time... */ + rc = PtlNIInit(VIBNAL, LUSTRE_SRV_PTL_PID, NULL, NULL, &kibnal_ni); + if (rc != PTL_OK && rc != PTL_IFACE_DUP) { + ptl_unregister_nal(VIBNAL); + return (-ENODEV); + } + +#ifdef CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + kibnal_tunables.kib_sysctl = + register_sysctl_table (kibnal_top_ctl_table, 0); +#endif + return (0); +} + +MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>"); +MODULE_DESCRIPTION("Kernel Voltaire IB NAL v0.01"); +MODULE_LICENSE("GPL"); + +module_init(kibnal_module_init); +module_exit(kibnal_module_fini); + diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h new file mode 100644 index 0000000000000000000000000000000000000000..7866aba05203f375f0a8517de455e4f69085bdb8 --- /dev/null +++ b/lnet/klnds/viblnd/viblnd.h @@ -0,0 +1,820 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton <eric@bartonsoftware.com> + * Author: Frank Zago <fzago@systemfabricworks.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/smp_lock.h> +#include <linux/unistd.h> +#include <linux/uio.h> + +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/io.h> + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/list.h> +#include <linux/kmod.h> +#include <linux/sysctl.h> + +#define DEBUG_SUBSYSTEM S_IBNAL + +#define IBNAL_CHECK_ADVERT + +#include <linux/kp30.h> +#include <portals/p30.h> +#include <portals/lib-p30.h> +#include <portals/nal.h> + +#include <vverbs.h> +#include <sa-mads.h> +#include <ib-cm.h> +#include <gsi.h> + +#if 0 +#undef CDEBUG +#define CDEBUG(mask, format, a...) printk(KERN_INFO "%s:%d - " format, __func__, __LINE__,##a) +#endif + +#ifdef __CHECKER__ +#undef CDEBUG +#undef CERROR +#define CDEBUG(a...) +#define CERROR(a...) +#endif + +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +/* Test for GCC > 3.2.2 */ +#if GCC_VERSION <= 30202 +/* GCC 3.2.2, and presumably several versions before it, will + * miscompile this driver. See + * http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9853. */ +#error Invalid GCC version. Must use GCC >= 3.2.3 +#endif + +#define IBNAL_SERVICE_NAME "vibnal" +#define IBNAL_SERVICE_NUMBER 0x11b9a2 /* TODO */ + +#if CONFIG_SMP +# define IBNAL_N_SCHED num_online_cpus() /* # schedulers */ +#else +# define IBNAL_N_SCHED 1 /* # schedulers */ +#endif + +#define IBNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ +#define IBNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ + +#define IBNAL_MSG_SIZE (4<<10) /* max size of queued messages (inc hdr) */ + +#define IBNAL_MSG_QUEUE_SIZE 8 /* # messages/RDMAs in-flight */ +#define IBNAL_CREDIT_HIGHWATER 7 /* when to eagerly return credits */ + +/* 7 indicates infinite retry attempts, Infinicon recommended 5 */ +#define IBNAL_RETRY 5 /* # times to retry */ +#define IBNAL_RNR_RETRY 5 /* */ +#define IBNAL_CM_RETRY 5 /* # times to retry connection */ + +#define IBNAL_FLOW_CONTROL 1 +#define IBNAL_ACK_TIMEOUT 20 /* supposedly 4 secs */ + +#define IBNAL_NTX 64 /* # tx descs */ +/* this had to be dropped down so that we only register < 255 pages per + * region. this will change if we register all memory. */ +#define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ + +#define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ + +#define IBNAL_RESCHED 100 /* # scheduler loops before reschedule */ + +#define IBNAL_CONCURRENT_PEERS 1000 /* # nodes all talking at once to me */ + +/* default vals for runtime tunables */ +#define IBNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ + +/************************/ +/* derived constants... 
*/ + +/* TX messages (shared by all connections) */ +#define IBNAL_TX_MSGS (IBNAL_NTX + IBNAL_NTX_NBLK) +#define IBNAL_TX_MSG_BYTES (IBNAL_TX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_TX_MSG_PAGES ((IBNAL_TX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) + +#define IBNAL_TX_MAX_SG (PTL_MD_MAX_IOV + 1) + +/* RX messages (per connection) */ +#define IBNAL_RX_MSGS IBNAL_MSG_QUEUE_SIZE +#define IBNAL_RX_MSG_BYTES (IBNAL_RX_MSGS * IBNAL_MSG_SIZE) +#define IBNAL_RX_MSG_PAGES ((IBNAL_RX_MSG_BYTES + PAGE_SIZE - 1)/PAGE_SIZE) + + +/* we may have up to 2 completions per transmit + + 1 completion per receive, per connection */ +#define IBNAL_CQ_ENTRIES ((2*IBNAL_TX_MSGS) + \ + (IBNAL_RX_MSGS * IBNAL_CONCURRENT_PEERS)) + +#define IBNAL_RDMA_BASE 0x0eeb0000 +#define IBNAL_FMR 0 +#define IBNAL_WHOLE_MEM 1 +#define IBNAL_CKSUM 0 + +/* Starting sequence number. */ +#define IBNAL_STARTING_PSN 0x465A + +/* Timeout for SA requests, in seconds */ +#define GSI_TIMEOUT 5 +#define GSI_RETRY 10 + +typedef struct +{ + int kib_io_timeout; /* comms timeout (seconds) */ + struct ctl_table_header *kib_sysctl; /* sysctl interface */ +} kib_tunables_t; + +/* some of these have specific types in the stack that just map back + * to the uFOO types, like IB_{L,R}_KEY. */ +typedef struct +{ + int ibp_npages; /* # pages */ + int ibp_mapped; /* mapped? */ + __u64 ibp_vaddr; /* mapped region vaddr */ + __u32 ibp_lkey; /* mapped region lkey */ + __u32 ibp_rkey; /* mapped region rkey */ + vv_mem_reg_h_t ibp_handle; /* mapped region handle */ + struct page *ibp_pages[0]; +} kib_pages_t; + +typedef struct +{ + vv_mem_reg_h_t md_handle; + __u32 md_lkey; + __u32 md_rkey; + __u64 md_addr; +} kib_md_t __attribute__((packed)); + +typedef struct +{ + /* initialisation state. These values are sorted by their initialization order. */ + enum { + IBNAL_INIT_NOTHING, + IBNAL_INIT_DATA, + IBNAL_INIT_LIB, + IBNAL_INIT_HCA, + IBNAL_INIT_ASYNC, + IBNAL_INIT_PORT, + IBNAL_INIT_GSI_POOL, + IBNAL_INIT_GSI, + IBNAL_INIT_PD, +#if IBNAL_FMR + IBNAL_INIT_FMR, +#endif + IBNAL_INIT_TXD, + IBNAL_INIT_CQ, + IBNAL_INIT_ALL, + } kib_init; + + __u64 kib_incarnation; /* which one am I */ + int kib_shutdown; /* shut down? */ + atomic_t kib_nthreads; /* # live threads */ + + __u64 kib_service_id; /* service number I listen on */ + vv_gid_t kib_port_gid; /* port GID in HOST ORDER! 
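+ * (see gid_swap() below, which converts between this and the big-endian form)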
*/ + vv_p_key_t kib_port_pkey; /* my pkey */ + ptl_nid_t kib_nid; /* my NID */ + struct semaphore kib_nid_mutex; /* serialise NID ops */ + cm_cep_handle_t kib_cep; /* connection end point */ + + rwlock_t kib_global_lock; /* stabilize peer/conn ops */ + + struct list_head *kib_peers; /* hash table of all my known peers */ + int kib_peer_hash_size; /* size of kib_peers */ + atomic_t kib_npeers; /* # peers extant */ + atomic_t kib_nconns; /* # connections extant */ + + struct list_head kib_connd_conns; /* connections to progress */ + struct list_head kib_connd_peers; /* peers waiting for a connection */ + wait_queue_head_t kib_connd_waitq; /* connection daemons sleep here */ + unsigned long kib_connd_waketime; /* when connd will wake */ + spinlock_t kib_connd_lock; /* serialise */ + + wait_queue_head_t kib_sched_waitq; /* schedulers sleep here */ + struct list_head kib_sched_txq; /* tx requiring attention */ + struct list_head kib_sched_rxq; /* rx requiring attention */ + spinlock_t kib_sched_lock; /* serialise */ + + struct kib_tx *kib_tx_descs; /* all the tx descriptors */ + kib_pages_t *kib_tx_pages; /* premapped tx msg pages */ + + struct list_head kib_idle_txs; /* idle tx descriptors */ + struct list_head kib_idle_nblk_txs; /* idle reserved tx descriptors */ + wait_queue_head_t kib_idle_tx_waitq; /* block here for tx descriptor */ + __u64 kib_next_tx_cookie; /* RDMA completion cookie */ + spinlock_t kib_tx_lock; /* serialise */ + + vv_hca_h_t kib_hca; /* The HCA */ + vv_hca_attrib_t kib_hca_attrs; /* HCA attributes */ + + int kib_port; /* port on the device */ + vv_port_attrib_t kib_port_attr; /* port attributes */ + + vv_pd_h_t kib_pd; /* protection domain */ + vv_cq_h_t kib_cq; /* completion queue */ + + void *kib_listen_handle; /* where I listen for connections */ + + /* These fields are left untouched, so they can be shared. */ + union { + cm_drequest_data_t dreq_data; + cm_dreply_data_t drep_data; + } cm_data; + + /* Send and receive MADs (service records, path records) */ + gsi_class_handle_t gsi_handle; + gsi_dtgrm_pool_handle_t gsi_pool_handle; + struct semaphore gsi_mutex; /* protect GSI list - TODO:spinlock instead? */ + struct list_head gsi_pending; /* pending GSI datagrams */ + +} kib_data_t; + +/************************************************************************ + * Wire message structs. + * These are sent in sender's byte order (i.e. receiver flips). + * CAVEAT EMPTOR: other structs communicated between nodes (e.g. MAD + * private data and SM service info), is LE on the wire. + */ + +/* also kib_md_t above */ + +typedef struct +{ + __u32 rd_nob; /* # of bytes */ + __u64 rd_addr; /* remote io vaddr */ +} kib_rdma_desc_t __attribute__((packed)); + +typedef struct +{ + ptl_hdr_t ibim_hdr; /* portals header */ + char ibim_payload[0]; /* piggy-backed payload */ +} kib_immediate_msg_t __attribute__((packed)); + +/* these arrays serve two purposes during rdma. they are built on the passive + * side and sent to the active side as remote arguments. On the active side + * the descs are used as a data structure on the way to local gather items. 
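+ * (kibnal_fill_ibrm() stores the local key when building these on the active side, and the remote key when building them on the passive side)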
+ * the different roles result in split local/remote meaning of desc->rd_key */ +typedef struct +{ + ptl_hdr_t ibrm_hdr; /* portals header */ + __u64 ibrm_cookie; /* opaque completion cookie */ + __u32 ibrm_num_descs; /* how many descs */ + __u32 rd_key; /* remote key */ + kib_rdma_desc_t ibrm_desc[0]; /* where to suck/blow */ +} kib_rdma_msg_t __attribute__((packed)); + +#define kib_rdma_msg_len(num_descs) \ + offsetof(kib_msg_t, ibm_u.rdma.ibrm_desc[num_descs]) + +typedef struct +{ + __u64 ibcm_cookie; /* opaque completion cookie */ + __u32 ibcm_status; /* completion status */ +} kib_completion_msg_t __attribute__((packed)); + +typedef struct +{ + __u32 ibm_magic; /* I'm an openibnal message */ + __u16 ibm_version; /* this is my version number */ + __u8 ibm_type; /* msg type */ + __u8 ibm_credits; /* returned credits */ +#if IBNAL_CKSUM + __u32 ibm_nob; + __u32 ibm_cksum; +#endif + union { + kib_immediate_msg_t immediate; + kib_rdma_msg_t rdma; + kib_completion_msg_t completion; + } ibm_u __attribute__((packed)); +} kib_msg_t __attribute__((packed)); + +#define IBNAL_MSG_MAGIC 0x0be91b91 /* unique magic */ +#define IBNAL_MSG_VERSION 1 /* current protocol version */ + +#define IBNAL_MSG_NOOP 0xd0 /* nothing (just credits) */ +#define IBNAL_MSG_IMMEDIATE 0xd1 /* portals hdr + payload */ +#define IBNAL_MSG_PUT_RDMA 0xd2 /* portals PUT hdr + source rdma desc */ +#define IBNAL_MSG_PUT_DONE 0xd3 /* signal PUT rdma completion */ +#define IBNAL_MSG_GET_RDMA 0xd4 /* portals GET hdr + sink rdma desc */ +#define IBNAL_MSG_GET_DONE 0xd5 /* signal GET rdma completion */ + +/***********************************************************************/ + +typedef struct kib_rx /* receive message */ +{ + struct list_head rx_list; /* queue for attention */ + struct kib_conn *rx_conn; /* owning conn */ + int rx_rdma; /* RDMA completion posted? */ + int rx_posted; /* posted? */ + kib_msg_t *rx_msg; /* pre-mapped buffer */ + vv_l_key_t l_key; + vv_wr_t rx_wrq; + vv_scatgat_t rx_gl; /* and its memory */ +} kib_rx_t; + +typedef struct kib_tx /* transmit message */ +{ + struct list_head tx_list; /* queue on idle_txs ibc_tx_queue etc. */ + int tx_isnblk; /* I'm reserved for non-blocking sends */ + struct kib_conn *tx_conn; /* owning conn */ + int tx_mapped; /* mapped for RDMA? */ + int tx_sending; /* # tx callbacks outstanding */ + int tx_status; /* completion status */ + unsigned long tx_deadline; /* completion deadline */ + int tx_passive_rdma; /* peer sucks/blows */ + int tx_passive_rdma_wait; /* waiting for peer to complete */ + __u64 tx_passive_rdma_cookie; /* completion cookie */ + lib_msg_t *tx_libmsg[2]; /* lib msgs to finalize on completion */ + kib_md_t tx_md; /* RDMA mapping (active/passive) */ + kib_msg_t *tx_msg; /* pre-mapped buffer */ + vv_l_key_t l_key; + vv_r_key_t r_key; + int tx_nsp; /* # send work items */ + vv_wr_t tx_wrq[IBNAL_TX_MAX_SG]; /* send work items... 
*/ + vv_scatgat_t tx_gl[IBNAL_TX_MAX_SG]; /* ...and their memory */ +} kib_tx_t; + +#define KIB_TX_UNMAPPED 0 +#define KIB_TX_MAPPED 1 +#define KIB_TX_MAPPED_FMR 2 + +typedef struct kib_wire_connreq +{ + __u32 wcr_magic; /* I'm an openibnal connreq */ + __u16 wcr_version; /* this is my version number */ + __u16 wcr_queue_depth; /* this is my receive queue size */ + __u64 wcr_nid; /* peer's NID */ + __u64 wcr_incarnation; /* peer's incarnation */ +} kib_wire_connreq_t; + +typedef struct kib_gid +{ + __u64 hi, lo; +} kib_gid_t; + +typedef struct kib_connreq +{ + /* connection-in-progress */ + struct kib_conn *cr_conn; + kib_wire_connreq_t cr_wcr; + __u64 cr_tid; + //ib_service_record_v2_t cr_service; + kib_gid_t cr_gid; + ib_path_record_v2_t cr_path; + + union { + cm_request_data_t cr_cm_req; + cm_rtu_data_t cr_cm_rtu; + } ; + +} kib_connreq_t; + +typedef struct kib_conn +{ + struct kib_peer *ibc_peer; /* owning peer */ + struct list_head ibc_list; /* stash on peer's conn list */ + __u64 ibc_incarnation; /* which instance of the peer */ + atomic_t ibc_refcount; /* # users */ + int ibc_state; /* what's happening */ + atomic_t ibc_nob; /* # bytes buffered */ + int ibc_nsends_posted; /* # uncompleted sends */ + int ibc_credits; /* # credits I have */ + int ibc_outstanding_credits; /* # credits to return */ + int ibc_rcvd_disconnect;/* received discon request */ + int ibc_sent_disconnect;/* sent discon request */ + struct list_head ibc_tx_queue; /* send queue */ + struct list_head ibc_active_txs; /* active tx awaiting completion */ + spinlock_t ibc_lock; /* serialise */ + kib_rx_t *ibc_rxs; /* the rx descs */ + kib_pages_t *ibc_rx_pages; /* premapped rx msg pages */ + vv_qp_h_t ibc_qp; /* queue pair */ + cm_cep_handle_t ibc_cep; /* connection ID? */ + vv_qp_attr_t ibc_qp_attrs; /* QP attrs */ + kib_connreq_t *ibc_connreq; /* connection request state */ +} kib_conn_t; + +#define IBNAL_CONN_INIT_NOTHING 0 /* initial state */ +#define IBNAL_CONN_INIT_QP 1 /* ibc_qp set up */ +#define IBNAL_CONN_CONNECTING 2 /* started to connect */ +#define IBNAL_CONN_ESTABLISHED 3 /* connection established */ +#define IBNAL_CONN_SEND_DREQ 4 /* to send disconnect req */ +#define IBNAL_CONN_DREQ 5 /* sent disconnect req */ +#define IBNAL_CONN_DREP 6 /* sent disconnect rep */ +#define IBNAL_CONN_DISCONNECTED 7 /* no more QP or CM traffic */ + +#define KIB_ASSERT_CONN_STATE(conn, state) do { \ + LASSERTF((conn)->ibc_state == state, "%d\n", conn->ibc_state); \ +} while (0) + +#define KIB_ASSERT_CONN_STATE_RANGE(conn, low, high) do { \ + LASSERTF(low <= high, "%d %d\n", low, high); \ + LASSERTF((conn)->ibc_state >= low && (conn)->ibc_state <= high, \ + "%d\n", conn->ibc_state); \ +} while (0) + +typedef struct kib_peer +{ + struct list_head ibp_list; /* stash on global peer list */ + struct list_head ibp_connd_list; /* schedule on kib_connd_peers */ + ptl_nid_t ibp_nid; /* who's on the other end(s) */ + atomic_t ibp_refcount; /* # users */ + int ibp_persistence; /* "known" peer refs */ + struct list_head ibp_conns; /* all active connections */ + struct list_head ibp_tx_queue; /* msgs waiting for a conn */ + int ibp_connecting; /* connecting+accepting */ + unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ + unsigned long ibp_reconnect_interval; /* exponential backoff */ +} kib_peer_t; + +struct sa_request; +typedef void (*sa_request_cb_t)(struct sa_request *request); + +struct sa_request { + /* Link all the pending GSI datagrams together. 
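+ * (the pending list head is kibnal_data.gsi_pending, protected by gsi_mutex)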
*/ + struct list_head list; + + int retry; /* number of retries left (after a timeout only) */ + int status; /* status of the request */ + gsi_dtgrm_t *dtgrm_req; /* request */ + gsi_dtgrm_t *dtgrm_resp; /* response */ + sa_mad_v2_t *mad; /* points inside the datagram */ + + void *context; + + struct timer_list timer; + + /* When the requests is completed, we either call the callback + * or post a completion. They are mutually exclusive. */ + struct completion signal; + sa_request_cb_t callback; +}; + +/* The CM callback are called on the interrupt level. However we + * cannot do everything we want on that level, so we let keventd run + * the callback. */ +struct cm_off_level { + struct tq_struct tq; + + cm_cep_handle_t cep; + cm_conn_data_t *info; + kib_conn_t *conn; +}; + +extern lib_nal_t kibnal_lib; +extern kib_data_t kibnal_data; +extern kib_tunables_t kibnal_tunables; + +static inline int wrq_signals_completion(vv_wr_t *wrq) +{ + return wrq->completion_notification != 0; +} + +/******************************************************************************/ + +/* these are purposely avoiding using local vars so they don't increase + * stack consumption. */ + +#define kib_peer_addref(peer) do { \ + LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ + atomic_read(&peer->ibp_refcount)); \ + CDEBUG(D_NET, "++peer[%p] -> "LPX64" (%d)\n", \ + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ + atomic_inc(&peer->ibp_refcount); \ +} while (0) + +#define kib_peer_decref(peer) do { \ + LASSERTF(atomic_read(&peer->ibp_refcount) > 0, "%d\n", \ + atomic_read(&peer->ibp_refcount)); \ + CDEBUG(D_NET, "--peer[%p] -> "LPX64" (%d)\n", \ + peer, peer->ibp_nid, atomic_read (&peer->ibp_refcount)); \ + if (atomic_dec_and_test (&peer->ibp_refcount)) { \ + CDEBUG (D_NET, "destroying peer "LPX64" %p\n", \ + peer->ibp_nid, peer); \ + kibnal_destroy_peer (peer); \ + } \ +} while (0) + +/******************************************************************************/ + +static inline struct list_head * +kibnal_nid2peerlist (ptl_nid_t nid) +{ + unsigned int hash = ((unsigned int)nid) % kibnal_data.kib_peer_hash_size; + + return (&kibnal_data.kib_peers [hash]); +} + +static inline int +kibnal_peer_active(kib_peer_t *peer) +{ + /* Am I in the peer hash table? */ + return (!list_empty(&peer->ibp_list)); +} + +static inline void +kibnal_queue_tx_locked (kib_tx_t *tx, kib_conn_t *conn) +{ + /* CAVEAT EMPTOR: tx takes caller's ref on conn */ + + LASSERT (tx->tx_nsp > 0); /* work items set up */ + LASSERT (tx->tx_conn == NULL); /* only set here */ + + tx->tx_conn = conn; + tx->tx_deadline = jiffies + kibnal_tunables.kib_io_timeout * HZ; + list_add_tail(&tx->tx_list, &conn->ibc_tx_queue); +} + +static inline __u64* +kibnal_service_nid_field(ib_service_record_v2_t *sr) +{ + /* The service key mask must have byte 0 to 7 set. 
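+ * kibnal_set_service_keys() below writes the NID here as a little-endian __u64.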
*/ + return (__u64 *)sr->service_data8; +} + +static inline void +kibnal_set_service_keys(ib_service_record_v2_t *sr, ptl_nid_t nid) +{ + LASSERT (strlen(IBNAL_SERVICE_NAME) < sizeof(sr->service_name)); + + strcpy (sr->service_name, IBNAL_SERVICE_NAME); + + *kibnal_service_nid_field(sr) = cpu_to_le64(nid); +} + +#if CONFIG_X86 +/* TODO: use vv_va2adverize instead */ +static inline __u64 +kibnal_page2phys (struct page *p) +{ + __u64 page_number = p - mem_map; + + return (page_number << PAGE_SHIFT); +} +#else +# error "no page->phys" +#endif + +/* CAVEAT EMPTOR: We rely on tx/rx descriptor alignment to allow us to + * use the lowest bit of the work request id as a flag to determine if + * the completion is for a transmit or a receive (the op field is not + * valid when the wc completes in error). */ + +static inline vv_wr_id_t +kibnal_ptr2wreqid (void *ptr, int isrx) +{ + unsigned long lptr = (unsigned long)ptr; + + LASSERT ((lptr & 1) == 0); + return (vv_wr_id_t)(lptr | (isrx ? 1 : 0)); +} + +static inline void * +kibnal_wreqid2ptr (vv_wr_id_t wreqid) +{ + return (void *)(((unsigned long)wreqid) & ~1UL); +} + +static inline int +kibnal_wreqid_is_rx (vv_wr_id_t wreqid) +{ + return (wreqid & 1) != 0; +} + +static inline int +kibnal_whole_mem(void) +{ +#if IBNAL_WHOLE_MEM + return true; +#else + return false; +#endif +} + +/* Voltaire stores GIDs in host order. */ +static inline void gid_swap(vv_gid_t *gid) +{ + u_int64_t s; + + s = gid->scope.g.subnet; + gid->scope.g.subnet = cpu_to_be64(gid->scope.g.eui64); + gid->scope.g.eui64 = cpu_to_be64(s); +} + +#if 0 +static void dump_qp(kib_conn_t *conn) +{ + vv_qp_attr_t *qp_attrs; + void *qp_context; + vv_return_t retval; + + CERROR("QP dumping %p\n", conn); + + retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs); + if (retval) { + CERROR ("Couldn't query qp attributes: %d\n", retval); + return; + } + + qp_attrs = &conn->ibc_qp_attrs; + + CERROR("QP %x dump\n", qp_attrs->query.qp_num); + CERROR(" vv_qp_attr_mask = %llx\n", qp_attrs->query.vv_qp_attr_mask); + CERROR(" qp_state = %d\n", qp_attrs->query.qp_state); + CERROR(" cq_send_h = %p\n", qp_attrs->query.cq_send_h); + CERROR(" cq_receive_h = %p \n", qp_attrs->query.cq_receive_h); + CERROR(" send_max_outstand_wr = %d\n", qp_attrs->query.send_max_outstand_wr); + CERROR(" receive_max_outstand_wr = %d\n", qp_attrs->query.receive_max_outstand_wr); + CERROR(" max_scatgat_per_send_wr = %d\n", qp_attrs->query.max_scatgat_per_send_wr); + CERROR(" max_scatgat_per_receive_wr = %d\n", qp_attrs->query.max_scatgat_per_receive_wr); + CERROR(" send_psn = %x\n", qp_attrs->query.send_psn); + CERROR(" receve_psn = %x\n", qp_attrs->query.receve_psn); + CERROR(" access_control = %x\n", qp_attrs->query.access_control); + CERROR(" phy_port_num = %d\n", qp_attrs->query.phy_port_num); + CERROR(" primary_p_key_indx = %x\n", qp_attrs->query.primary_p_key_indx); + CERROR(" q_key = %x\n", qp_attrs->query.q_key); + CERROR(" destanation_qp = %x\n", qp_attrs->query.destanation_qp); + CERROR(" rdma_r_atom_outstand_num = %d\n", qp_attrs->query.rdma_r_atom_outstand_num); + CERROR(" responder_rdma_r_atom_num = %d\n", qp_attrs->query.responder_rdma_r_atom_num); + CERROR(" min_rnr_nak_timer = %d\n", qp_attrs->query.min_rnr_nak_timer); + CERROR(" pd_h = %lx\n", qp_attrs->query.pd_h); + CERROR(" recv_solicited_events = %d\n", qp_attrs->query.recv_solicited_events); + CERROR(" send_signaled_comp = %d\n", qp_attrs->query.send_signaled_comp); + CERROR(" flow_control = %d\n", 
qp_attrs->query.flow_control); +} +#else +#define dump_qp(a) +#endif + +#if 0 +static void dump_wqe(vv_wr_t *wr) +{ + CERROR("Dumping send WR %p\n", wr); + + CERROR(" wr_id = %llx\n", wr->wr_id); + CERROR(" completion_notification = %d\n", wr->completion_notification); + CERROR(" scatgat_list = %p\n", wr->scatgat_list); + CERROR(" num_of_data_segments = %d\n", wr->num_of_data_segments); + + if (wr->scatgat_list && wr->num_of_data_segments) { + CERROR(" scatgat_list[0].v_address = %p\n", wr->scatgat_list[0].v_address); + CERROR(" scatgat_list[0].length = %d\n", wr->scatgat_list[0].length); + CERROR(" scatgat_list[0].l_key = %x\n", wr->scatgat_list[0].l_key); + } + + CERROR(" wr_type = %d\n", wr->wr_type); + + switch(wr->wr_type) { + case vv_wr_send: + CERROR(" send\n"); + + CERROR(" fance_indicator = %d\n", wr->type.send.send_qp_type.rc_type.fance_indicator); + break; + + case vv_wr_receive: + break; + + case vv_wr_rdma_write: + case vv_wr_rdma_read: + CERROR(" rdma\n"); + CERROR(" fance_indicator = %d\n", wr->type.send.send_qp_type.rc_type.fance_indicator); + CERROR(" r_addr = %llx\n", wr->type.send.send_qp_type.rc_type.r_addr); + CERROR(" r_r_key = %x\n", wr->type.send.send_qp_type.rc_type.r_r_key); + break; + + default: + break; + } +} + +#else +#define dump_wqe(a) +#endif + +#if 0 +static void dump_wc(vv_wc_t *wc) +{ + CERROR("Dumping WC\n"); + + CERROR(" wr_id = %llx\n", wc->wr_id); + CERROR(" operation_type = %d\n", wc->operation_type); + CERROR(" num_bytes_transfered = %lld\n", wc->num_bytes_transfered); + CERROR(" completion_status = %d\n", wc->completion_status); +} +#else +#define dump_wc(a) +#endif + +#if 0 +static void hexdump(char *string, void *ptr, int len) +{ + unsigned char *c = ptr; + int i; + + if (len < 0 || len > 2048) { + printk("XXX what the hell? 
%d\n",len); + return; + } + + printk("%d bytes of '%s' from 0x%p\n", len, string, ptr); + + for (i = 0; i < len;) { + printk("%02x",*(c++)); + i++; + if (!(i & 15)) { + printk("\n"); + } else if (!(i&1)) { + printk(" "); + } + } + + if(len & 15) { + printk("\n"); + } +} +#else +#define hexdump(a,b,c) +#endif + +/*--------------------------------------------------------------------------*/ + + +extern kib_peer_t *kibnal_create_peer (ptl_nid_t nid); +extern void kibnal_destroy_peer (kib_peer_t *peer); +extern int kibnal_del_peer (ptl_nid_t nid, int single_share); +extern kib_peer_t *kibnal_find_peer_locked (ptl_nid_t nid); +extern void kibnal_unlink_peer_locked (kib_peer_t *peer); +extern int kibnal_close_stale_conns_locked (kib_peer_t *peer, + __u64 incarnation); +extern kib_conn_t *kibnal_create_conn (void); +extern void kibnal_put_conn (kib_conn_t *conn); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern void kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg); + +extern int kibnal_alloc_pages (kib_pages_t **pp, int npages, int access); +extern void kibnal_free_pages (kib_pages_t *p); + +extern void kibnal_check_sends (kib_conn_t *conn); +extern void kibnal_close_conn_locked (kib_conn_t *conn, int error); +extern void kibnal_destroy_conn (kib_conn_t *conn); +extern int kibnal_thread_start (int (*fn)(void *arg), void *arg); +extern int kibnal_scheduler(void *arg); +extern int kibnal_connd (void *arg); +extern void kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob); +extern void kibnal_close_conn (kib_conn_t *conn, int why); +extern void kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob); + +void kibnal_ca_async_callback(vv_event_record_t ev); +void kibnal_ca_callback (unsigned long context); +extern void vibnal_mad_received_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm); +extern void vibnal_mad_sent_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm); +extern int kibnal_advertize_op(ptl_nid_t nid, int op, sa_request_cb_t callback, void *context); +extern int vibnal_start_sa_request(struct sa_request *request); +extern struct sa_request *alloc_sa_request(void); +extern void free_sa_request(struct sa_request *request); +extern int kibnal_pathrecord_op(struct sa_request *request, vv_gid_t dgid, sa_request_cb_t callback, void *context); diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c new file mode 100644 index 0000000000000000000000000000000000000000..78bcda4775c8f04701ba6414c557499c7ae76673 --- /dev/null +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -0,0 +1,3163 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Eric Barton <eric@bartonsoftware.com> + * Author: Frank Zago <fzago@systemfabricworks.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#include "vibnal.h" + +static void kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg); + +/* + * LIB functions follow + * + */ +static void +kibnal_schedule_tx_done (kib_tx_t *tx) +{ + unsigned long flags; + + spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags); + + list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq); + wake_up (&kibnal_data.kib_sched_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); +} + +static void +kibnal_tx_done (kib_tx_t *tx) +{ + ptl_err_t ptlrc = (tx->tx_status == 0) ? PTL_OK : PTL_FAIL; + unsigned long flags; + int i; + vv_return_t retval; + + LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */ + LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */ + + switch (tx->tx_mapped) { + default: + LBUG(); + + case KIB_TX_UNMAPPED: + break; + + case KIB_TX_MAPPED: + if (in_interrupt()) { + /* can't deregister memory in IRQ context... */ + kibnal_schedule_tx_done(tx); + return; + } + retval = vv_mem_region_destroy(kibnal_data.kib_hca, tx->tx_md.md_handle); + LASSERT (retval == vv_return_ok); + tx->tx_mapped = KIB_TX_UNMAPPED; + break; + +#if IBNAL_FMR + case KIB_TX_MAPPED_FMR: + if (in_interrupt() && tx->tx_status != 0) { + /* can't flush FMRs in IRQ context... */ + kibnal_schedule_tx_done(tx); + return; + } + + rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr); + LASSERT (rc == 0); + + if (tx->tx_status != 0) + ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool); + tx->tx_mapped = KIB_TX_UNMAPPED; + break; +#endif + } + + for (i = 0; i < 2; i++) { + /* tx may have up to 2 libmsgs to finalise */ + if (tx->tx_libmsg[i] == NULL) + continue; + + lib_finalize (&kibnal_lib, NULL, tx->tx_libmsg[i], ptlrc); + tx->tx_libmsg[i] = NULL; + } + + if (tx->tx_conn != NULL) { + kibnal_put_conn (tx->tx_conn); + tx->tx_conn = NULL; + } + + tx->tx_nsp = 0; + tx->tx_passive_rdma = 0; + tx->tx_status = 0; + + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + + if (tx->tx_isnblk) { + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_nblk_txs); + } else { + list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs); + wake_up (&kibnal_data.kib_idle_tx_waitq); + } + + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); +} + +static kib_tx_t * +kibnal_get_idle_tx (int may_block) +{ + unsigned long flags; + kib_tx_t *tx = NULL; + ENTRY; + + for (;;) { + spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags); + + /* "normal" descriptor is free */ + if (!list_empty (&kibnal_data.kib_idle_txs)) { + tx = list_entry (kibnal_data.kib_idle_txs.next, + kib_tx_t, tx_list); + break; + } + + if (!may_block) { + /* may dip into reserve pool */ + if (list_empty (&kibnal_data.kib_idle_nblk_txs)) { + CERROR ("reserved tx desc pool exhausted\n"); + break; + } + + tx = list_entry (kibnal_data.kib_idle_nblk_txs.next, + kib_tx_t, tx_list); + break; + } + + /* block for idle tx */ + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + wait_event (kibnal_data.kib_idle_tx_waitq, + !list_empty (&kibnal_data.kib_idle_txs) || + kibnal_data.kib_shutdown); + } + + if (tx != NULL) { + list_del (&tx->tx_list); + + /* Allocate a new passive RDMA completion cookie. It might + * not be needed, but we've got a lock right now and we're + * unlikely to wrap... 
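+ * The cookie is carried in the RDMA request and echoed back in the matching PUT_DONE/GET_DONE, where kibnal_complete_passive_rdma() uses it to find this tx.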
*/ + tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++; + + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + LASSERT (tx->tx_nsp == 0); + LASSERT (tx->tx_sending == 0); + LASSERT (tx->tx_status == 0); + LASSERT (tx->tx_conn == NULL); + LASSERT (!tx->tx_passive_rdma); + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_libmsg[0] == NULL); + LASSERT (tx->tx_libmsg[1] == NULL); + } + + spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags); + + RETURN(tx); +} + +static int +kibnal_dist(lib_nal_t *nal, ptl_nid_t nid, unsigned long *dist) +{ + /* I would guess that if kibnal_get_peer (nid) == NULL, + and we're not routing, then 'nid' is very distant :) */ + if ( nal->libnal_ni.ni_pid.nid == nid ) { + *dist = 0; + } else { + *dist = 1; + } + + return 0; +} + +static void +kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status) +{ + struct list_head *ttmp; + unsigned long flags; + int idle; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + list_for_each (ttmp, &conn->ibc_active_txs) { + kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (!tx->tx_passive_rdma_wait || + tx->tx_passive_rdma_cookie != cookie) + continue; + + CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status); + + tx->tx_status = status; + tx->tx_passive_rdma_wait = 0; + idle = (tx->tx_sending == 0); + + if (idle) + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + /* I could be racing with tx callbacks. It's whoever + * _makes_ tx idle that frees it */ + if (idle) + kibnal_tx_done (tx); + return; + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + CERROR ("Unmatched (late?) 
RDMA completion "LPX64" from "LPX64"\n", + cookie, conn->ibc_peer->ibp_nid); +} + +static void +kibnal_post_rx (kib_rx_t *rx, int do_credits) +{ + kib_conn_t *conn = rx->rx_conn; + int rc = 0; + unsigned long flags; + vv_return_t retval; + + ENTRY; + + rx->rx_gl = (vv_scatgat_t) { + .v_address = (void *)rx->rx_msg, + .length = IBNAL_MSG_SIZE, + .l_key = rx->l_key, + }; + + rx->rx_wrq = (vv_wr_t) { + .wr_id = kibnal_ptr2wreqid(rx, 1), + .completion_notification = 1, + .scatgat_list = &rx->rx_gl, + .num_of_data_segments = 1, + .wr_type = vv_wr_receive, + }; + + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, + IBNAL_CONN_DREP); + LASSERT (!rx->rx_posted); + rx->rx_posted = 1; + mb(); + + if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) + rc = -ECONNABORTED; + else { + retval = vv_post_receive(kibnal_data.kib_hca, conn->ibc_qp, &rx->rx_wrq); + + if (retval) { + CDEBUG(D_NET, "post failed %d\n", retval); + rc = -EINVAL; + } + CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq); + } + + if (rc == 0) { + if (do_credits) { + spin_lock_irqsave(&conn->ibc_lock, flags); + conn->ibc_outstanding_credits++; + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); + } + EXIT; + return; + } + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + CERROR ("Error posting receive -> "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + kibnal_close_conn (rx->rx_conn, rc); + } else { + CDEBUG (D_NET, "Error posting receive -> "LPX64": %d\n", + conn->ibc_peer->ibp_nid, rc); + } + + /* Drop rx's ref */ + kibnal_put_conn (conn); + EXIT; +} + +#if IBNAL_CKSUM +static inline __u32 kibnal_cksum (void *ptr, int nob) +{ + char *c = ptr; + __u32 sum = 0; + + while (nob-- > 0) + sum = ((sum << 1) | (sum >> 31)) + *c++; + + return (sum); +} +#endif + +static void +kibnal_rx_callback (vv_wc_t *wc) +{ + kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->wr_id); + kib_msg_t *msg = rx->rx_msg; + kib_conn_t *conn = rx->rx_conn; + int nob = wc->num_bytes_transfered; + const int base_nob = offsetof(kib_msg_t, ibm_u); + int credits; + int flipped; + unsigned long flags; + __u32 i; +#if IBNAL_CKSUM + __u32 msg_cksum; + __u32 computed_cksum; +#endif + + /* we set the QP to erroring after we've finished disconnecting, + * maybe we should do so sooner. */ + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED, + IBNAL_CONN_DISCONNECTED); + + CDEBUG(D_NET, "rx %p conn %p, nob=%d\n", rx, conn, nob); + + LASSERT (rx->rx_posted); + rx->rx_posted = 0; + mb(); + + /* receives complete with error in any case after we've started + * disconnecting */ + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) + goto failed; + + if (wc->completion_status != vv_comp_status_success) { + CERROR("Rx from "LPX64" failed: %d\n", + conn->ibc_peer->ibp_nid, wc->completion_status); + goto failed; + } + + if (nob < base_nob) { + CERROR ("Short rx from "LPX64": %d < expected %d\n", + conn->ibc_peer->ibp_nid, nob, base_nob); + goto failed; + } + + /* Receiver does any byte flipping if necessary... 
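+ * The sender transmits in its native byte order; a byte-swapped ibm_magic tells the receiver it must flip each multi-byte field.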
*/ + + if (msg->ibm_magic == IBNAL_MSG_MAGIC) { + flipped = 0; + } else { + if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) { + CERROR ("Unrecognised magic: %08x from "LPX64"\n", + msg->ibm_magic, conn->ibc_peer->ibp_nid); + goto failed; + } + flipped = 1; + __swab16s (&msg->ibm_version); + LASSERT (sizeof(msg->ibm_type) == 1); + LASSERT (sizeof(msg->ibm_credits) == 1); + } + + if (msg->ibm_version != IBNAL_MSG_VERSION) { + CERROR ("Incompatible msg version %d (%d expected)\n", + msg->ibm_version, IBNAL_MSG_VERSION); + goto failed; + } + +#if IBNAL_CKSUM + if (nob != msg->ibm_nob) { + CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob); + goto failed; + } + + msg_cksum = le32_to_cpu(msg->ibm_cksum); + msg->ibm_cksum = 0; + computed_cksum = kibnal_cksum (msg, nob); + + if (msg_cksum != computed_cksum) { + CERROR ("Checksum failure %d: (%d expected)\n", + computed_cksum, msg_cksum); +// goto failed; + } + CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob); +#endif + + /* Have I received credits that will let me send? */ + credits = msg->ibm_credits; + if (credits != 0) { + spin_lock_irqsave(&conn->ibc_lock, flags); + conn->ibc_credits += credits; + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); + } + + switch (msg->ibm_type) { + case IBNAL_MSG_NOOP: + kibnal_post_rx (rx, 1); + return; + + case IBNAL_MSG_IMMEDIATE: + if (nob < base_nob + sizeof (kib_immediate_msg_t)) { + CERROR ("Short IMMEDIATE from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + break; + + case IBNAL_MSG_PUT_RDMA: + case IBNAL_MSG_GET_RDMA: + if (nob < base_nob + sizeof (kib_rdma_msg_t)) { + CERROR ("Short RDMA msg from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + if (flipped) + __swab32s(&msg->ibm_u.rdma.ibrm_num_descs); + + CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n", + msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie); + + if ((msg->ibm_u.rdma.ibrm_num_descs > PTL_MD_MAX_IOV) || + (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) > + min(nob, IBNAL_MSG_SIZE))) { + CERROR ("num_descs %d too large\n", + msg->ibm_u.rdma.ibrm_num_descs); + goto failed; + } + + if (flipped) { + __swab32s(&msg->ibm_u.rdma.rd_key); + } + + for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) { + kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i]; + + if (flipped) { + __swab32s(&desc->rd_nob); + __swab64s(&desc->rd_addr); + } + + CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n", + msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob); + } + break; + + case IBNAL_MSG_PUT_DONE: + case IBNAL_MSG_GET_DONE: + if (nob < base_nob + sizeof (kib_completion_msg_t)) { + CERROR ("Short COMPLETION msg from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, nob); + goto failed; + } + if (flipped) + __swab32s(&msg->ibm_u.completion.ibcm_status); + + CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n", + msg->ibm_type, msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + + kibnal_complete_passive_rdma (conn, + msg->ibm_u.completion.ibcm_cookie, + msg->ibm_u.completion.ibcm_status); + kibnal_post_rx (rx, 1); + return; + + default: + CERROR ("Can't parse type from "LPX64": %d\n", + conn->ibc_peer->ibp_nid, msg->ibm_type); + goto failed; + } + + /* schedule for kibnal_rx() in thread context */ + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + + list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq); + wake_up (&kibnal_data.kib_sched_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + + return; + + failed: + CDEBUG(D_NET, "rx %p
conn %p\n", rx, conn); + kibnal_close_conn(conn, -ECONNABORTED); + + /* Don't re-post rx & drop its ref on conn */ + kibnal_put_conn(conn); +} + +static void +kibnal_rx (kib_rx_t *rx) +{ + kib_msg_t *msg = rx->rx_msg; + + /* Clear flag so I can detect if I've sent an RDMA completion */ + rx->rx_rdma = 0; + + switch (msg->ibm_type) { + case IBNAL_MSG_GET_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); + /* If the incoming get was matched, I'll have initiated the + * RDMA and the completion message... */ + if (rx->rx_rdma) + break; + + /* Otherwise, I'll send a failed completion now to prevent + * the peer's GET blocking for the full timeout. */ + CERROR ("Completing unmatched RDMA GET from "LPX64"\n", + rx->rx_conn->ibc_peer->ibp_nid); + kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -EIO, + rx, NULL, 0, NULL, NULL, 0, 0); + break; + + case IBNAL_MSG_PUT_RDMA: + lib_parse(&kibnal_lib, &msg->ibm_u.rdma.ibrm_hdr, rx); + if (rx->rx_rdma) + break; + /* This is most unusual, since even if lib_parse() didn't + * match anything, it should have asked us to read (and + * discard) the payload. The portals header must be + * inconsistent with this message type, so it's the + * sender's fault for sending garbage and she can time + * herself out... */ + CERROR ("Uncompleted RMDA PUT from "LPX64"\n", + rx->rx_conn->ibc_peer->ibp_nid); + break; + + case IBNAL_MSG_IMMEDIATE: + lib_parse(&kibnal_lib, &msg->ibm_u.immediate.ibim_hdr, rx); + LASSERT (!rx->rx_rdma); + break; + + default: + LBUG(); + break; + } + + kibnal_post_rx (rx, 1); +} + +static struct page * +kibnal_kvaddr_to_page (unsigned long vaddr) +{ + struct page *page; + + if (vaddr >= VMALLOC_START && + vaddr < VMALLOC_END) + page = vmalloc_to_page ((void *)vaddr); +#if CONFIG_HIGHMEM + else if (vaddr >= PKMAP_BASE && + vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) + page = vmalloc_to_page ((void *)vaddr); + /* in 2.4 ^ just walks the page tables */ +#endif + else + page = virt_to_page (vaddr); + + if (!VALID_PAGE (page)) + page = NULL; + + return page; +} + +static void +kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset, + unsigned long len, int active) +{ + kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma; + kib_rdma_desc_t *desc; + vv_l_key_t l_key; + vv_r_key_t r_key; + void *addr; + vv_mem_reg_h_t mem_h; + vv_return_t retval; + + LASSERTF(ibrm->ibrm_num_descs < PTL_MD_MAX_IOV, "%u\n", + ibrm->ibrm_num_descs); + + desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs]; + + addr = page_address(page) + page_offset; + + /* TODO: This next step is only needed to get either the lkey + * or the rkey. However they should be the same than for the + * tx buffer, so we might as well use it. */ + retval = vv_get_gen_mr_attrib(kibnal_data.kib_hca, + addr, + len, + &mem_h, + &l_key, + &r_key); + if (retval) { + CERROR("vv_get_gen_mr_attrib failed: %d", retval); + /* TODO: this shouldn't really fail, but what if? 
*/ + return; + } + + if (active) { + ibrm->rd_key = l_key; + } else { + ibrm->rd_key = r_key; + + vv_va2advertise_addr(kibnal_data.kib_hca, addr, &addr); + } + + desc->rd_addr = (__u64)(unsigned long)addr; + desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */ + + ibrm->ibrm_num_descs++; +} + +static int +kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active) +{ + struct page *page; + int page_offset, len; + + while (nob > 0) { + page = kibnal_kvaddr_to_page(vaddr); + if (page == NULL) + return -EFAULT; + + page_offset = vaddr & (PAGE_SIZE - 1); + len = min(nob, (int)PAGE_SIZE - page_offset); + + kibnal_fill_ibrm(tx, page, page_offset, len, active); + nob -= len; + vaddr += len; + } + + return 0; +} + +static int +kibnal_map_iov (kib_tx_t *tx, vv_access_con_bit_mask_t access, + int niov, struct iovec *iov, int offset, int nob, int active) + +{ + void *vaddr; + vv_return_t retval; + + LASSERT (nob > 0); + LASSERT (niov > 0); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + niov--; + iov++; + LASSERT (niov > 0); + } + + if (nob > iov->iov_len - offset) { + CERROR ("Can't map multiple vaddr fragments\n"); + return (-EMSGSIZE); + } + + /* our large contiguous iov could be backed by multiple physical + * pages. */ + if (kibnal_whole_mem()) { + int rc; + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; + rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base + + offset, nob, active); + if (rc != 0) { + CERROR ("Can't map iov: %d\n", rc); + return rc; + } + return 0; + } + + vaddr = (void *)(((unsigned long)iov->iov_base) + offset); + tx->tx_md.md_addr = (__u64)((unsigned long)vaddr); + + retval = vv_mem_region_register(kibnal_data.kib_hca, vaddr, nob, + kibnal_data.kib_pd, access, + &tx->tx_md.md_handle, &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); + if (retval != 0) { + CERROR ("Can't map vaddr %p: %d\n", vaddr, retval); + return -EINVAL; + } + + tx->tx_mapped = KIB_TX_MAPPED; + return (0); +} + +static int +kibnal_map_kiov (kib_tx_t *tx, vv_access_con_bit_mask_t access, + int nkiov, ptl_kiov_t *kiov, + int offset, int nob, int active) +{ + vv_phy_list_t phys_pages; + vv_phy_buf_t *phys_buf = NULL; + int page_offset; + int nphys; + int resid; + int phys_size = 0; + int i, rc = 0; + vv_return_t retval; + + CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob); + + LASSERT (nob > 0); + LASSERT (nkiov > 0); + LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED); + + while (offset >= kiov->kiov_len) { + offset -= kiov->kiov_len; + nkiov--; + kiov++; + LASSERT (nkiov > 0); + } + + page_offset = kiov->kiov_offset + offset; + nphys = 1; + + if (!kibnal_whole_mem()) { + phys_size = nkiov * sizeof(vv_phy_buf_t); + PORTAL_ALLOC(phys_buf, phys_size); + + if (phys_buf == NULL) { + CERROR ("Can't allocate phys_buf\n"); + return (-ENOMEM); + } + + phys_buf[0].start = kibnal_page2phys(kiov->kiov_page); + phys_buf[0].size = PAGE_SIZE; + + } else { + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0; + kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset, + kiov->kiov_len, active); + } + + resid = nob - (kiov->kiov_len - offset); + + while (resid > 0) { + kiov++; + nkiov--; + LASSERT (nkiov > 0); + + if (kiov->kiov_offset != 0 || + ((resid > PAGE_SIZE) && + kiov->kiov_len < PAGE_SIZE)) { + /* Can't have gaps */ + CERROR ("Can't make payload contiguous in I/O VM:" + "page %d, offset %d, len %d \n", nphys, + kiov->kiov_offset, kiov->kiov_len); + + for (i = -nphys; i < nkiov; i++) + { + CERROR("kiov[%d] %p +%d for %d\n", + i, 
kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len); + } + + rc = -EINVAL; + goto out; + } + + if (nphys == PTL_MD_MAX_IOV) { + CERROR ("payload too big (%d)\n", nphys); + rc = -EMSGSIZE; + goto out; + } + + if (!kibnal_whole_mem()) { + LASSERT (nphys * sizeof (vv_phy_buf_t) < phys_size); + phys_buf[nphys].start = kibnal_page2phys(kiov->kiov_page); + phys_buf[nphys].size = PAGE_SIZE; + + } else { + if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) { + CERROR ("payload too big (%d)\n", nphys); + rc = -EMSGSIZE; + goto out; + } + kibnal_fill_ibrm(tx, kiov->kiov_page, + kiov->kiov_offset, kiov->kiov_len, + active); + } + + nphys ++; + resid -= PAGE_SIZE; + } + + if (kibnal_whole_mem()) + goto out; + +#if 0 + CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset); + for (i = 0; i < nphys; i++) + CWARN (" [%d] "LPX64"\n", i, phys[i]); +#endif + +#if IBNAL_FMR +#error "vibnal hasn't learned about FMR yet" + rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool, + phys_pages, nphys, + &tx->tx_md.md_addr, + page_offset, + &tx->tx_md.md_handle.fmr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); +#else + retval = vv_phy_mem_region_register(kibnal_data.kib_hca, + &phys_pages, + IBNAL_RDMA_BASE, + nphys, + 0, /* offset */ + kibnal_data.kib_pd, + vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind, /* TODO: translated as-is, but seems incorrect or too much */ + &tx->tx_md.md_handle, + &tx->tx_md.md_addr, + &tx->tx_md.md_lkey, + &tx->tx_md.md_rkey); +#endif + if (retval == vv_return_ok) { + CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n", + nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey); +#if IBNAL_FMR + tx->tx_mapped = KIB_TX_MAPPED_FMR; +#else + tx->tx_mapped = KIB_TX_MAPPED; +#endif + } else { + CERROR ("Can't map phys_pages: %d\n", retval); + rc = -EFAULT; + } + + out: + if (phys_buf != NULL) + PORTAL_FREE(phys_buf, phys_size); + + return (rc); +} + +static kib_conn_t * +kibnal_find_conn_locked (kib_peer_t *peer) +{ + struct list_head *tmp; + + /* just return the first connection */ + list_for_each (tmp, &peer->ibp_conns) { + return (list_entry(tmp, kib_conn_t, ibc_list)); + } + + return (NULL); +} + +void +kibnal_check_sends (kib_conn_t *conn) +{ + unsigned long flags; + kib_tx_t *tx; + int rc; + int i; + int done; + int nwork; + + ENTRY; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE); + + if (list_empty(&conn->ibc_tx_queue) && + conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) { + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + tx = kibnal_get_idle_tx(0); /* don't block */ + if (tx != NULL) + kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0); + + spin_lock_irqsave(&conn->ibc_lock, flags); + + if (tx != NULL) { + atomic_inc(&conn->ibc_refcount); + kibnal_queue_tx_locked(tx, conn); + } + } + + while (!list_empty (&conn->ibc_tx_queue)) { + tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list); + + /* We rely on this for QP sizing */ + LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG); + + LASSERT (conn->ibc_outstanding_credits >= 0); + LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE); + LASSERT (conn->ibc_credits >= 0); + LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE); + + /* Not on ibc_rdma_queue */ + LASSERT (!tx->tx_passive_rdma_wait); + + if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) + GOTO(out, 0); + + if (conn->ibc_credits == 0) /* no credits */ + GOTO(out, 1); + + if (conn->ibc_credits == 1 && /* last 
credit reserved for */ + conn->ibc_outstanding_credits == 0) /* giving back credits */ + GOTO(out, 2); + + list_del (&tx->tx_list); + + if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP && + (!list_empty(&conn->ibc_tx_queue) || + conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) { + /* redundant NOOP */ + spin_unlock_irqrestore(&conn->ibc_lock, flags); + kibnal_tx_done(tx); + spin_lock_irqsave(&conn->ibc_lock, flags); + continue; + } + + tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits; + conn->ibc_outstanding_credits = 0; + + conn->ibc_nsends_posted++; + conn->ibc_credits--; + + /* we only get a tx completion for the final rdma op */ + tx->tx_sending = 0; + tx->tx_passive_rdma_wait = tx->tx_passive_rdma; + list_add (&tx->tx_list, &conn->ibc_active_txs); +#if IBNAL_CKSUM + tx->tx_msg->ibm_cksum = 0; + tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob); + CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob); +#endif + /* NB the gap between removing tx from the queue and sending it + * allows message re-ordering to occur */ + + LASSERT (tx->tx_nsp > 0); + + rc = -ECONNABORTED; + nwork = 0; + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + vv_return_t retval; + + tx->tx_status = 0; + rc = 0; + + retval = vv_post_send_list(kibnal_data.kib_hca, conn->ibc_qp, tx->tx_nsp, tx->tx_wrq, vv_operation_type_send_rc); + + if (retval != 0) { + CERROR("post send failed with %d\n", retval); + rc = -ECONNABORTED; + break; + } + + tx->tx_sending = tx->tx_nsp; + } + + if (rc != 0) { + /* NB credits are transferred in the actual + * message, which can only be the last work item */ + conn->ibc_outstanding_credits += tx->tx_msg->ibm_credits; + conn->ibc_credits++; + conn->ibc_nsends_posted--; + + tx->tx_status = rc; + tx->tx_passive_rdma_wait = 0; + + /* TODO: I think this is buggy if vv_post_send_list failed. */ + done = (tx->tx_sending == 0); + if (done) + list_del (&tx->tx_list); + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) + CERROR ("Error %d posting transmit to "LPX64"\n", + rc, conn->ibc_peer->ibp_nid); + else + CDEBUG (D_NET, "Error %d posting transmit to " + LPX64"\n", rc, conn->ibc_peer->ibp_nid); + + kibnal_close_conn (conn, rc); + + if (done) + kibnal_tx_done (tx); + return; + } + + } + + EXIT; +out: + spin_unlock_irqrestore (&conn->ibc_lock, flags); +} + +static void +kibnal_tx_callback (vv_wc_t *wc) +{ + kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->wr_id); + kib_conn_t *conn; + unsigned long flags; + int idle; + + conn = tx->tx_conn; + LASSERT (conn != NULL); + LASSERT (tx->tx_sending != 0); + + CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx, + tx->tx_sending, tx->tx_nsp, wc->completion_status); + + spin_lock_irqsave(&conn->ibc_lock, flags); + + /* I could be racing with rdma completion. Whoever makes 'tx' idle + * gets to free it, which also drops its ref on 'conn'. If it's + * not me, then I take an extra ref on conn so it can't disappear + * under me. 
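+ * That extra ref is dropped by the kibnal_put_conn() at the end of this callback.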
*/ + + tx->tx_sending--; + idle = (tx->tx_sending == 0) && /* This is the final callback */ + (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */ + if (idle) + list_del(&tx->tx_list); + + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + + if (tx->tx_sending == 0) + conn->ibc_nsends_posted--; + + if (wc->completion_status != vv_comp_status_success && + tx->tx_status == 0) + tx->tx_status = -ECONNABORTED; + + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + if (idle) + kibnal_tx_done (tx); + + if (wc->completion_status != vv_comp_status_success) { + CERROR ("Tx completion to "LPX64" failed: %d\n", + conn->ibc_peer->ibp_nid, wc->completion_status); + kibnal_close_conn (conn, -ENETDOWN); + } else { + /* can I shovel some more sends out the door? */ + kibnal_check_sends(conn); + } + + kibnal_put_conn (conn); +} + +void +kibnal_ca_async_callback(vv_event_record_t ev) +{ + /* XXX flesh out. this seems largely for async errors */ + CERROR("type: %d, port: %d, data: "LPX64"\n", ev.event_type, ev.port_num, ev.type.data); +} + +void +kibnal_ca_callback (unsigned long unused_context) +{ + vv_wc_t wc; + int armed = 0; + vv_return_t retval; + + for(;;) { + + while (vv_poll_for_completion(kibnal_data.kib_hca, kibnal_data.kib_cq, &wc) == vv_return_ok) { + + /* We will need to rearm the CQ to avoid a potential race. */ + armed = 0; + + if (kibnal_wreqid_is_rx(wc.wr_id)) + kibnal_rx_callback(&wc); + else + kibnal_tx_callback(&wc); + } + + if (armed) + return; + + retval = vv_request_completion_notification(kibnal_data.kib_hca, kibnal_data.kib_cq, vv_next_solicit_unsolicit_event); + if (retval != 0) { + CERROR ("Failed to re-arm completion queue: %d\n", retval); + return; + } + + armed = 1; + } +} + +void +kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob) +{ + vv_scatgat_t *gl = &tx->tx_gl[tx->tx_nsp]; + vv_wr_t *wrq = &tx->tx_wrq[tx->tx_nsp]; + int fence; + int nob = offsetof (kib_msg_t, ibm_u) + body_nob; + + LASSERT (tx->tx_nsp >= 0 && + tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0])); + LASSERT (nob <= IBNAL_MSG_SIZE); + + tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC; + tx->tx_msg->ibm_version = IBNAL_MSG_VERSION; + tx->tx_msg->ibm_type = type; +#if IBNAL_CKSUM + tx->tx_msg->ibm_nob = nob; +#endif + /* Fence the message if it's bundled with an RDMA read */ + fence = (tx->tx_nsp > 0) && + (type == IBNAL_MSG_PUT_DONE); + + *gl = (vv_scatgat_t) { + .v_address = (void *)tx->tx_msg, + .length = nob, + .l_key = tx->l_key, + }; + + wrq->wr_id = kibnal_ptr2wreqid(tx, 0); + wrq->completion_notification = 1; + wrq->scatgat_list = gl; + wrq->num_of_data_segments = 1; + wrq->wr_type = vv_wr_send; + + wrq->type.send.solicited_event = 1; + + wrq->type.send.send_qp_type.rc_type.fance_indicator = fence; + + tx->tx_nsp++; +} + +static void +kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) +{ + unsigned long flags; + + spin_lock_irqsave(&conn->ibc_lock, flags); + + kibnal_queue_tx_locked (tx, conn); + + spin_unlock_irqrestore(&conn->ibc_lock, flags); + + kibnal_check_sends(conn); +} + +static void +kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) +{ + unsigned long flags; + kib_peer_t *peer; + kib_conn_t *conn; + rwlock_t *g_lock = &kibnal_data.kib_global_lock; + + /* If I get here, I've committed to send, so I complete the tx with + * failure on any problems */ + + LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */ + LASSERT (tx->tx_nsp > 0); /* work 
items have been set up */ + + read_lock (g_lock); + + peer = kibnal_find_peer_locked (nid); + if (peer == NULL) { + read_unlock (g_lock); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ + read_unlock (g_lock); + + kibnal_queue_tx (tx, conn); + return; + } + + /* Making one or more connections; I'll need a write lock... */ + read_unlock (g_lock); + write_lock_irqsave (g_lock, flags); + + peer = kibnal_find_peer_locked (nid); + if (peer == NULL) { + write_unlock_irqrestore (g_lock, flags); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + conn = kibnal_find_conn_locked (peer); + if (conn != NULL) { + /* Connection exists; queue message on it */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */ + write_unlock_irqrestore (g_lock, flags); + + kibnal_queue_tx (tx, conn); + return; + } + + if (peer->ibp_connecting == 0) { + if (!time_after_eq(jiffies, peer->ibp_reconnect_time)) { + write_unlock_irqrestore (g_lock, flags); + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + return; + } + + peer->ibp_connecting = 1; + + kib_peer_addref(peer); /* extra ref for connd */ + + spin_lock (&kibnal_data.kib_connd_lock); + + list_add_tail (&peer->ibp_connd_list, + &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock (&kibnal_data.kib_connd_lock); + } + + /* A connection is being established; queue the message... 
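+ * it should be forwarded (or failed) once the connection attempt completes.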
*/ + list_add_tail (&tx->tx_list, &peer->ibp_tx_queue); + + write_unlock_irqrestore (g_lock, flags); +} + +static ptl_err_t +kibnal_start_passive_rdma (int type, ptl_nid_t nid, + lib_msg_t *libmsg, ptl_hdr_t *hdr) +{ + int nob = libmsg->md->length; + kib_tx_t *tx; + kib_msg_t *ibmsg; + int rc; + vv_access_con_bit_mask_t access; + + LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA); + LASSERT (nob > 0); + LASSERT (!in_interrupt()); /* Mapping could block */ + + access = vv_acc_l_mem_write | vv_acc_r_mem_write | vv_acc_r_mem_read | vv_acc_mem_bind; + + tx = kibnal_get_idle_tx (1); /* May block; caller is an app thread */ + LASSERT (tx != NULL); + + if ((libmsg->md->options & PTL_MD_KIOV) == 0) + rc = kibnal_map_iov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.iov, + 0, nob, 0); + else + rc = kibnal_map_kiov (tx, access, + libmsg->md->md_niov, + libmsg->md->md_iov.kiov, + 0, nob, 0); + + if (rc != 0) { + CERROR ("Can't map RDMA for "LPX64": %d\n", nid, rc); + goto failed; + } + + if (type == IBNAL_MSG_GET_RDMA) { + /* reply gets finalized when tx completes */ + tx->tx_libmsg[1] = lib_create_reply_msg(&kibnal_lib, + nid, libmsg); + if (tx->tx_libmsg[1] == NULL) { + CERROR ("Can't create reply for GET -> "LPX64"\n", + nid); + rc = -ENOMEM; + goto failed; + } + } + + tx->tx_passive_rdma = 1; + + ibmsg = tx->tx_msg; + + ibmsg->ibm_u.rdma.ibrm_hdr = *hdr; + ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie; + /* map_kiov alrady filled the rdma descs for the whole_mem case */ + if (!kibnal_whole_mem()) { + ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey; + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; + ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; + ibmsg->ibm_u.rdma.ibrm_num_descs = 1; + } + + kibnal_init_tx_msg (tx, type, + kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs)); + + CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr " + LPX64", nob %d\n", + tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey, + tx->tx_md.md_addr, nob); + + /* libmsg gets finalized when tx completes. */ + tx->tx_libmsg[0] = libmsg; + + kibnal_launch_tx(tx, nid); + return (PTL_OK); + + failed: + tx->tx_status = rc; + kibnal_tx_done (tx); + return (PTL_FAIL); +} + +void +kibnal_start_active_rdma (int type, int status, + kib_rx_t *rx, lib_msg_t *libmsg, + unsigned int niov, + struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t nob) +{ + kib_msg_t *rxmsg = rx->rx_msg; + kib_msg_t *txmsg; + kib_tx_t *tx; + vv_access_con_bit_mask_t access; + vv_wr_operation_t rdma_op; + int rc; + __u32 i; + + CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n", + type, status, niov, offset, nob); + + /* Called by scheduler */ + LASSERT (!in_interrupt ()); + + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + /* No data if we're completing with failure */ + LASSERT (status == 0 || nob == 0); + + LASSERT (type == IBNAL_MSG_GET_DONE || + type == IBNAL_MSG_PUT_DONE); + + /* Flag I'm completing the RDMA. Even if I fail to send the + * completion message, I will have tried my best so further + * attempts shouldn't be tried. 
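+ * kibnal_rx() checks rx_rdma after lib_parse() to tell whether a completion was initiated.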
*/ + LASSERT (!rx->rx_rdma); + rx->rx_rdma = 1; + + if (type == IBNAL_MSG_GET_DONE) { + access = 0; + rdma_op = vv_wr_rdma_write; + LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA); + } else { + access = vv_acc_l_mem_write; + rdma_op = vv_wr_rdma_read; + LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA); + } + + tx = kibnal_get_idle_tx (0); /* Mustn't block */ + if (tx == NULL) { + CERROR ("tx descs exhausted on RDMA from "LPX64 + " completing locally with failure\n", + rx->rx_conn->ibc_peer->ibp_nid); + lib_finalize (&kibnal_lib, NULL, libmsg, PTL_NO_SPACE); + return; + } + LASSERT (tx->tx_nsp == 0); + + if (nob == 0) + GOTO(init_tx, 0); + + /* We actually need to transfer some data (the transfer + * size could get truncated to zero when the incoming + * message is matched) */ + if (kiov != NULL) + rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1); + else + rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1); + + if (rc != 0) { + CERROR ("Can't map RDMA -> "LPX64": %d\n", + rx->rx_conn->ibc_peer->ibp_nid, rc); + /* We'll skip the RDMA and complete with failure. */ + status = rc; + nob = 0; + GOTO(init_tx, rc); + } + + if (!kibnal_whole_mem()) { + tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey; + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr; + tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob; + tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1; + } + + /* XXX ugh. different page-sized hosts. */ + if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs != + rxmsg->ibm_u.rdma.ibrm_num_descs) { + CERROR("tx descs (%u) != rx descs (%u)\n", + tx->tx_msg->ibm_u.rdma.ibrm_num_descs, + rxmsg->ibm_u.rdma.ibrm_num_descs); + /* We'll skip the RDMA and complete with failure. */ + status = rc; + nob = 0; + GOTO(init_tx, rc); + } + + /* map_kiov filled in the rdma descs which describe our side of the + * rdma transfer. */ + /* ibrm_num_descs was verified in rx_callback */ + for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) { + kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */ + vv_scatgat_t *ds = &tx->tx_gl[i]; + vv_wr_t *wrq = &tx->tx_wrq[i]; + + ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i]; + rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i]; + + ds->v_address = (void *)(unsigned long)ldesc->rd_addr; + ds->length = ldesc->rd_nob; + ds->l_key = tx->tx_msg->ibm_u.rdma.rd_key; + + wrq->wr_id = kibnal_ptr2wreqid(tx, 0); + +#if 0 + /* only the last rdma post triggers tx completion */ + if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1) + wrq->completion_notification = 1; + else + wrq->completion_notification = 0; + +#else + /* TODO: hack. Right now complete everything, else the + * driver will deadlock. This is less efficient than + * requestion a notification for only a few of the + * WQE. 
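+ *
+ * (Each work request in this loop is tagged via kibnal_ptr2wreqid(tx, 0).
+ * The actual encoding lives elsewhere in this NAL; purely as an
+ * illustration, a 64-bit work-request id can fold a descriptor pointer
+ * and a small flag together like this, assuming the pointer is at least
+ * 2-byte aligned so bit 0 is free:
+ *
+ *   #include <stdint.h>
+ *   #include <assert.h>
+ *
+ *   static uint64_t demo_ptr2wreqid(void *ptr, int isrx)
+ *   {
+ *           uintptr_t lptr = (uintptr_t)ptr;
+ *
+ *           assert((lptr & 1) == 0);           // low bit must be spare
+ *           return (uint64_t)lptr | (isrx ? 1 : 0);
+ *   }
+ *
+ *   static void *demo_wreqid2ptr(uint64_t id)  { return (void *)(uintptr_t)(id & ~1ULL); }
+ *   static int   demo_wreqid2isrx(uint64_t id) { return (int)(id & 1); }
+ *
+ * The completion handler can then recover both the descriptor and the
+ * tx/rx flag from the work-request id carried in the completion alone.)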
*/ + wrq->completion_notification = 1; +#endif + + wrq->scatgat_list = ds; + wrq->num_of_data_segments = 1; + wrq->wr_type = rdma_op; + + wrq->type.send.solicited_event = 0; + + wrq->type.send.send_qp_type.rc_type.fance_indicator = 0; + wrq->type.send.send_qp_type.rc_type.r_addr = rdesc->rd_addr; + wrq->type.send.send_qp_type.rc_type.r_r_key = rxmsg->ibm_u.rdma.rd_key; + + CDEBUG(D_NET, "prepared RDMA with r_addr=%llx r_key=%x\n", + wrq->type.send.send_qp_type.rc_type.r_addr, + wrq->type.send.send_qp_type.rc_type.r_r_key); + + tx->tx_nsp++; + } + +init_tx: + txmsg = tx->tx_msg; + + txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie; + txmsg->ibm_u.completion.ibcm_status = status; + + kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t)); + + if (status == 0 && nob != 0) { + LASSERT (tx->tx_nsp > 1); + /* RDMA: libmsg gets finalized when the tx completes. This + * is after the completion message has been sent, which in + * turn is after the RDMA has finished. */ + tx->tx_libmsg[0] = libmsg; + } else { + LASSERT (tx->tx_nsp == 1); + /* No RDMA: local completion happens now! */ + CDEBUG(D_WARNING,"No data: immediate completion\n"); + lib_finalize (&kibnal_lib, NULL, libmsg, + status == 0 ? PTL_OK : PTL_FAIL); + } + + /* +1 ref for this tx... */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + rx->rx_conn, rx->rx_conn->ibc_state, + rx->rx_conn->ibc_peer->ibp_nid, + atomic_read (&rx->rx_conn->ibc_refcount)); + atomic_inc (&rx->rx_conn->ibc_refcount); + /* ...and queue it up */ + kibnal_queue_tx(tx, rx->rx_conn); +} + +static ptl_err_t +kibnal_sendmsg(lib_nal_t *nal, + void *private, + lib_msg_t *libmsg, + ptl_hdr_t *hdr, + int type, + ptl_nid_t nid, + ptl_pid_t pid, + unsigned int payload_niov, + struct iovec *payload_iov, + ptl_kiov_t *payload_kiov, + size_t payload_offset, + size_t payload_nob) +{ + kib_msg_t *ibmsg; + kib_tx_t *tx; + int nob; + + /* NB 'private' is different depending on what we're sending.... */ + + CDEBUG(D_NET, "sending "LPSZ" bytes in %d frags to nid:"LPX64 + " pid %d\n", payload_nob, payload_niov, nid , pid); + + LASSERT (payload_nob == 0 || payload_niov > 0); + LASSERT (payload_niov <= PTL_MD_MAX_IOV); + + /* Thread context if we're sending payload */ + LASSERT (!in_interrupt() || payload_niov == 0); + /* payload is either all vaddrs or all pages */ + LASSERT (!(payload_kiov != NULL && payload_iov != NULL)); + + switch (type) { + default: + LBUG(); + return (PTL_FAIL); + + case PTL_MSG_REPLY: { + /* reply's 'private' is the incoming receive */ + kib_rx_t *rx = private; + + /* RDMA reply expected? */ + if (rx->rx_msg->ibm_type == IBNAL_MSG_GET_RDMA) { + kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0, + rx, libmsg, payload_niov, + payload_iov, payload_kiov, + payload_offset, payload_nob); + return (PTL_OK); + } + + /* Incoming message consistent with immediate reply? */ + if (rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) { + CERROR ("REPLY to "LPX64" bad opbm type %d!!!\n", + nid, rx->rx_msg->ibm_type); + return (PTL_FAIL); + } + + /* Will it fit in a message? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob > IBNAL_MSG_SIZE) { + CERROR("REPLY for "LPX64" too big (RDMA not requested): %d (max for message is %d)\n", + nid, payload_nob, IBNAL_MSG_SIZE); + return (PTL_FAIL); + } + break; + } + + case PTL_MSG_GET: + /* might the REPLY message be big enough to need RDMA? 
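+ *
+ * (This test, like the REPLY check above and the PUT check below, sizes
+ * the would-be immediate message with offsetof() over the flexible
+ * payload array and only falls back to RDMA when that exceeds the
+ * pre-posted message buffer.  The arithmetic in isolation, with a
+ * placeholder message size:
+ *
+ *   #include <stddef.h>
+ *   #include <stdint.h>
+ *
+ *   typedef struct {
+ *           uint64_t hdr[9];               // stand-in for the portals header
+ *           char     payload[0];           // flexible payload area
+ *   } demo_msg_t;
+ *
+ *   #define DEMO_MSG_SIZE 4096             // placeholder for IBNAL_MSG_SIZE
+ *
+ *   // 1 when a payload of 'nob' bytes still fits in one immediate message
+ *   static int fits_immediate(size_t nob)
+ *   {
+ *           return offsetof(demo_msg_t, payload[nob]) <= DEMO_MSG_SIZE;
+ *   }
+ *
+ * Anything larger goes through the passive-RDMA path instead.)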
*/ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[libmsg->md->length]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, + nid, libmsg, hdr)); + break; + + case PTL_MSG_ACK: + LASSERT (payload_nob == 0); + break; + + case PTL_MSG_PUT: + /* Is the payload big enough to need RDMA? */ + nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]); + if (nob > IBNAL_MSG_SIZE) + return (kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA, + nid, libmsg, hdr)); + + break; + } + + tx = kibnal_get_idle_tx(!(type == PTL_MSG_ACK || + type == PTL_MSG_REPLY || + in_interrupt())); + if (tx == NULL) { + CERROR ("Can't send %d to "LPX64": tx descs exhausted%s\n", + type, nid, in_interrupt() ? " (intr)" : ""); + return (PTL_NO_SPACE); + } + + ibmsg = tx->tx_msg; + ibmsg->ibm_u.immediate.ibim_hdr = *hdr; + + if (payload_nob > 0) { + if (payload_kiov != NULL) + lib_copy_kiov2buf(ibmsg->ibm_u.immediate.ibim_payload, + payload_niov, payload_kiov, + payload_offset, payload_nob); + else + lib_copy_iov2buf(ibmsg->ibm_u.immediate.ibim_payload, + payload_niov, payload_iov, + payload_offset, payload_nob); + } + + kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, + offsetof(kib_immediate_msg_t, + ibim_payload[payload_nob])); + + /* libmsg gets finalized when tx completes */ + tx->tx_libmsg[0] = libmsg; + + kibnal_launch_tx(tx, nid); + return (PTL_OK); +} + +static ptl_err_t +kibnal_send (lib_nal_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, struct iovec *payload_iov, + size_t payload_offset, size_t payload_len) +{ + CDEBUG(D_NET, " pid = %d, nid="LPU64"\n", + pid, nid); + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, payload_iov, NULL, + payload_offset, payload_len)); +} + +static ptl_err_t +kibnal_send_pages (lib_nal_t *nal, void *private, lib_msg_t *cookie, + ptl_hdr_t *hdr, int type, ptl_nid_t nid, ptl_pid_t pid, + unsigned int payload_niov, ptl_kiov_t *payload_kiov, + size_t payload_offset, size_t payload_len) +{ + return (kibnal_sendmsg(nal, private, cookie, + hdr, type, nid, pid, + payload_niov, NULL, payload_kiov, + payload_offset, payload_len)); +} + +static ptl_err_t +kibnal_recvmsg (lib_nal_t *nal, void *private, lib_msg_t *libmsg, + unsigned int niov, struct iovec *iov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) +{ + kib_rx_t *rx = private; + kib_msg_t *rxmsg = rx->rx_msg; + int msg_nob; + + LASSERT (mlen <= rlen); + LASSERT (!in_interrupt ()); + /* Either all pages or all vaddrs */ + LASSERT (!(kiov != NULL && iov != NULL)); + + switch (rxmsg->ibm_type) { + default: + LBUG(); + return (PTL_FAIL); + + case IBNAL_MSG_IMMEDIATE: + msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]); + if (msg_nob > IBNAL_MSG_SIZE) { + CERROR ("Immediate message from "LPX64" too big: %d\n", + rxmsg->ibm_u.immediate.ibim_hdr.src_nid, rlen); + return (PTL_FAIL); + } + + if (kiov != NULL) + lib_copy_buf2kiov(niov, kiov, offset, + rxmsg->ibm_u.immediate.ibim_payload, + mlen); + else + lib_copy_buf2iov(niov, iov, offset, + rxmsg->ibm_u.immediate.ibim_payload, + mlen); + + lib_finalize (nal, NULL, libmsg, PTL_OK); + return (PTL_OK); + + case IBNAL_MSG_GET_RDMA: + /* We get called here just to discard any junk after the + * GET hdr. 
*/ + LASSERT (libmsg == NULL); + lib_finalize (nal, NULL, libmsg, PTL_OK); + return (PTL_OK); + + case IBNAL_MSG_PUT_RDMA: + kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, + rx, libmsg, + niov, iov, kiov, offset, mlen); + return (PTL_OK); + } +} + +static ptl_err_t +kibnal_recv (lib_nal_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, struct iovec *iov, + size_t offset, size_t mlen, size_t rlen) +{ + return (kibnal_recvmsg (nal, private, msg, niov, iov, NULL, + offset, mlen, rlen)); +} + +static ptl_err_t +kibnal_recv_pages (lib_nal_t *nal, void *private, lib_msg_t *msg, + unsigned int niov, ptl_kiov_t *kiov, + size_t offset, size_t mlen, size_t rlen) +{ + return (kibnal_recvmsg (nal, private, msg, niov, NULL, kiov, + offset, mlen, rlen)); +} + +/***************************************************************************** + * the rest of this file concerns connection management. active connetions + * start with connect_peer, passive connections start with passive_callback. + * active disconnects start with conn_close, cm_callback starts passive + * disconnects and contains the guts of how the disconnect state machine + * progresses. + *****************************************************************************/ + +int +kibnal_thread_start (int (*fn)(void *arg), void *arg) +{ + long pid = kernel_thread (fn, arg, 0); + + if (pid < 0) + return ((int)pid); + + atomic_inc (&kibnal_data.kib_nthreads); + return (0); +} + +static void +kibnal_thread_fini (void) +{ + atomic_dec (&kibnal_data.kib_nthreads); +} + +/* this can be called by anyone at any time to close a connection. if + * the connection is still established it heads to the connd to start + * the disconnection in a safe context. It has no effect if called + * on a connection that is already disconnecting */ +void +kibnal_close_conn_locked (kib_conn_t *conn, int error) +{ + /* This just does the immmediate housekeeping, and schedules the + * connection for the connd to finish off. + * Caller holds kib_global_lock exclusively in irq context */ + kib_peer_t *peer = conn->ibc_peer; + + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING, + IBNAL_CONN_DISCONNECTED); + + if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) + return; /* already disconnecting */ + + CDEBUG (error == 0 ? D_NET : D_ERROR, + "closing conn to "LPX64": error %d\n", peer->ibp_nid, error); + + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) { + /* kib_connd_conns takes ibc_list's ref */ + list_del (&conn->ibc_list); + } else { + /* new ref for kib_connd_conns */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + } + + if (list_empty (&peer->ibp_conns) && + peer->ibp_persistence == 0) { + /* Non-persistent peer with no more conns... 
*/ + kibnal_unlink_peer_locked (peer); + } + + conn->ibc_state = IBNAL_CONN_SEND_DREQ; + + spin_lock (&kibnal_data.kib_connd_lock); + + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock (&kibnal_data.kib_connd_lock); +} + +void +kibnal_close_conn (kib_conn_t *conn, int error) +{ + unsigned long flags; + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + kibnal_close_conn_locked (conn, error); + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); +} + +static void +kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc) +{ + LIST_HEAD (zombies); + kib_tx_t *tx; + unsigned long flags; + + LASSERT (rc != 0); + LASSERT (peer->ibp_reconnect_interval >= IBNAL_MIN_RECONNECT_INTERVAL); + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + LASSERT (peer->ibp_connecting != 0); + peer->ibp_connecting--; + if (peer->ibp_connecting != 0) { + /* another connection attempt under way (loopback?)... */ + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + return; + } + + if (list_empty(&peer->ibp_conns)) { + /* Say when active connection can be re-attempted */ + peer->ibp_reconnect_time = jiffies + peer->ibp_reconnect_interval; + /* Increase reconnection interval */ + peer->ibp_reconnect_interval = MIN (peer->ibp_reconnect_interval * 2, + IBNAL_MAX_RECONNECT_INTERVAL); + + /* Take peer's blocked blocked transmits; I'll complete + * them with error */ + while (!list_empty (&peer->ibp_tx_queue)) { + tx = list_entry (peer->ibp_tx_queue.next, + kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add_tail (&tx->tx_list, &zombies); + } + + if (kibnal_peer_active(peer) && + (peer->ibp_persistence == 0)) { + /* failed connection attempt on non-persistent peer */ + kibnal_unlink_peer_locked (peer); + } + } else { + /* Can't have blocked transmits if there are connections */ + LASSERT (list_empty(&peer->ibp_tx_queue)); + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + if (!list_empty (&zombies)) + CERROR ("Deleting messages for "LPX64": connection failed\n", + peer->ibp_nid); + + while (!list_empty (&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + /* complete now */ + tx->tx_status = -EHOSTUNREACH; + kibnal_tx_done (tx); + } +} + +static void +kibnal_connreq_done (kib_conn_t *conn, int active, int status) +{ + int state = conn->ibc_state; + kib_peer_t *peer = conn->ibc_peer; + kib_tx_t *tx; + unsigned long flags; + int i; + + CDEBUG(D_NET, "Enter kibnal_connreq_done for conn=%p, active=%d, status=%d\n", + conn, active, status); + + /* passive connection has no connreq & vice versa */ + LASSERTF(!active == !(conn->ibc_connreq != NULL), + "%d %p\n", active, conn->ibc_connreq); + + if (active) { + PORTAL_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + conn->ibc_connreq = NULL; + } + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + LASSERT (peer->ibp_connecting != 0); + + if (status == 0) { + /* connection established... */ + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING); + conn->ibc_state = IBNAL_CONN_ESTABLISHED; + + if (!kibnal_peer_active(peer)) { + /* ...but peer deleted meantime */ + status = -ECONNABORTED; + } + } else { + KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP, + IBNAL_CONN_CONNECTING); + } + + if (status == 0) { + /* Everything worked! 
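+ *
+ * (Before the success path continues, note how the failure path in
+ * kibnal_peer_connect_failed() above backs off: it stamps the earliest
+ * time a new attempt may start and doubles ibp_reconnect_interval up to
+ * a ceiling, resetting it once a connection is established.  The backoff
+ * on its own, with stand-ins for the IBNAL_* tunables:
+ *
+ *   #define HZ_DEMO                 1000            // placeholder tick rate
+ *   #define MIN_RECONNECT_INTERVAL  (1 * HZ_DEMO)
+ *   #define MAX_RECONNECT_INTERVAL  (60 * HZ_DEMO)
+ *
+ *   static unsigned long reconnect_interval = MIN_RECONNECT_INTERVAL;
+ *   static unsigned long reconnect_time;
+ *
+ *   static void connect_failed(unsigned long now)
+ *   {
+ *           // earliest moment the next active connect may be attempted
+ *           reconnect_time = now + reconnect_interval;
+ *           // geometric backoff, clamped to the ceiling
+ *           reconnect_interval *= 2;
+ *           if (reconnect_interval > MAX_RECONNECT_INTERVAL)
+ *                   reconnect_interval = MAX_RECONNECT_INTERVAL;
+ *   }
+ *
+ *   static void connect_succeeded(void)
+ *   {
+ *           reconnect_interval = MIN_RECONNECT_INTERVAL;
+ *   }
+ *
+ * The MIN() form used above is equivalent; the clamp just makes the
+ * ceiling explicit.)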
*/ + + peer->ibp_connecting--; + + /* +1 ref for ibc_list; caller(== CM)'s ref remains until + * the IB_CM_IDLE callback */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + list_add (&conn->ibc_list, &peer->ibp_conns); + + /* reset reconnect interval for next attempt */ + peer->ibp_reconnect_interval = IBNAL_MIN_RECONNECT_INTERVAL; + + /* post blocked sends to the new connection */ + spin_lock (&conn->ibc_lock); + + while (!list_empty (&peer->ibp_tx_queue)) { + tx = list_entry (peer->ibp_tx_queue.next, + kib_tx_t, tx_list); + + list_del (&tx->tx_list); + + /* +1 ref for each tx */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + kibnal_queue_tx_locked (tx, conn); + } + + spin_unlock (&conn->ibc_lock); + + /* Nuke any dangling conns from a different peer instance... */ + kibnal_close_stale_conns_locked (conn->ibc_peer, + conn->ibc_incarnation); + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + /* queue up all the receives */ + for (i = 0; i < IBNAL_RX_MSGS; i++) { + /* +1 ref for rx desc */ + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_inc (&conn->ibc_refcount); + + CDEBUG(D_NET, "RX[%d] %p->%p\n", + i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg); + + kibnal_post_rx (&conn->ibc_rxs[i], 0); + } + + kibnal_check_sends (conn); + return; + } + + /* connection failed */ + if (state == IBNAL_CONN_CONNECTING) { + /* schedule for connd to close */ + kibnal_close_conn_locked (conn, status); + } else { + /* Don't have a CM comm_id; just wait for refs to drain */ + conn->ibc_state = IBNAL_CONN_DISCONNECTED; + } + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + kibnal_peer_connect_failed (conn->ibc_peer, active, status); + + /* If we didn't establish the connection we don't have to pass + * through the disconnect protocol before dropping the CM ref */ + if (state < IBNAL_CONN_CONNECTING) + kibnal_put_conn (conn); +} + +static int +kibnal_accept (kib_conn_t **connp, cm_cep_handle_t *cep, + ptl_nid_t nid, __u64 incarnation, int queue_depth) +{ + kib_conn_t *conn = kibnal_create_conn(); + kib_peer_t *peer; + kib_peer_t *peer2; + unsigned long flags; + + if (conn == NULL) + return (-ENOMEM); + + if (queue_depth != IBNAL_MSG_QUEUE_SIZE) { + CERROR("Can't accept "LPX64": bad queue depth %d (%d expected)\n", + nid, queue_depth, IBNAL_MSG_QUEUE_SIZE); + atomic_dec (&conn->ibc_refcount); + kibnal_destroy_conn(conn); + return (-EPROTO); + } + + /* assume 'nid' is a new peer */ + peer = kibnal_create_peer (nid); + if (peer == NULL) { + CDEBUG(D_NET, "--conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, conn->ibc_peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + atomic_dec (&conn->ibc_refcount); + kibnal_destroy_conn(conn); + return (-ENOMEM); + } + + write_lock_irqsave (&kibnal_data.kib_global_lock, flags); + + peer2 = kibnal_find_peer_locked(nid); + if (peer2 == NULL) { + /* peer table takes my ref on peer */ + list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid)); + } else { + kib_peer_decref (peer); + peer = peer2; + } + + kib_peer_addref(peer); /* +1 ref for conn */ + peer->ibp_connecting++; + + write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags); + + conn->ibc_peer = 
peer; + conn->ibc_state = IBNAL_CONN_CONNECTING; + /* conn->ibc_cep is set when cm_accept is called */ + conn->ibc_incarnation = incarnation; + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + + *connp = conn; + return (0); +} + +static void kibnal_move_qp_to_error(kib_conn_t *conn) +{ + vv_qp_attr_t qp_attr; + vv_return_t retval; + + qp_attr.modify.qp_modify_into_state = vv_qp_state_error; + qp_attr.modify.vv_qp_attr_mask = VV_QP_AT_STATE; + qp_attr.modify.qp_type = vv_qp_type_r_conn; + + retval = vv_qp_modify(kibnal_data.kib_hca, conn->ibc_qp, &qp_attr, &conn->ibc_qp_attrs); + if (retval) + CERROR("couldn't move qp into error state, error %d\n", retval); +} + +static void kibnal_flush_pending(kib_conn_t *conn) +{ + LIST_HEAD (zombies); + struct list_head *tmp; + struct list_head *nxt; + kib_tx_t *tx; + unsigned long flags; + int done; + + /* NB we wait until the connection has closed before completing + * outstanding passive RDMAs so we can be sure the network can't + * touch the mapped memory any more. */ + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED); + + /* set the QP to the error state so that we get flush callbacks + * on our posted receives which can then drop their conn refs */ + kibnal_move_qp_to_error(conn); + + spin_lock_irqsave (&conn->ibc_lock, flags); + + /* grab passive RDMAs not waiting for the tx callback */ + list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + /* still waiting for tx callback? */ + if (!tx->tx_passive_rdma_wait) + continue; + + tx->tx_status = -ECONNABORTED; + tx->tx_passive_rdma_wait = 0; + done = (tx->tx_sending == 0); + + if (!done) + continue; + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + /* grab all blocked transmits */ + list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) { + tx = list_entry (tmp, kib_tx_t, tx_list); + + list_del (&tx->tx_list); + list_add (&tx->tx_list, &zombies); + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + while (!list_empty(&zombies)) { + tx = list_entry (zombies.next, kib_tx_t, tx_list); + + list_del(&tx->tx_list); + kibnal_tx_done (tx); + } +} + +static void +kibnal_reject (cm_cep_handle_t cep, cm_rej_code_t reason) +{ + cm_reject_data_t *rej; + + PORTAL_ALLOC(rej, sizeof(*rej)); + if (rej == NULL) /* PORTAL_ALLOC() will CERROR on failure */ + return; + + rej->reason = reason; + cm_reject(cep, rej); + PORTAL_FREE(rej, sizeof(*rej)); +} + +static void get_av_from_path(ib_path_record_v2_t *path, vv_add_vec_t *av) +{ + av->service_level = path->sl; + av->grh_flag = 0; /* TODO: correct? */ + av->dlid = path->dlid; + av->pmtu = path->mtu; + + /* From sdp-hca-params.h. */ + switch(path->rate) { + case 2: + av->max_static_rate = 1; + break; + case 3: + case 4: + default: + av->max_static_rate = 0; + break; + } + + av->l_ack_timeout = IBNAL_ACK_TIMEOUT; + av->retry_count = IBNAL_RETRY; + av->rnr_retry_count = IBNAL_RNR_RETRY; + av->source_path_bit = 0; + + av->global_dest.flow_lable = path->flow_label; + av->global_dest.hope_limit = path->hop_limut; + av->global_dest.traffic_class = path->traffic_class; + av->global_dest.s_gid_index = 0; + av->global_dest.d_gid = path->dgid; +}; + +static vv_return_t +kibnal_qp_rts(vv_qp_h_t qp_handle, __u32 qpn, __u8 resp_res, + ib_path_record_v2_t *path, __u8 init_depth, __u32 send_psn) +{ + vv_qp_attr_t qp_attr; + vv_return_t retval; + + ENTRY; + +#if 1 + /* TODO - Hack. 
I don't know whether I get bad values from the + * stack or if I'm using the wrong names. */ + resp_res = 8; + init_depth = 8; +#endif + + /* RTR */ + qp_attr.modify.qp_modify_into_state = vv_qp_state_rtr; + qp_attr.modify.vv_qp_attr_mask = + VV_QP_AT_STATE | + VV_QP_AT_ADD_VEC | + VV_QP_AT_DEST_QP | + VV_QP_AT_R_PSN | + VV_QP_AT_RESP_RDMA_ATOM_OUT_NUM | + VV_QP_AT_MIN_RNR_NAK_T | VV_QP_AT_OP_F; + + qp_attr.modify.qp_type = vv_qp_type_r_conn; + + get_av_from_path(path, &qp_attr.modify.params.rtr.remote_add_vec); + qp_attr.modify.params.rtr.destanation_qp = qpn; + qp_attr.modify.params.rtr.receive_psn = IBNAL_STARTING_PSN; + qp_attr.modify.params.rtr.responder_rdma_r_atom_num = resp_res; + qp_attr.modify.params.rtr.opt_min_rnr_nak_timer = 16; /* 20 ms */ + + /* For now, force MTU to 1KB (Voltaire's advice). */ + qp_attr.modify.params.rtr.remote_add_vec.pmtu = vv_mtu_1024; + + retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL); + if (retval) { + CERROR("Cannot modify QP to RTR: %d\n", retval); + RETURN(retval); + } + + /* RTS */ + qp_attr.modify.qp_modify_into_state = vv_qp_state_rts; + qp_attr.modify.vv_qp_attr_mask = + VV_QP_AT_STATE | + VV_QP_AT_L_ACK_T | + VV_QP_AT_RETRY_NUM | + VV_QP_AT_RNR_NUM | + VV_QP_AT_S_PSN | + VV_QP_AT_DEST_RDMA_ATOM_OUT_NUM; + qp_attr.modify.qp_type = vv_qp_type_r_conn; + + qp_attr.modify.params.rts.local_ack_timeout = path->pkt_life_time + 2; /* 2 or 1? */ + qp_attr.modify.params.rts.retry_num = IBNAL_RETRY; + qp_attr.modify.params.rts.rnr_num = IBNAL_RNR_RETRY; + qp_attr.modify.params.rts.send_psn = send_psn; + qp_attr.modify.params.rts.dest_out_rdma_r_atom_num = init_depth; + qp_attr.modify.params.rts.flow_control = 1; /* Stack does not use it. */ + + retval = vv_qp_modify(kibnal_data.kib_hca, qp_handle, &qp_attr, NULL); + if (retval) { + CERROR("Cannot modify QP to RTS: %d\n", retval); + } + + RETURN(retval); +} + +static void +kibnal_connect_reply (cm_cep_handle_t cep, cm_conn_data_t *info, kib_conn_t *conn) +{ + vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs; + kib_wire_connreq_t *wcr; + cm_reply_data_t *rep = &info->data.reply; + cm_rej_code_t reason; + vv_return_t retval; + + wcr = (kib_wire_connreq_t *)info->data.reply.priv_data; + + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { + CERROR ("Can't connect "LPX64": bad magic %08x\n", + conn->ibc_peer->ibp_nid, le32_to_cpu(wcr->wcr_magic)); + GOTO(reject, reason = cm_rej_code_usr_rej); + } + + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { + CERROR ("Can't connect "LPX64": bad version %d\n", + conn->ibc_peer->ibp_nid, le16_to_cpu(wcr->wcr_magic)); + GOTO(reject, reason = cm_rej_code_usr_rej); + } + + if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) { + CERROR ("Can't connect "LPX64": bad queue depth %d\n", + conn->ibc_peer->ibp_nid, + le16_to_cpu(wcr->wcr_queue_depth)); + GOTO(reject, reason = cm_rej_code_usr_rej); + } + + if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) { + CERROR ("Unexpected NID "LPX64" from "LPX64"\n", + le64_to_cpu(wcr->wcr_nid), conn->ibc_peer->ibp_nid); + GOTO(reject, reason = cm_rej_code_usr_rej); + } + + CDEBUG(D_NET, "Connection %p -> "LPX64" REP_RECEIVED.\n", + conn, conn->ibc_peer->ibp_nid); + + conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation); + conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE; + + retval = kibnal_qp_rts(conn->ibc_qp, rep->qpn, + min_t(__u8, rep->arb_initiator_depth, + ca_attr->max_read_atom_qp_outstanding), + &conn->ibc_connreq->cr_path, + min_t(__u8, rep->arb_resp_res, + 
ca_attr->max_qp_depth_for_init_read_atom), + rep->start_psn); + + if (retval) { + CERROR("Connection %p -> "LPX64" QP RTS/RTR failed: %d\n", + conn, conn->ibc_peer->ibp_nid, retval); + GOTO(reject, reason = cm_rej_code_no_qp); + } + + dump_qp(conn); + + /* the callback arguments are ignored for an active accept */ + /* TODO: memset cmrtu? */ + retval = cm_accept(cep, NULL, &conn->ibc_connreq->cr_cm_rtu, kibnal_cm_callback, conn); + if (retval) { + CERROR("Connection %p -> "LPX64" CMAccept RTU failed: %d\n", + conn, conn->ibc_peer->ibp_nid, retval); + kibnal_connreq_done (conn, 1, -ECONNABORTED); + /* XXX don't call reject after accept fails? */ + return; + } + + CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n", + conn, conn->ibc_peer->ibp_nid); + + kibnal_connreq_done (conn, 1, 0); + + return; + +reject: + kibnal_reject(cep, reason); + kibnal_connreq_done (conn, 1, -EPROTO); +} + +/* Off level CM callback */ +static void +_kibnal_cm_callback(void * arg) +{ + struct cm_off_level *cm_tq = arg; + cm_cep_handle_t cep = cm_tq->cep; + cm_conn_data_t *info = cm_tq->info; + kib_conn_t *conn = cm_tq->conn; + vv_return_t retval; + + CDEBUG(D_NET, "CM event 0x%x for CEP %p\n", info->status, cep); + + PORTAL_FREE(cm_tq, sizeof(*cm_tq)); + + /* Established Connection Notifier */ + switch (info->status) { + case cm_event_connected: + CDEBUG(D_NET, "Connection %p -> "LPX64" Established\n", + conn, conn->ibc_peer->ibp_nid); + kibnal_connreq_done (conn, 0, 0); + break; + + case cm_event_conn_timeout: + case cm_event_conn_reject: + /* TODO: be sure this is called only if REQ times out. */ + CERROR("connection timed out\n"); + LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING); + conn->ibc_state = IBNAL_CONN_INIT_QP; + kibnal_connreq_done (conn, 1, -EINVAL); + break; + + case cm_event_conn_reply: + kibnal_connect_reply(cep, info, conn); + break; + + case cm_event_disconn_request: + /* XXX lock around these state management bits? */ + if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) + kibnal_close_conn (conn, 0); + conn->ibc_state = IBNAL_CONN_DREP; + + retval = cm_disconnect(conn->ibc_cep, NULL, &kibnal_data.cm_data.drep_data); + if (retval) + CERROR("disconnect rep failed: %d\n", retval); + + /* Fall through ... 
*/ + + /* these both guarantee that no more cm callbacks will occur */ + case cm_event_disconnected: /* aka cm_event_disconn_timeout */ + case cm_event_disconn_reply: + CDEBUG(D_NET, "Connection %p -> "LPX64" disconnect done.\n", + conn, conn->ibc_peer->ibp_nid); + + conn->ibc_state = IBNAL_CONN_DISCONNECTED; + kibnal_flush_pending(conn); + kibnal_put_conn(conn); /* Lose CM's ref */ + break; + + default: + CERROR("unknown status %d on Connection %p -> "LPX64"\n", + info->status, conn, conn->ibc_peer->ibp_nid); + LBUG(); + break; + } + + return; +} + +static void +kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg) +{ + struct cm_off_level *cm_tq; + + LASSERT(cep); + LASSERT(info); + + CDEBUG(D_NET, "CM event 0x%x for CEP %p\n", info->status, cep); + + PORTAL_ALLOC_ATOMIC(cm_tq, sizeof(*cm_tq)); + if (cm_tq == NULL) { + CERROR("Failed to allocate a CM off level structure\n"); + return; + } + + cm_tq->tq.sync = 0; + cm_tq->tq.routine = _kibnal_cm_callback; + cm_tq->tq.data = cm_tq; + + cm_tq->cep = cep; + cm_tq->info = info; + cm_tq->conn = (kib_conn_t *)arg; + + schedule_task(&cm_tq->tq); +} + +static int +kibnal_set_cm_flags(cm_cep_handle_t cep) +{ +#ifdef TODO +voltaire cm doesnot appear to have that functionnality + FSTATUS frc; + uint32 value = 1; + + frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK, + (char *)&value, sizeof(value), 0); + if (frc != FSUCCESS) { + CERROR("error setting timeout callback: %d\n", frc); + return -1; + } + +#if 0 + frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value, + sizeof(value), 0); + if (frc != FSUCCESS) { + CERROR("error setting async accept: %d\n", frc); + return -1; + } +#endif +#endif + + return 0; +} + +/* Off level listen callback */ +static void +_kibnal_listen_callback(void *arg) +{ + struct cm_off_level *cm_tq = arg; + cm_cep_handle_t cep = cm_tq->cep; + cm_conn_data_t *info = cm_tq->info; + vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs; + cm_request_data_t *req; + cm_reply_data_t *rep = NULL; + kib_wire_connreq_t *wcr; + kib_conn_t *conn = NULL; + cm_rej_code_t reason = 0; + int rc = 0; + vv_return_t retval; + vv_qp_attr_t *query; + void *qp_context; + + LASSERT(cep); + LASSERT(info); + + CDEBUG(D_NET, "LISTEN status 0x%x for CEP %p\n", info->status, cep); + + PORTAL_FREE(cm_tq, sizeof(*cm_tq)); + + req = &info->data.request; + wcr = (kib_wire_connreq_t *)req->priv_data; + + CDEBUG(D_NET, "%d from "LPX64"\n", info->status, + le64_to_cpu(wcr->wcr_nid)); + +#ifdef TODO + is there an equivalent? + if (info->status == FCM_CONNECT_CANCEL) + return; +#endif + + LASSERT (info->status == cm_event_conn_request); + + if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) { + CERROR ("Can't accept: bad magic %08x\n", + le32_to_cpu(wcr->wcr_magic)); + GOTO(out, reason = cm_rej_code_usr_rej); + } + + if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) { + CERROR ("Can't accept: bad version %d\n", + le16_to_cpu(wcr->wcr_magic)); + GOTO(out, reason = cm_rej_code_usr_rej); + } + + rc = kibnal_accept(&conn, cep, + le64_to_cpu(wcr->wcr_nid), + le64_to_cpu(wcr->wcr_incarnation), + le16_to_cpu(wcr->wcr_queue_depth)); + if (rc != 0) { + CERROR ("Can't accept "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), rc); + GOTO(out, reason = cm_rej_code_no_res); + } + + /* TODO: I hope I got the ca_attr names correctly. 
*/ + retval = kibnal_qp_rts(conn->ibc_qp, req->cep_data.qpn, + min_t(__u8, req->cep_data.offered_initiator_depth, + ca_attr->max_read_atom_qp_outstanding), + &req->path_data.path, + min_t(__u8, req->cep_data.offered_resp_res, + ca_attr->max_qp_depth_for_init_read_atom), + req->cep_data.start_psn); + + if (retval) { + CERROR ("Can't mark QP RTS/RTR "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), retval); + GOTO(out, reason = cm_rej_code_no_qp); + } + + dump_qp(conn); + + retval = vv_qp_query(kibnal_data.kib_hca, conn->ibc_qp, &qp_context, &conn->ibc_qp_attrs); + if (retval) { + CERROR ("Couldn't query qp attributes "LPX64": %d\n", + le64_to_cpu(wcr->wcr_nid), retval); + GOTO(out, reason = cm_rej_code_no_qp); + } + query = &conn->ibc_qp_attrs; + + PORTAL_ALLOC(rep, sizeof(*rep)); + if (rep == NULL) { + CERROR ("can't reply and receive buffers\n"); + GOTO(out, reason = cm_rej_code_insuff_resp_res); + } + + /* don't try to deref this into the incoming wcr :) */ + wcr = (kib_wire_connreq_t *)rep->priv_data; + + *rep = (cm_reply_data_t) { + .qpn = query->query.qp_num, + .start_psn = query->query.receve_psn, + .arb_resp_res = query->query.rdma_r_atom_outstand_num, + .arb_initiator_depth = query->query.rdma_r_atom_outstand_num, + .targ_ack_delay = 0, + .failover_accepted = 0, + .end_to_end_flow_ctrl = 1, /* (query->query.flow_control is never set) */ + .rnr_retry_count = req->cep_data.rtr_retry_cnt, + }; + + *wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), + }; + + retval = cm_accept(cep, rep, NULL, kibnal_cm_callback, conn); + + PORTAL_FREE(rep, sizeof(*rep)); + + if (retval) { + /* XXX it seems we don't call reject after this point? */ + CERROR("cm_accept() failed: %d, aborting\n", retval); + rc = -ECONNABORTED; + goto out; + } + + if (kibnal_set_cm_flags(conn->ibc_cep)) { + rc = -ECONNABORTED; + goto out; + } + + conn->ibc_cep = cep; + + CDEBUG(D_WARNING, "Connection %p -> "LPX64" ESTABLISHED.\n", + conn, conn->ibc_peer->ibp_nid); + +out: + if (reason) { + kibnal_reject(cep, reason); + rc = -ECONNABORTED; + } + + return; +} + +void +kibnal_listen_callback(cm_cep_handle_t cep, cm_conn_data_t *info, void *arg) +{ + struct cm_off_level *cm_tq; + + LASSERT(cep); + LASSERT(info); + LASSERT(arg == NULL); /* no conn yet for passive */ + + PORTAL_ALLOC_ATOMIC(cm_tq, sizeof(*cm_tq)); + if (cm_tq == NULL) { + CERROR("Failed to allocate a CM off level structure\n"); + return; + } + + cm_tq->tq.sync = 0; + cm_tq->tq.routine = _kibnal_listen_callback; + cm_tq->tq.data = cm_tq; + + cm_tq->cep = cep; + cm_tq->info = info; + cm_tq->conn = NULL; + + schedule_task(&cm_tq->tq); +} + +static void +kibnal_pathreq_callback (struct sa_request *request) +{ + vv_hca_attrib_t *ca_attr = &kibnal_data.kib_hca_attrs; + kib_conn_t *conn = request->context; + gsi_dtgrm_t *dtgrm; + sa_mad_v2_t *mad; + ib_path_record_v2_t *path; + u64 component_mask; + cm_return_t cmret; + + if (request->status) { + CERROR ("status %d\n", request->status); + free_sa_request(request); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + dtgrm = request->dtgrm_resp; + mad = (sa_mad_v2_t *) dtgrm->mad; + path = (ib_path_record_v2_t *) mad->payload; + + /* Put the path record in host order for that stack. 
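+ *
+ * (The fields below arrive big-endian on the wire; the helpers flip them
+ * into host order before the record is handed to the verbs stack.  The
+ * same idea in plain user-space C, with a cut-down record and the glibc
+ * endian.h helpers standing in for be16_to_cpu() and friends:
+ *
+ *   #include <stdint.h>
+ *   #include <endian.h>              // be16toh(), be32toh()
+ *
+ *   struct demo_path_record {        // cut-down stand-in, not the real layout
+ *           uint16_t slid;
+ *           uint16_t dlid;
+ *           uint32_t flow_label;
+ *           uint16_t pkey;
+ *   };
+ *
+ *   static void path_record_to_host(struct demo_path_record *p)
+ *   {
+ *           p->slid       = be16toh(p->slid);
+ *           p->dlid       = be16toh(p->dlid);
+ *           p->flow_label = be32toh(p->flow_label);
+ *           p->pkey       = be16toh(p->pkey);
+ *   }
+ *
+ * The GIDs are swapped separately because they are 128-bit quantities.)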
*/ + gid_swap(&path->sgid); + gid_swap(&path->dgid); + path->slid = be16_to_cpu(path->slid); + path->dlid = be16_to_cpu(path->dlid); + path->flow_label = be32_to_cpu(path->flow_label); + path->pkey = be16_to_cpu(path->pkey); + path->sl = be16_to_cpu(path->sl); + + CDEBUG(D_NET, "sgid "LPX64":"LPX64" dgid " + LPX64":"LPX64" pkey %x\n", + path->sgid.scope.g.subnet, + path->sgid.scope.g.eui64, + path->dgid.scope.g.subnet, + path->dgid.scope.g.eui64, + path->pkey); + +#if TODO + component_mask = be64_to_cpu(mad->component_mask); + if ((component_mask && (1ull << 1)) == 0) { + CERROR ("no servivce GID in SR: "LPX64"\n", component_mask); + free_sa_request(request); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } +#endif + + conn->ibc_connreq->cr_path = *path; + + free_sa_request(request); + + conn->ibc_cep = cm_create_cep(cm_cep_transp_rc); + if (conn->ibc_cep == NULL) { + CERROR ("Can't create CEP\n"); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + if (kibnal_set_cm_flags(conn->ibc_cep)) { + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) { + .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC), + .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION), + .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE), + .wcr_nid = cpu_to_le64(kibnal_data.kib_nid), + .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation), + }; + + conn->ibc_connreq->cr_cm_req = (cm_request_data_t) { + .sid = kibnal_data.kib_service_id, + .cep_data = (cm_cep_data_t) { + .ca_guid = kibnal_data.kib_hca_attrs.guid, + .end_to_end_flow_ctrl = 1, + .port_guid = kibnal_data.kib_port_gid.scope.g.eui64, + .local_port_num = kibnal_data.kib_port, + .start_psn = IBNAL_STARTING_PSN, + .qpn = conn->ibc_qp_attrs.query.qp_num, + .retry_cnt = IBNAL_RETRY, + .rtr_retry_cnt = IBNAL_RNR_RETRY, + .ack_timeout = IBNAL_ACK_TIMEOUT, + .offered_resp_res = ca_attr->max_read_atom_qp_outstanding, + .offered_initiator_depth = ca_attr->max_qp_depth_for_init_read_atom, + }, + .path_data = (cm_cep_path_data_t) { + .subn_local = TRUE, + .path = conn->ibc_connreq->cr_path, + }, + }; + +#if 0 + /* XXX set timeout just like SDP!!!*/ + conn->ibc_connreq->cr_path.packet_life = 13; +#endif + /* Flag I'm getting involved with the CM... 
*/ + conn->ibc_state = IBNAL_CONN_CONNECTING; + +#if 0 + CDEBUG(D_NET, "Connecting to, service id "LPX64", on "LPX64"\n", + conn->ibc_connreq->cr_service.RID.ServiceID, + *kibnal_service_nid_field(&conn->ibc_connreq->cr_service)); +#endif + + memset(conn->ibc_connreq->cr_cm_req.priv_data, 0, + cm_REQ_priv_data_len); + memcpy(conn->ibc_connreq->cr_cm_req.priv_data, + &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr)); + + /* kibnal_cm_callback gets my conn ref */ + cmret = cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cm_req, + kibnal_cm_callback, conn); + + if (cmret) { + CERROR ("Connect failed: %d\n", cmret); + /* Back out state change as connect failed */ + conn->ibc_state = IBNAL_CONN_INIT_QP; + kibnal_connreq_done (conn, 1, -EINVAL); + } + + CDEBUG(D_NET, "connection REQ sent\n"); +} + +static void +kibnal_service_get_callback (struct sa_request *request) +{ + kib_conn_t *conn = request->context; + gsi_dtgrm_t *dtgrm; + sa_mad_v2_t *mad; + ib_service_record_v2_t *sr; + u64 component_mask; + int ret; + + if (request->status) { + CERROR ("status %d\n", request->status); + free_sa_request(request); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + dtgrm = request->dtgrm_resp; + mad = (sa_mad_v2_t *) dtgrm->mad; + sr = (ib_service_record_v2_t *) mad->payload; + + CDEBUG(D_NET, "sid "LPX64" gid "LPX64":"LPX64" pkey %x\n", + sr->service_id, + sr->service_gid.scope.g.subnet, + sr->service_gid.scope.g.eui64, + sr->service_pkey); + + component_mask = be64_to_cpu(mad->component_mask); + if ((component_mask && (1ull << 1)) == 0) { + CERROR ("no service GID in SR: "LPX64"\n", component_mask); + free_sa_request(request); + kibnal_connreq_done (conn, 1, -EINVAL); + return; + } + + //conn->ibc_connreq->cr_service = sr; + + /* Return the response datagram to its pool. We don't need it anymore. */ + gsi_dtgrm_pool_put(request->dtgrm_resp); + request->dtgrm_resp = NULL; + + /* kibnal_pathreq_callback gets my conn ref */ + ret = kibnal_pathrecord_op(request, sr->service_gid, kibnal_pathreq_callback, conn); + if (ret) { + CERROR ("Path record request failed: %d\n", ret); + kibnal_connreq_done (conn, 1, -EINVAL); + } + + return; +} + +static void +kibnal_connect_peer (kib_peer_t *peer) +{ + kib_conn_t *conn = kibnal_create_conn(); + struct sa_request *request; + int ret; + + LASSERT (peer->ibp_connecting != 0); + + if (conn == NULL) { + CERROR ("Can't allocate conn\n"); + kibnal_peer_connect_failed (peer, 1, -ENOMEM); + return; + } + + conn->ibc_peer = peer; + kib_peer_addref(peer); + + PORTAL_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq)); + if (conn->ibc_connreq == NULL) { + CERROR ("Can't allocate connreq\n"); + kibnal_connreq_done (conn, 1, -ENOMEM); + return; + } + + memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq)); + + /* kibnal_service_get_callback gets my conn ref */ + ret = kibnal_advertize_op(peer->ibp_nid, SUBN_ADM_GET, kibnal_service_get_callback, conn); + + if (ret) { + CERROR("kibnal_advertize_op failed for op %d NID "LPX64"\n", SUBN_ADM_GET, peer->ibp_nid); + /* TODO: I'm unsure yet whether ret contains a + * consistent error type, so I return -EIO in the + * meantime. 
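+ *
+ * (One note on the component_mask tests in kibnal_service_get_callback()
+ * and the #if TODO block of kibnal_pathreq_callback() above: the
+ * expression "(component_mask && (1ull << 1)) == 0" is a logical AND, so
+ * it only detects an all-zero mask rather than the absence of bit 1.  A
+ * per-bit presence check presumably wants the bitwise form, e.g.:
+ *
+ *   #include <stdint.h>
+ *
+ *   // non-zero when bit 'bit' is set in an SA component mask
+ *   static inline int mask_has_bit(uint64_t component_mask, unsigned bit)
+ *   {
+ *           return (component_mask & (1ULL << bit)) != 0;
+ *   }
+ *
+ *   // usage as those callbacks intend (the bit number is taken on trust
+ *   // from the surrounding code):
+ *   //        if (!mask_has_bit(component_mask, 1))
+ *   //                handle the "no service GID in the record" case
+ *
+ * Only the test form changes; the intent of the checks stays the same.)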
*/ + kibnal_connreq_done (conn, 1, -EIO); + } + + return; +} + +static int +kibnal_conn_timed_out (kib_conn_t *conn) +{ + kib_tx_t *tx; + struct list_head *ttmp; + unsigned long flags; + + spin_lock_irqsave (&conn->ibc_lock, flags); + + list_for_each (ttmp, &conn->ibc_tx_queue) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (!tx->tx_passive_rdma_wait); + LASSERT (tx->tx_sending == 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + + list_for_each (ttmp, &conn->ibc_active_txs) { + tx = list_entry (ttmp, kib_tx_t, tx_list); + + LASSERT (tx->tx_passive_rdma || + !tx->tx_passive_rdma_wait); + + LASSERT (tx->tx_passive_rdma_wait || + tx->tx_sending != 0); + + if (time_after_eq (jiffies, tx->tx_deadline)) { + spin_unlock_irqrestore (&conn->ibc_lock, flags); + return 1; + } + } + + spin_unlock_irqrestore (&conn->ibc_lock, flags); + + return 0; +} + +static void +kibnal_check_conns (int idx) +{ + struct list_head *peers = &kibnal_data.kib_peers[idx]; + struct list_head *ptmp; + kib_peer_t *peer; + kib_conn_t *conn; + struct list_head *ctmp; + + again: + /* NB. We expect to have a look at all the peers and not find any + * rdmas to time out, so we just use a shared lock while we + * take a look... */ + read_lock (&kibnal_data.kib_global_lock); + + list_for_each (ptmp, peers) { + peer = list_entry (ptmp, kib_peer_t, ibp_list); + + list_for_each (ctmp, &peer->ibp_conns) { + conn = list_entry (ctmp, kib_conn_t, ibc_list); + + KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED); + + /* In case we have enough credits to return via a + * NOOP, but there were no non-blocking tx descs + * free to do it last time... */ + kibnal_check_sends(conn); + + if (!kibnal_conn_timed_out(conn)) + continue; + + CDEBUG(D_NET, "++conn[%p] state %d -> "LPX64" (%d)\n", + conn, conn->ibc_state, peer->ibp_nid, + atomic_read (&conn->ibc_refcount)); + + atomic_inc (&conn->ibc_refcount); + read_unlock (&kibnal_data.kib_global_lock); + + CERROR("Timed out RDMA with "LPX64"\n", + peer->ibp_nid); + + kibnal_close_conn (conn, -ETIMEDOUT); + kibnal_put_conn (conn); + + /* start again now I've dropped the lock */ + goto again; + } + } + + read_unlock (&kibnal_data.kib_global_lock); +} + +static void +kib_connd_handle_state(kib_conn_t *conn) +{ + vv_return_t retval; + + switch (conn->ibc_state) { + /* all refs have gone, free and be done with it */ + case IBNAL_CONN_DISCONNECTED: + kibnal_destroy_conn (conn); + return; /* avoid put_conn */ + + case IBNAL_CONN_SEND_DREQ: + + retval = cm_disconnect(conn->ibc_cep, &kibnal_data.cm_data.dreq_data, NULL); + if (retval) /* XXX do real things */ + CERROR("disconnect failed: %d\n", retval); + + conn->ibc_state = IBNAL_CONN_DREQ; + break; + + /* a callback got to the conn before we did */ + case IBNAL_CONN_DREP: + break; + + default: + CERROR ("Bad conn %p state: %d\n", conn, + conn->ibc_state); + LBUG(); + break; + } + + /* drop ref from close_conn */ + kibnal_put_conn(conn); +} + +int +kibnal_connd (void *arg) +{ + wait_queue_t wait; + unsigned long flags; + kib_conn_t *conn; + kib_peer_t *peer; + int timeout; + int i; + int peer_index = 0; + unsigned long deadline = jiffies; + + kportal_daemonize ("kibnal_connd"); + kportal_blockallsigs (); + + init_waitqueue_entry (&wait, current); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + + for (;;) { + if (!list_empty (&kibnal_data.kib_connd_conns)) { + conn = list_entry (kibnal_data.kib_connd_conns.next, + kib_conn_t, ibc_list); + list_del 
(&conn->ibc_list); + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + kib_connd_handle_state(conn); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + continue; + } + + if (!list_empty (&kibnal_data.kib_connd_peers)) { + peer = list_entry (kibnal_data.kib_connd_peers.next, + kib_peer_t, ibp_connd_list); + + list_del_init (&peer->ibp_connd_list); + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + kibnal_connect_peer (peer); + kib_peer_decref (peer); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } + + /* shut down and nobody left to reap... */ + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) + break; + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + /* careful with the jiffy wrap... */ + while ((timeout = (int)(deadline - jiffies)) <= 0) { + const int n = 4; + const int p = 1; + int chunk = kibnal_data.kib_peer_hash_size; + + /* Time to check for RDMA timeouts on a few more + * peers: I do checks every 'p' seconds on a + * proportion of the peer table and I need to check + * every connection 'n' times within a timeout + * interval, to ensure I detect a timeout on any + * connection within (n+1)/n times the timeout + * interval. */ + + if (kibnal_tunables.kib_io_timeout > n * p) + chunk = (chunk * n * p) / + kibnal_tunables.kib_io_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + kibnal_check_conns (peer_index); + peer_index = (peer_index + 1) % + kibnal_data.kib_peer_hash_size; + } + + deadline += p * HZ; + } + + kibnal_data.kib_connd_waketime = jiffies + timeout; + + set_current_state (TASK_INTERRUPTIBLE); + add_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + + if (!kibnal_data.kib_shutdown && + list_empty (&kibnal_data.kib_connd_conns) && + list_empty (&kibnal_data.kib_connd_peers)) + schedule_timeout (timeout); + + set_current_state (TASK_RUNNING); + remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait); + + spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags); + } + + spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags); + + kibnal_thread_fini (); + return (0); +} + +int +kibnal_scheduler(void *arg) +{ + long id = (long)arg; + char name[16]; + kib_rx_t *rx; + kib_tx_t *tx; + unsigned long flags; + int rc; + int counter = 0; + int did_something; + + snprintf(name, sizeof(name), "kibnal_sd_%02ld", id); + kportal_daemonize(name); + kportal_blockallsigs(); + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags); + + for (;;) { + did_something = 0; + + while (!list_empty(&kibnal_data.kib_sched_txq)) { + tx = list_entry(kibnal_data.kib_sched_txq.next, + kib_tx_t, tx_list); + list_del(&tx->tx_list); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + kibnal_tx_done(tx); + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + + if (!list_empty(&kibnal_data.kib_sched_rxq)) { + rx = list_entry(kibnal_data.kib_sched_rxq.next, + kib_rx_t, rx_list); + list_del(&rx->rx_list); + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + + kibnal_rx(rx); + + did_something = 1; + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + + /* shut down and no receives to complete... 
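+ *
+ * (Back in kibnal_connd() above, each wakeup scans only a slice of the
+ * peer hash so that, waking every p seconds, the whole table is still
+ * covered n times per I/O timeout.  The sizing arithmetic on its own,
+ * with example numbers:
+ *
+ *   #include <stdio.h>
+ *
+ *   // buckets to scan per wakeup so a table of 'hash_size' buckets is
+ *   // visited 'n' times per 'timeout' seconds, waking every 'p' seconds
+ *   static int buckets_per_wakeup(int hash_size, int timeout, int n, int p)
+ *   {
+ *           int chunk = hash_size;
+ *
+ *           if (timeout > n * p)
+ *                   chunk = (chunk * n * p) / timeout;
+ *           if (chunk == 0)
+ *                   chunk = 1;
+ *           return chunk;
+ *   }
+ *
+ *   int main(void)
+ *   {
+ *           // e.g. 101 buckets, 50s timeout, n=4, p=1 -> 8 buckets per tick
+ *           printf("%d\n", buckets_per_wakeup(101, 50, 4, 1));
+ *           return 0;
+ *   }
+ *
+ * The "(int)(deadline - jiffies)" comparison above is the usual
+ * wrap-safe way to test jiffies deadlines.)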
*/ + if (kibnal_data.kib_shutdown && + atomic_read(&kibnal_data.kib_nconns) == 0) + break; + + /* nothing to do or hogging CPU */ + if (!did_something || counter++ == IBNAL_RESCHED) { + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, + flags); + counter = 0; + + if (!did_something) { + rc = wait_event_interruptible( + kibnal_data.kib_sched_waitq, + !list_empty(&kibnal_data.kib_sched_txq) || + !list_empty(&kibnal_data.kib_sched_rxq) || + (kibnal_data.kib_shutdown && + atomic_read (&kibnal_data.kib_nconns) == 0)); + } else { + our_cond_resched(); + } + + spin_lock_irqsave(&kibnal_data.kib_sched_lock, + flags); + } + } + + spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags); + + kibnal_thread_fini(); + return (0); +} + + +lib_nal_t kibnal_lib = { + .libnal_data = &kibnal_data, /* NAL private data */ + .libnal_send = kibnal_send, + .libnal_send_pages = kibnal_send_pages, + .libnal_recv = kibnal_recv, + .libnal_recv_pages = kibnal_recv_pages, + .libnal_dist = kibnal_dist +}; diff --git a/lnet/klnds/viblnd/vibnal_sa.c b/lnet/klnds/viblnd/vibnal_sa.c new file mode 100644 index 0000000000000000000000000000000000000000..c8ff098ef3be5f55d3d3b9830233d9391553b5ae --- /dev/null +++ b/lnet/klnds/viblnd/vibnal_sa.c @@ -0,0 +1,333 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2004 Cluster File Systems, Inc. + * Author: Frank Zago <fzago@systemfabricworks.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include "vibnal.h" + +/*--------------------------------------------------------------------------*/ + +struct sa_request *alloc_sa_request(void) +{ + struct sa_request *request; + gsi_dtgrm_t *dtgrm; + vv_return_t retval; + + PORTAL_ALLOC(request, sizeof(*request)); + if (request == NULL) + return NULL; + + retval = gsi_dtgrm_pool_get(kibnal_data.gsi_pool_handle, &dtgrm); + if (retval) { + CERROR("cannot get a datagram: %d\n", retval); + PORTAL_FREE(request, sizeof(*request)); + return NULL; + } + + memset(request, 0, sizeof(*request)); + + request->dtgrm_req = dtgrm; + request->retry = GSI_RETRY; /* retry the request up to 10 times */ + + return request; +} + +void free_sa_request(struct sa_request *request) +{ + if (request) { + if (request->dtgrm_req) { + gsi_dtgrm_pool_put(request->dtgrm_req); + } + + if (request->dtgrm_resp) { + gsi_dtgrm_pool_put(request->dtgrm_resp); + } + + PORTAL_FREE(request, sizeof(*request)); + } +} + +/*--------------------------------------------------------------------------*/ + +static void complete_sa_request(struct sa_request *request) +{ + if (request->callback) { + request->callback(request); + } else { + complete(&request->signal); + } +} + +static void +sa_request_timeout_handler(unsigned long context) +{ + struct sa_request *request = (struct sa_request *)context; + int ret; + vv_return_t retval; + + if (request->retry--) { + /* Resend */ + + CDEBUG(D_NET, "timer expired for MAD TID "LPX64" - retrying (%d retry left)\n", request->mad->hdr.transact_id, request->retry); + retval = gsi_post_send_dtgrm(kibnal_data.gsi_handle, request->dtgrm_req); + if (retval) { + CERROR("gsi_post_send_dtgrm failed: %d\n", retval); + ret = -EIO; + } else { + + /* restart the timer */ + request->timer.expires = jiffies + (HZ * GSI_TIMEOUT); + add_timer(&request->timer); + + ret = 0; + } + } else { + CDEBUG(D_NET, "timer expired for MAD TID "LPX64" - no more retry\n", request->mad->hdr.transact_id); + ret = ETIMEDOUT; + } + + if (ret) { + request->status = ret; + complete_sa_request(request); + } +} + +/*--------------------------------------------------------------------------*/ + +/* Send a SA request */ +int vibnal_start_sa_request(struct sa_request *request) +{ + int ret; + vv_return_t vv_stat; + int retval; + + CDEBUG (D_NET, "querying SA\n"); + + /* Put the request on the pending list and get a transaction ID. */ + down(&kibnal_data.gsi_mutex); + + list_add_tail(&request->list, &kibnal_data.gsi_pending); + + up(&kibnal_data.gsi_mutex); + + retval = gsi_post_send_dtgrm(kibnal_data.gsi_handle, request->dtgrm_req); + if (retval) { + CERROR("gsi_post_send_dtgrm failed: %d\n", retval); + return -EIO; + } + + /* TODO: This might create a race condition if the response has + * already been received. 
*/ + init_timer(&request->timer); + request->timer.expires = jiffies + (HZ * GSI_TIMEOUT); + request->timer.data = (unsigned long)request; + request->timer.function = sa_request_timeout_handler; + add_timer(&request->timer); + + CDEBUG(D_NET, "Posted MAD with TID= "LPX64"\n", request->mad->hdr.transact_id); + return 0; +} + +/* Received a MAD */ +void +vibnal_mad_received_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t *dtgrm) +{ + sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad; + ib_service_record_v2_t *sr = (ib_service_record_v2_t *) mad->payload; + struct list_head *this; + struct sa_request *request; + + CDEBUG(D_NET, "Received new MAD\n"); + + /* Validate the MAD */ + if (mad->hdr.base_ver != MAD_IB_BASE_VERSION || + mad->hdr.class != MAD_CLASS_SUBN_ADM || + mad->hdr.class_ver != 2) { + CDEBUG(D_NET, "ignoring MAD (base_ver=%x, class=%x, class_ver=%x)\n", + mad->hdr.base_ver, mad->hdr.class, mad->hdr.class_ver); + return; + } + + /* We don't care about queries, only about responses */ + if (mad->hdr.m.ms.r != 1) { + CDEBUG(D_NET, "ignoring MAD (response=%d)\n", mad->hdr.m.ms.r); + return; + } + + /* We only care about service records and path records. */ + if (mad->hdr.attrib_id != SA_SERVICE_RECORD && + mad->hdr.attrib_id != SA_PATH_RECORD) { + CDEBUG(D_NET, "ignoring MAD (attrib_id=%x)\n", mad->hdr.attrib_id); + return; + } + + /* Find the MAD request in our list */ + request = NULL; + + down(&kibnal_data.gsi_mutex); + + list_for_each(this, &kibnal_data.gsi_pending) { + struct sa_request *_request = list_entry(this, struct sa_request, list); + + CDEBUG(D_NET, "Comparing pending MAD TID "LPX64" with incoming MAD TID "LPX64"\n", + _request->mad->hdr.transact_id, mad->hdr.transact_id); + + if (_request->mad->hdr.transact_id == mad->hdr.transact_id) { + CDEBUG(D_NET, "TIDs match\n"); + request = _request; + break; + } + } + + if (request == NULL) { + up(&kibnal_data.gsi_mutex); + CDEBUG(D_NET, "ignoring MAD (TID = "LPX64"\n", mad->hdr.transact_id); + return; + } + + up(&kibnal_data.gsi_mutex); + + /* Stop the timer and remove the request from the pending list of requests. */ + del_timer_sync(&request->timer); + + down(&kibnal_data.gsi_mutex); + + list_del(&request->list); + + up(&kibnal_data.gsi_mutex); + + request->dtgrm_resp = dtgrm; + + /* Depending on the response, update the status. This is not exact + * because a non-zero status is not always an error, but that + * should be good enough right now. */ + /* TODO: fix. */ + if (mad->hdr.u.ns.status.raw16) { + CDEBUG(D_NET, "MAD response has bad status: %x\n", mad->hdr.u.ns.status.raw16); + request->status = -EIO; + } else { + request->status = 0; + } + + CDEBUG(D_NET, "incoming MAD successfully processed (status is %d)\n", request->status); + + complete_sa_request(request); +} + +/* MAD send completion */ +void +vibnal_mad_sent_cb(gsi_class_handle_t handle, void *context, gsi_dtgrm_t * dtgrm) +{ + sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad; + + /* Don't do anything. We might have to resend the datagram later. */ + CDEBUG(D_NET, "Datagram with TID "LPX64" sent.\n", mad->hdr.transact_id); +} + +/* + * method is SUBN_ADM_SET, SUBN_ADM_GET, SUBN_ADM_DELETE. Tables not supported. + * nid is the nid to advertize/query/unadvertize + * Note: dgid is in network order. 
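+ *
+ * The component_mask built below tells the SA which PathRecord fields to
+ * match on; each set bit enables one field.  Written with named bits for
+ * clarity (the positions are taken from the inline comments below and
+ * should be treated as assumptions of this patch, not a verified list):
+ *
+ *   #include <stdint.h>
+ *   #include <endian.h>                    // htobe64(); kernel uses cpu_to_be64()
+ *
+ *   enum {
+ *           PR_COMP_DGID      = 2,
+ *           PR_COMP_SGID      = 3,
+ *           PR_COMP_NUMB_PATH = 12,
+ *           PR_COMP_PKEY      = 13,
+ *   };
+ *
+ *   static uint64_t pathrecord_component_mask(void)
+ *   {
+ *           uint64_t mask = (1ULL << PR_COMP_DGID)      |
+ *                           (1ULL << PR_COMP_SGID)      |
+ *                           (1ULL << PR_COMP_NUMB_PATH) |
+ *                           (1ULL << PR_COMP_PKEY);
+ *           return htobe64(mask);          // the MAD carries it big-endian
+ *   }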
+ */ +static void fill_pathrecord_request(struct sa_request *request, vv_gid_t dgid) +{ + gsi_dtgrm_t *dtgrm = request->dtgrm_req; + sa_mad_v2_t *mad = (sa_mad_v2_t *) dtgrm->mad; + ib_path_record_v2_t *path = (ib_path_record_v2_t *) mad->payload; + + memset(mad, 0, MAD_BLOCK_SIZE); + + request->mad = mad; + + dtgrm->rlid = kibnal_data.kib_port_attr.port_sma_address_info.sm_lid; + dtgrm->sl = kibnal_data.kib_port_attr.port_sma_address_info.service_level; + + mad->hdr.base_ver = MAD_IB_BASE_VERSION; + mad->hdr.class = MAD_CLASS_SUBN_ADM; + mad->hdr.class_ver = 2; + mad->hdr.m.ms.method = SUBN_ADM_GET; + mad->hdr.attrib_id = SA_PATH_RECORD; /* something(?) will swap that field */ + mad->hdr.attrib_modifier = 0xFFFFFFFF; /* and that one too? */ + + /* Note: the transaction ID is set by the Voltaire stack if it is 0. */ + + /* TODO: these harcoded value to something better */ + mad->payload_len = cpu_to_be32(0x40 /*header size*/ + 0x35 /* PathRecord size */); + + mad->component_mask = cpu_to_be64( + (1 << 2) | /* DGID */ + (1 << 3) | /* SGID */ + (1 << 12)| /* numb_paths*/ + (1 << 13) /* P_key */ + ); + + path->pkey = cpu_to_be16(kibnal_data.kib_port_pkey); + path->sgid = kibnal_data.kib_port_gid; + gid_swap(&path->sgid); + path->dgid = dgid; /* already in network order */ + path->numb_path = 1; +} + +/* + * Do a path record query + * If callback is NULL, the function is synchronous (and context is ignored). + * Note: dgid is in network order. + */ +/* TODO: passing a request is a bit of a hack, but since this function + * is called under interrupt, we cannot allocate memory here :(. */ +int kibnal_pathrecord_op(struct sa_request *request, vv_gid_t dgid, sa_request_cb_t callback, void *context) +{ + int ret; + + LASSERT (kibnal_data.kib_nid != PTL_NID_ANY); + + fill_pathrecord_request(request, dgid); + + if (callback) { + request->callback = callback; + request->context = context; + } else { + init_completion(&request->signal); + } + + ret = vibnal_start_sa_request(request); + if (ret) { + CERROR("vibnal_send_sa failed: %d\n", ret); + free_sa_request(request); + } else { + if (callback) { + /* Return. The callback will have to free the SA request. 
*/ + ret = 0; + } else { + wait_for_completion(&request->signal); + + ret = request->status; + + if (ret != 0) { + CERROR ("Error %d in querying a path record\n", ret); + } + + free_sa_request(request); + } + } + + return ret; +} diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index b5286fccc3bf0e9ccf2fdce2d1f557ea30d8c950..85de4cfb904e97e53c182278d5fa2284c3ef85dd 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -265,6 +265,7 @@ char *portals_nid2str(int nal, ptl_nid_t nid, char *str) case TCPNAL: /* userspace NAL */ case IIBNAL: + case VIBNAL: case OPENIBNAL: case RANAL: case SOCKNAL: diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 9c1537b474602698b2a6d15d4de3e80c83a8053c..dbe264bca349e56e69d2e75c5fc0f6c005de4922 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -78,6 +78,7 @@ static name2num_t nalnames[] = { {"gm", GMNAL}, {"openib", OPENIBNAL}, {"iib", IIBNAL}, + {"vib", VIBNAL}, {"lo", LONAL}, {"ra", RANAL}, #else @@ -676,7 +677,8 @@ jt_ptl_print_peers (int argc, char **argv) int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, + OPENIBNAL, IIBNAL, VIBNAL, 0)) return -1; for (index = 0;;index++) { @@ -693,6 +695,11 @@ jt_ptl_print_peers (int argc, char **argv) ptl_ipaddr_2_str (pcfg.pcfg_size, buffer[0], 1), ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), pcfg.pcfg_misc, pcfg.pcfg_count); + else if (g_nal_is_compatible(NULL, RANAL, 0)) + printf (LPX64"[%d]@%s:%d\n", + pcfg.pcfg_nid, pcfg.pcfg_wait, + ptl_ipaddr_2_str (pcfg.pcfg_id, buffer[1], 1), + pcfg.pcfg_misc); else printf (LPX64"[%d]\n", pcfg.pcfg_nid, pcfg.pcfg_wait); @@ -712,17 +719,18 @@ jt_ptl_add_peer (int argc, char **argv) int port = 0; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, + OPENIBNAL, IIBNAL, VIBNAL, 0)) return -1; if (g_nal_is_compatible(NULL, SOCKNAL, RANAL, 0)) { if (argc != 4) { - fprintf (stderr, "usage(tcp): %s nid ipaddr port\n", + fprintf (stderr, "usage(tcp,ra): %s nid ipaddr port\n", argv[0]); return 0; } } else if (argc != 2) { - fprintf (stderr, "usage(openib,iib): %s nid\n", argv[0]); + fprintf (stderr, "usage(openib,iib,vib): %s nid\n", argv[0]); return 0; } @@ -769,7 +777,8 @@ jt_ptl_del_peer (int argc, char **argv) int argidx; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, + OPENIBNAL, IIBNAL, VIBNAL, 0)) return -1; if (g_nal_is_compatible(NULL, SOCKNAL, 0)) { @@ -832,7 +841,8 @@ jt_ptl_print_connections (int argc, char **argv) int index; int rc; - if (!g_nal_is_compatible (argv[0], SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) + if (!g_nal_is_compatible (argv[0], SOCKNAL, RANAL, + OPENIBNAL, IIBNAL, VIBNAL, 0)) return -1; for (index = 0;;index++) { @@ -857,6 +867,10 @@ jt_ptl_print_connections (int argc, char **argv) pcfg.pcfg_count, /* tx buffer size */ pcfg.pcfg_size, /* rx buffer size */ pcfg.pcfg_wait ? "nagle" : "nonagle"); + else if (g_nal_is_compatible (NULL, RANAL, 0)) + printf ("[%d]"LPX64"\n", + pcfg.pcfg_id, /* device id */ + pcfg.pcfg_nid); else printf (LPX64"\n", pcfg.pcfg_nid); @@ -1023,7 +1037,8 @@ int jt_ptl_disconnect(int argc, char **argv) return 0; } - if (!g_nal_is_compatible (NULL, SOCKNAL, OPENIBNAL, IIBNAL, RANAL, 0)) + if (!g_nal_is_compatible (NULL, SOCKNAL, RANAL, + OPENIBNAL, IIBNAL, VIBNAL, 0)) return 0; if (argc >= 2 &&