diff --git a/lnet/include/linux/kp30.h b/lnet/include/linux/kp30.h index 750d16c06800cc390ef04200e6a1070aa810cea9..0a1cc57a78b4e8bec72c552a828ae9583d294ad9 100644 --- a/lnet/include/linux/kp30.h +++ b/lnet/include/linux/kp30.h @@ -371,12 +371,14 @@ typedef struct { } kpr_fwd_desc_t; typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); +typedef void (*kpr_notify_t)(void *arg, ptl_nid_t peer, int alive); /* NAL's routing interface (Kernel Portals Routing Nal Interface) */ typedef const struct { int kprni_nalid; /* NAL's id */ void *kprni_arg; /* Arg to pass when calling into NAL */ kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */ + kpr_notify_t kprni_notify; /* NAL's notification entrypoint */ } kpr_nal_interface_t; /* Router's routing interface (Kernel Portals Routing Router Interface) */ @@ -386,9 +388,10 @@ typedef const struct { int (*kprri_register) (kpr_nal_interface_t *nal_interface, void **router_arg); - /* ask the router to find a gateway that forwards to 'nid' and is a peer - * of the calling NAL */ - int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, + /* ask the router to find a gateway that forwards to 'nid' and is a + * peer of the calling NAL; assume caller will send 'nob' bytes of + * payload there */ + int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid); /* hand a packet over to the router for forwarding */ @@ -398,6 +401,10 @@ typedef const struct { void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd, int error); + /* notify the router about peer state */ + void (*kprri_notify) (void *router_arg, ptl_nid_t peer, + int alive, time_t when); + /* the calling NAL is shutting down */ void (*kprri_shutdown) (void *router_arg); @@ -416,10 +423,14 @@ typedef struct { typedef const struct { int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, ptl_nid_t hi_nid); - int (*kprci_del_route)(ptl_nid_t nid); + int (*kprci_del_route)(int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); int (*kprci_get_route)(int index, int *gateway_nal, - ptl_nid_t *gateway, ptl_nid_t *lo_nid, - ptl_nid_t *hi_nid); + ptl_nid_t *gateway, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, + int *alive); + int (*kprci_notify)(int gateway_nal, ptl_nid_t gateway_nid, + int alive, time_t when); } kpr_control_interface_t; extern kpr_control_interface_t kpr_control_interface; @@ -449,12 +460,12 @@ kpr_routing (kpr_router_t *router) } static inline int -kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid) +kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid) { if (!kpr_routing (router)) - return (-EHOSTUNREACH); + return (-ENETUNREACH); - return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, + return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, nob, gateway_nid)); } @@ -476,7 +487,7 @@ static inline void kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd) { if (!kpr_routing (router)) - fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH); + fwd->kprfd_callback (fwd->kprfd_callback_arg, -ENETUNREACH); else router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd); } @@ -488,6 +499,16 @@ kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error) router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error); } +static inline void +kpr_notify (kpr_router_t *router, + ptl_nid_t peer, int alive, time_t when) +{ + if (!kpr_routing (router)) + return; + + router->kpr_interface->kprri_notify(router->kpr_arg, peer, alive, when); +} + static inline void kpr_shutdown (kpr_router_t *router) { @@ -554,6 +575,7 @@ extern struct prof_ent prof_ents[MAX_PROFS]; #endif /* PORTALS_PROFILING */ /* debug.c */ +void portals_run_upcall(char **argv); void portals_run_lbug_upcall(char * file, const char *fn, const int line); void portals_debug_dumplog(void); int portals_debug_init(unsigned long bufsize); @@ -622,6 +644,92 @@ extern void kportal_blockallsigs (void); # define CURRENT_TIME time(0) #endif +/******************************************************************************/ +/* Light-weight trace + * Support for temporary event tracing with minimal Heisenberg effect. */ +#define LWT_SUPPORT 1 + +typedef struct { + cycles_t lwte_when; + char *lwte_where; + void *lwte_task; + long lwte_p1; + long lwte_p2; + long lwte_p3; + long lwte_p4; +} lwt_event_t; + +#if LWT_SUPPORT +#ifdef __KERNEL__ +#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t)) + +typedef struct _lwt_page { + struct list_head lwtp_list; + struct page *lwtp_page; + lwt_event_t *lwtp_events; +} lwt_page_t; + +typedef struct { + int lwtc_current_index; + lwt_page_t *lwtc_current_page; +} lwt_cpu_t; + +extern int lwt_enabled; +extern lwt_cpu_t lwt_cpus[]; + +extern int lwt_init (void); +extern void lwt_fini (void); +extern int lwt_lookup_string (int *size, char *knlptr, + char *usrptr, int usrsize); +extern int lwt_control (int enable, int clear); +extern int lwt_snapshot (int *ncpu, int *total_size, + void *user_ptr, int user_size); + +/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set. + * This stuff is meant for finding specific problems; it never stays in + * production code... */ + +#define LWTSTR(n) #n +#define LWTWHERE(f,l) f ":" LWTSTR(l) + +#define LWT_EVENT(p1, p2, p3, p4) \ +do { \ + unsigned long flags; \ + lwt_cpu_t *cpu; \ + lwt_page_t *p; \ + lwt_event_t *e; \ + \ + local_irq_save (flags); \ + \ + if (lwt_enabled) { \ + cpu = &lwt_cpus[smp_processor_id()]; \ + p = cpu->lwtc_current_page; \ + e = &p->lwtp_events[cpu->lwtc_current_index++]; \ + \ + if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) { \ + cpu->lwtc_current_page = \ + list_entry (p->lwtp_list.next, \ + lwt_page_t, lwtp_list); \ + cpu->lwtc_current_index = 0; \ + } \ + \ + e->lwte_when = get_cycles(); \ + e->lwte_where = LWTWHERE(__FILE__,__LINE__); \ + e->lwte_task = current; \ + e->lwte_p1 = (long)(p1); \ + e->lwte_p2 = (long)(p2); \ + e->lwte_p3 = (long)(p3); \ + e->lwte_p4 = (long)(p4); \ + } \ + \ + local_irq_restore (flags); \ +} while (0) +#else /* __KERNEL__ */ +#define LWT_EVENT(p1,p2,p3,p4) /* no userland implementation yet */ +#endif /* __KERNEL__ */ +#endif /* LWT_SUPPORT */ + + #include <linux/portals_lib.h> /* @@ -856,8 +964,11 @@ static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) #define IOC_PORTAL_GET_NID _IOWR('e', 39, long) #define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long) #define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long) - -#define IOC_PORTAL_MAX_NR 41 +#define IOC_PORTAL_NOTIFY_ROUTER _IOWR('e', 42, long) +#define IOC_PORTAL_LWT_CONTROL _IOWR('e', 43, long) +#define IOC_PORTAL_LWT_SNAPSHOT _IOWR('e', 44, long) +#define IOC_PORTAL_LWT_LOOKUP_STRING _IOWR('e', 45, long) +#define IOC_PORTAL_MAX_NR 45 enum { QSWNAL = 1, diff --git a/lnet/include/lnet/lnetctl.h b/lnet/include/lnet/lnetctl.h index 827811127a21cc8a68e4f5073b3e50ed793b5a94..7763f1b548be7333ab117ae02733f89d8bc34ddc 100644 --- a/lnet/include/lnet/lnetctl.h +++ b/lnet/include/lnet/lnetctl.h @@ -54,8 +54,10 @@ int jt_ptl_txmem (int argc, char **argv); int jt_ptl_nagle (int argc, char **argv); int jt_ptl_add_route (int argc, char **argv); int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_notify_router (int argc, char **argv); int jt_ptl_print_routes (int argc, char **argv); int jt_ptl_fail_nid (int argc, char **argv); +int jt_ptl_lwt(int argc, char **argv); int dbg_initialize(int argc, char **argv); int jt_dbg_filter(int argc, char **argv); diff --git a/lnet/include/lnet/ptlctl.h b/lnet/include/lnet/ptlctl.h index 827811127a21cc8a68e4f5073b3e50ed793b5a94..7763f1b548be7333ab117ae02733f89d8bc34ddc 100644 --- a/lnet/include/lnet/ptlctl.h +++ b/lnet/include/lnet/ptlctl.h @@ -54,8 +54,10 @@ int jt_ptl_txmem (int argc, char **argv); int jt_ptl_nagle (int argc, char **argv); int jt_ptl_add_route (int argc, char **argv); int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_notify_router (int argc, char **argv); int jt_ptl_print_routes (int argc, char **argv); int jt_ptl_fail_nid (int argc, char **argv); +int jt_ptl_lwt(int argc, char **argv); int dbg_initialize(int argc, char **argv); int jt_dbg_filter(int argc, char **argv); diff --git a/lnet/klnds/qswlnd/qswlnd.c b/lnet/klnds/qswlnd/qswlnd.c index b5e1e3986b30664e6a4a7eff09da997288e32f76..0841d641dc3134709c69e280a35b9feb46f74913 100644 --- a/lnet/klnds/qswlnd/qswlnd.c +++ b/lnet/klnds/qswlnd/qswlnd.c @@ -32,6 +32,7 @@ kpr_nal_interface_t kqswnal_router_interface = { kprni_nalid: QSWNAL, kprni_arg: NULL, kprni_fwd: kqswnal_fwd_packet, + kprni_notify: NULL, /* we're connectionless */ }; diff --git a/lnet/klnds/qswlnd/qswlnd.h b/lnet/klnds/qswlnd/qswlnd.h index 294f3e5e6696d9610eae745583b026f4a187d931..0d8e2fa61d59de54787c60beab6f12e81d5aaef1 100644 --- a/lnet/klnds/qswlnd/qswlnd.h +++ b/lnet/klnds/qswlnd/qswlnd.h @@ -164,6 +164,7 @@ typedef struct void *ktx_args[2]; /* completion passthru */ E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ + unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ /* debug/info fields */ pid_t ktx_launcher; /* pid of launching process */ diff --git a/lnet/klnds/qswlnd/qswlnd_cb.c b/lnet/klnds/qswlnd/qswlnd_cb.c index 8d5f70b02b39f24145011850ef9cc8cc3846e456..99f299f47446e78cb5392de5d83116053a31ae4f 100644 --- a/lnet/klnds/qswlnd/qswlnd_cb.c +++ b/lnet/klnds/qswlnd/qswlnd_cb.c @@ -421,7 +421,9 @@ static void kqswnal_txhandler(EP_TXD *txd, void *arg, int status) { kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; - + struct timeval now; + time_t then; + LASSERT (txd != NULL); LASSERT (ktx != NULL); @@ -432,7 +434,15 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) if (status != EP_SUCCESS) { - CERROR ("kqswnal: Transmit failed with %d\n", status); + CERROR ("Tx completion to "LPX64" failed: %d\n", + ktx->ktx_nid, status); + + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; + + kpr_notify (&kqswnal_data.kqn_router, + ktx->ktx_nid, 0, then); + status = -EIO; } @@ -447,33 +457,40 @@ kqswnal_launch (kqswnal_tx_t *ktx) int dest = kqswnal_nid2elanid (ktx->ktx_nid); long flags; int rc; - + + ktx->ktx_launchtime = jiffies; + LASSERT (dest >= 0); /* must be a peer */ rc = ep_transmit_large(kqswnal_data.kqn_eptx, dest, ktx->ktx_port, attr, kqswnal_txhandler, ktx, ktx->ktx_iov, ktx->ktx_niov); - if (rc == 0) + switch (rc) { + case 0: /* success */ atomic_inc (&kqswnal_packets_launched); + return (0); - if (rc != ENOMEM) - return (rc); + case ENOMEM: /* can't allocate ep txd => queue for later */ + LASSERT (in_interrupt()); - /* can't allocate ep txd => queue for later */ + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - LASSERT (in_interrupt()); /* not called by thread (not looping) */ + list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + return (0); - list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); - if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) - wake_up (&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + default: /* fatal error */ + CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc); - return (0); + /* Tell router I think a node is down */ + kpr_notify (&kqswnal_data.kqn_router, ktx->ktx_nid, + 0, ktx->ktx_launchtime); + return (rc); + } } - static char * hdr_type_string (ptl_hdr_t *hdr) { @@ -584,7 +601,8 @@ kqswnal_sendmsg (nal_cb_t *nal, } if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */ - rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid); + rc = kpr_lookup (&kqswnal_data.kqn_router, nid, + sizeof (ptl_hdr_t) + payload_nob, &gatewaynid); if (rc != 0) { CERROR("Can't route to "LPX64": router error %d\n", nid, rc); @@ -678,7 +696,7 @@ kqswnal_sendmsg (nal_cb_t *nal, return (PTL_FAIL); } - CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, nid); + CDEBUG(D_NET, "sent "LPSZ" bytes to "LPX64"\n", payload_nob, nid); return (PTL_OK); } diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index e7232a05a91b9eec42339d1039a394f6cb7273c8..c844dd6547550597e2a0fd466d1ef3000568f6eb 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -37,8 +37,34 @@ kpr_nal_interface_t ksocknal_router_interface = { kprni_nalid: SOCKNAL, kprni_arg: &ksocknal_data, kprni_fwd: ksocknal_fwd_packet, + kprni_notify: ksocknal_notify, }; +#define SOCKNAL_SYSCTL 200 + +#define SOCKNAL_SYSCTL_TIMEOUT 1 +#define SOCKNAL_SYSCTL_EAGER_ACK 2 +#define SOCKNAL_SYSCTL_ZERO_COPY 3 + +static ctl_table ksocknal_ctl_table[] = { + {SOCKNAL_SYSCTL_TIMEOUT, "timeout", + &ksocknal_data.ksnd_io_timeout, sizeof (int), + 0644, NULL, &proc_dointvec}, + {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", + &ksocknal_data.ksnd_eager_ack, sizeof (int), + 0644, NULL, &proc_dointvec}, +#if SOCKNAL_ZC + {SOCKNAL_SYSCTL_EAGER_ACK, "zero_copy", + &ksocknal_data.ksnd_zc_min_frag, sizeof (int), + 0644, NULL, &proc_dointvec}, +#endif + { 0 } +}; + +static ctl_table ksocknal_top_ctl_table[] = { + {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, + { 0 } +}; int ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, @@ -172,7 +198,7 @@ ksocknal_bind_irq (unsigned int irq) ksock_route_t * ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, - int irq_affinity, int xchange_nids, int nonagel) + int nonagel, int xchange_nids, int irq_affinity, int eager) { ksock_route_t *route; @@ -183,7 +209,7 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, atomic_set (&route->ksnr_refcount, 1); route->ksnr_sharecount = 0; route->ksnr_peer = NULL; - route->ksnr_timeout = jiffies_64; + route->ksnr_timeout = jiffies; route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; route->ksnr_ipaddr = ipaddr; route->ksnr_port = port; @@ -191,6 +217,7 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, route->ksnr_irq_affinity = irq_affinity; route->ksnr_xchange_nids = xchange_nids; route->ksnr_nonagel = nonagel; + route->ksnr_eager = eager; route->ksnr_connecting = 0; route->ksnr_deleted = 0; route->ksnr_generation = 0; @@ -371,7 +398,8 @@ ksocknal_get_route_by_idx (int index) int ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, - int nonagle, int xchange_nids, int bind_irq, int share) + int nonagle, int xchange_nids, int bind_irq, + int share, int eager) { unsigned long flags; ksock_peer_t *peer; @@ -388,8 +416,8 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, if (peer == NULL) return (-ENOMEM); - route = ksocknal_create_route (ipaddr, port, bufnob, - nonagle, xchange_nids, bind_irq); + route = ksocknal_create_route (ipaddr, port, bufnob, nonagle, + xchange_nids, bind_irq, eager); if (route == NULL) { ksocknal_put_peer (peer); return (-ENOMEM); @@ -454,7 +482,7 @@ ksocknal_del_route_locked (ksock_route_t *route, int share, int keep_conn) if (conn != NULL) { if (!keep_conn) - ksocknal_close_conn_locked (conn); + ksocknal_close_conn_locked (conn, 0); else { /* keeping the conn; just dissociate it and route... */ conn->ksnc_route = NULL; @@ -568,14 +596,12 @@ ksocknal_get_peer_addr (ksock_conn_t *conn) struct sockaddr_in sin; int len = sizeof (sin); int rc; - - rc = ksocknal_getconnsock (conn); - LASSERT (rc == 0); rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, (struct sockaddr *)&sin, &len, 2); + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); LASSERT (len <= sizeof (sin)); - ksocknal_putconnsock (conn); if (rc != 0) { CERROR ("Error %d getting sock peer IP\n", rc); @@ -590,12 +616,8 @@ unsigned int ksocknal_conn_irq (ksock_conn_t *conn) { int irq = 0; - int rc; struct dst_entry *dst; - rc = ksocknal_getconnsock (conn); - LASSERT (rc == 0); - dst = sk_dst_get (conn->ksnc_sock->sk); if (dst != NULL) { if (dst->dev != NULL) { @@ -608,7 +630,8 @@ ksocknal_conn_irq (ksock_conn_t *conn) dst_release (dst); } - ksocknal_putconnsock (conn); + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); return (irq); } @@ -660,11 +683,13 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, int rc; /* NB, sock has an associated file since (a) this connection might - * have been created in userland and (b) we need the refcounting so - * that we don't close the socket while I/O is being done on it. */ + * have been created in userland and (b) we need to refcount the + * socket so that we don't close it while I/O is being done on + * it, and sock->file has that pre-cooked... */ LASSERT (sock->file != NULL); + LASSERT (file_count(sock->file) > 0); - rc = ksocknal_set_linger (sock); + rc = ksocknal_setup_sock (sock); if (rc != 0) return (rc); @@ -696,9 +721,6 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, ksocknal_new_packet (conn, 0); INIT_LIST_HEAD (&conn->ksnc_tx_queue); -#if SOCKNAL_ZC - INIT_LIST_HEAD (&conn->ksnc_tx_pending); -#endif conn->ksnc_tx_ready = 0; conn->ksnc_tx_scheduled = 0; atomic_set (&conn->ksnc_tx_nob, 0); @@ -753,6 +775,8 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, conn->ksnc_peer = peer; atomic_inc (&peer->ksnp_refcount); + peer->ksnp_last_alive = jiffies; + peer->ksnp_error = 0; list_add (&conn->ksnc_list, &peer->ksnp_conns); atomic_inc (&conn->ksnc_refcount); @@ -797,7 +821,7 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, } void -ksocknal_close_conn_locked (ksock_conn_t *conn) +ksocknal_close_conn_locked (ksock_conn_t *conn, int error) { /* This just does the immmediate housekeeping, and queues the * connection for the reaper to terminate. @@ -805,6 +829,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) ksock_peer_t *peer = conn->ksnc_peer; ksock_route_t *route; + LASSERT (peer->ksnp_error == 0); LASSERT (!conn->ksnc_closing); conn->ksnc_closing = 1; atomic_inc (&ksocknal_data.ksnd_nclosing_conns); @@ -825,11 +850,16 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) /* ksnd_deathrow_conns takes over peer's ref */ list_del (&conn->ksnc_list); - if (list_empty (&peer->ksnp_conns) && - list_empty (&peer->ksnp_routes)) { - /* I've just closed last conn belonging to a - * non-autoconnecting peer */ - ksocknal_unlink_peer_locked (peer); + if (list_empty (&peer->ksnp_conns)) { + /* No more connections to this peer */ + + peer->ksnp_error = error; /* stash last conn close reason */ + + if (list_empty (&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * non-autoconnecting peer */ + ksocknal_unlink_peer_locked (peer); + } } spin_lock (&ksocknal_data.ksnd_reaper_lock); @@ -842,7 +872,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) } int -ksocknal_close_conn_unlocked (ksock_conn_t *conn) +ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why) { unsigned long flags; int did_it = 0; @@ -851,7 +881,7 @@ ksocknal_close_conn_unlocked (ksock_conn_t *conn) if (!conn->ksnc_closing) { did_it = 1; - ksocknal_close_conn_locked (conn); + ksocknal_close_conn_locked (conn, why); } write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); @@ -867,6 +897,10 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * ksnc_refcount will eventually hit zero, and then the reaper will * destroy it. */ unsigned long flags; + ksock_peer_t *peer = conn->ksnc_peer; + struct timeval now; + time_t then = 0; + int notify = 0; /* serialise with callbacks */ write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); @@ -886,6 +920,16 @@ ksocknal_terminate_conn (ksock_conn_t *conn) conn->ksnc_scheduler->kss_nconns--; + if (peer->ksnp_error != 0) { + /* peer's last conn closed in error */ + LASSERT (list_empty (&peer->ksnp_conns)); + + /* convert peer's last-known-alive timestamp from jiffies */ + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - peer->ksnp_last_alive)/HZ; + notify = 1; + } + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); /* The socket is closed on the final put; either here, or in @@ -894,6 +938,10 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * immediately, aborting anything buffered in it. Any hung * zero-copy transmits will therefore complete in finite time. */ ksocknal_putconnsock (conn); + + if (notify) + kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid, + 0, then); } void @@ -906,9 +954,7 @@ ksocknal_destroy_conn (ksock_conn_t *conn) LASSERT (conn->ksnc_route == NULL); LASSERT (!conn->ksnc_tx_scheduled); LASSERT (!conn->ksnc_rx_scheduled); -#if SOCKNAL_ZC - LASSERT (list_empty (&conn->ksnc_tx_pending)); -#endif + /* complete queued packets */ while (!list_empty (&conn->ksnc_tx_queue)) { ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, @@ -1010,7 +1056,7 @@ ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) continue; rc = 0; - ksocknal_close_conn_locked (conn); + ksocknal_close_conn_locked (conn, 0); } } } @@ -1020,6 +1066,24 @@ ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) return (rc); } +void +ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... */ + + CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down"); + + if (!alive) { + /* If the gateway crashed, close all open connections... */ + ksocknal_close_conn (gw_nid, 0); + return; + } + + /* ...otherwise do nothing. We can only establish new connections + * if we have autroutes, and these connect on demand. */ +} + #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct tcp_opt *sock2tcp_opt(struct sock *sk) { @@ -1177,7 +1241,8 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) data->ioc_wait = route->ksnr_sharecount; data->ioc_flags = (route->ksnr_nonagel ? 1 : 0) | (route->ksnr_xchange_nids ? 2 : 0) | - (route->ksnr_irq_affinity ? 4 : 0); + (route->ksnr_irq_affinity ? 4 : 0) | + (route->ksnr_eager ? 8 : 0); ksocknal_put_route (route); } break; @@ -1185,10 +1250,11 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) case NAL_CMD_ADD_AUTOCONN: { rc = ksocknal_add_route (data->ioc_nid, data->ioc_id, data->ioc_misc, data->ioc_size, - (data->ioc_flags & 1) != 0, - (data->ioc_flags & 2) != 0, - (data->ioc_flags & 4) != 0, - (data->ioc_flags & 8) != 0); + (data->ioc_flags & 0x01) != 0, + (data->ioc_flags & 0x02) != 0, + (data->ioc_flags & 0x04) != 0, + (data->ioc_flags & 0x08) != 0, + (data->ioc_flags & 0x10) != 0); break; } case NAL_CMD_DEL_AUTOCONN: { @@ -1287,6 +1353,10 @@ ksocknal_module_fini (void) LASSERT (0); case SOCKNAL_INIT_ALL: +#if CONFIG_SYSCTL + if (ksocknal_data.ksnd_sysctl != NULL) + unregister_sysctl_table (ksocknal_data.ksnd_sysctl); +#endif kportal_nal_unregister(SOCKNAL); PORTAL_SYMBOL_UNREGISTER (ksocknal_ni); /* fall through */ @@ -1364,6 +1434,9 @@ ksocknal_module_init (void) /* packet descriptor must fit in a router descriptor's scratchpad */ LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + /* the following must be sizeof(int) for proc_dointvec() */ + LASSERT(sizeof (ksocknal_data.ksnd_io_timeout) == sizeof (int)); + LASSERT(sizeof (ksocknal_data.ksnd_eager_ack) == sizeof (int)); LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); @@ -1379,6 +1452,12 @@ ksocknal_module_init (void) memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + ksocknal_data.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; + ksocknal_data.ksnd_eager_ack = SOCKNAL_EAGER_ACK; +#if SOCKNAL_ZC + ksocknal_data.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; +#endif + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; PORTAL_ALLOC (ksocknal_data.ksnd_peers, sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); @@ -1561,9 +1640,13 @@ ksocknal_module_init (void) PORTAL_SYMBOL_REGISTER(ksocknal_ni); +#ifdef CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + ksocknal_data.ksnd_sysctl = register_sysctl_table (ksocknal_top_ctl_table, 0); +#endif /* flag everything initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; - + printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial " "mem %d)\n", kpr_routing (&ksocknal_data.ksnd_router) ? diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 69daa027f1870d289edda839f0bcffb9327047ab..abd8e7b718c211e15b35b6f23aafb48f82c2fd2c 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -48,6 +48,7 @@ #include <linux/stat.h> #include <linux/list.h> #include <linux/kmod.h> +#include <linux/sysctl.h> #include <asm/uaccess.h> #include <asm/segment.h> #include <asm/div64.h> @@ -68,7 +69,12 @@ #define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ #define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ -#define SOCKNAL_IO_TIMEOUT (60*HZ) /* default comms timeout */ +/* default vals for runtime tunables */ +#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define SOCKNAL_EAGER_ACK 1 /* default eager ack (boolean) */ +#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ + +#define SOCKNAL_USE_KEEPALIVES 0 /* use tcp/ip keepalive? */ #define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ @@ -78,8 +84,6 @@ # define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ #endif -#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ - #define SOCKNAL_NLTXS 128 /* # normal transmit messages */ #define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ @@ -95,10 +99,6 @@ #define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10) -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -# define jiffies_64 jiffies -#endif - #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) # define sk_data_ready data_ready # define sk_write_space write_space @@ -136,6 +136,12 @@ typedef struct { typedef struct { int ksnd_init; /* initialisation state */ + int ksnd_io_timeout; /* "stuck" socket timeout (seconds) */ + int ksnd_eager_ack; /* make TCP ack eagerly? */ +#if SOCKNAL_ZC + unsigned int ksnd_zc_min_frag; /* minimum zero copy frag size */ +#endif + struct ctl_table_header *ksnd_sysctl; /* sysctl interface */ rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */ struct list_head *ksnd_peers; /* hash table of all my known peers */ @@ -205,7 +211,6 @@ struct ksock_route; /* forward ref */ typedef struct /* transmit packet */ { struct list_head tx_list; /* queue on conn for transmission etc */ - __u64 tx_deadline; /* when (in jiffies) tx times out */ char tx_isfwd; /* forwarding / sourced here */ int tx_nob; /* # packet bytes */ int tx_resid; /* residual bytes */ @@ -291,10 +296,11 @@ typedef struct ksock_conn __u32 ksnc_ipaddr; /* peer's IP */ int ksnc_port; /* peer's port */ int ksnc_closing; /* being shut down */ - + /* READER */ struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ - __u64 ksnc_rx_deadline; /* when receive times out */ + unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */ + int ksnc_rx_started; /* started receiving a message */ int ksnc_rx_ready; /* data ready to read */ int ksnc_rx_scheduled; /* being progressed */ int ksnc_rx_state; /* what is being read */ @@ -311,9 +317,7 @@ typedef struct ksock_conn /* WRITER */ struct list_head ksnc_tx_list; /* where I enq waiting for output space */ struct list_head ksnc_tx_queue; /* packets waiting to be sent */ -#if SOCKNAL_ZC - struct list_head ksnc_tx_pending; /* zc packets pending callback */ -#endif + unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */ atomic_t ksnc_tx_nob; /* # bytes queued */ int ksnc_tx_ready; /* write space */ int ksnc_tx_scheduled; /* being progressed */ @@ -326,7 +330,7 @@ typedef struct ksock_route struct ksock_peer *ksnr_peer; /* owning peer */ atomic_t ksnr_refcount; /* # users */ int ksnr_sharecount; /* lconf usage counter */ - __u64 ksnr_timeout; /* when reconnection can happen next */ + unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */ unsigned int ksnr_retry_interval; /* how long between retries */ __u32 ksnr_ipaddr; /* an IP address for this peer */ int ksnr_port; /* port to connect to */ @@ -334,8 +338,9 @@ typedef struct ksock_route unsigned int ksnr_irq_affinity:1; /* set affinity? */ unsigned int ksnr_xchange_nids:1; /* do hello protocol? */ unsigned int ksnr_nonagel:1; /* disable nagle? */ - unsigned int ksnr_connecting; /* autoconnect in progress? */ - unsigned int ksnr_deleted; /* been removed from peer? */ + unsigned int ksnr_eager:1; /* connect eagery? */ + unsigned int ksnr_connecting:1; /* autoconnect in progress? */ + unsigned int ksnr_deleted:1; /* been removed from peer? */ int ksnr_generation; /* connection incarnation # */ ksock_conn_t *ksnr_conn; /* NULL/active connection */ } ksock_route_t; @@ -346,13 +351,14 @@ typedef struct ksock_peer ptl_nid_t ksnp_nid; /* who's on the other end(s) */ atomic_t ksnp_refcount; /* # users */ int ksnp_closing; /* being closed */ + int ksnp_error; /* errno on closing last conn */ struct list_head ksnp_conns; /* all active connections */ struct list_head ksnp_routes; /* routes */ struct list_head ksnp_tx_queue; /* waiting packets */ + unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */ } ksock_peer_t; - extern nal_cb_t ksocknal_lib; extern ksock_nal_data_t ksocknal_data; @@ -393,8 +399,8 @@ extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int single, int keep_conn); extern int ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, struct socket *sock, int bind_irq); -extern void ksocknal_close_conn_locked (ksock_conn_t *conn); -extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn); +extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); +extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why); extern void ksocknal_terminate_conn (ksock_conn_t *conn); extern void ksocknal_destroy_conn (ksock_conn_t *conn); extern void ksocknal_put_conn (ksock_conn_t *conn); @@ -404,6 +410,7 @@ extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch); extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); extern void ksocknal_fmb_callback (void *arg, int error); +extern void ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive); extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); extern int ksocknal_scheduler (void *arg); @@ -411,4 +418,4 @@ extern void ksocknal_data_ready(struct sock *sk, int n); extern void ksocknal_write_space(struct sock *sk); extern int ksocknal_autoconnectd (void *arg); extern int ksocknal_reaper (void *arg); -extern int ksocknal_set_linger (struct socket *sock); +extern int ksocknal_setup_sock (struct socket *sock); diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 656a0c5cce68f2c17a1312712182b1dd681595a2..f946d87438d40ae2c2df4f3b967389ed3b97e349 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -25,12 +25,6 @@ #include "socknal.h" -int ksocknal_io_timeout = SOCKNAL_IO_TIMEOUT; -#if SOCKNAL_ZC -int ksocknal_do_zc = 1; -int ksocknal_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; -#endif - /* * LIB functions follow * @@ -225,7 +219,7 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *iov = tx->tx_iov; int fragsize = iov->iov_len; unsigned long vaddr = (unsigned long)iov->iov_base; - int more = !list_empty (&conn->ksnc_tx_queue) | + int more = (!list_empty (&conn->ksnc_tx_queue)) | (tx->tx_niov > 1) | (tx->tx_nkiov > 1); #if SOCKNAL_ZC @@ -241,10 +235,9 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (tx->tx_niov > 0); #if SOCKNAL_ZC - if (ksocknal_do_zc && + if (zcsize >= ksocknal_data.ksnd_zc_min_frag && (sock->sk->route_caps & NETIF_F_SG) && (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && - zcsize >= ksocknal_zc_min_frag && (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", @@ -306,7 +299,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) int fragsize = kiov->kiov_len; struct page *page = kiov->kiov_page; int offset = kiov->kiov_offset; - int more = !list_empty (&conn->ksnc_tx_queue) | + int more = (!list_empty (&conn->ksnc_tx_queue)) | (tx->tx_nkiov > 1); int rc; @@ -318,10 +311,9 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (tx->tx_nkiov > 0); #if SOCKNAL_ZC - if (ksocknal_do_zc && + if (fragsize >= ksocknal_data.ksnd_zc_min_frag && (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && - fragsize >= ksocknal_zc_min_frag) { + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { CDEBUG(D_NET, "page %p + offset %x for %d\n", page, offset, fragsize); @@ -349,6 +341,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) set_fs (KERNEL_DS); rc = sock_sendmsg(sock, &msg, fragsize); set_fs (oldmm); + kunmap (page); } @@ -410,6 +403,16 @@ ksocknal_sendmsg (ksock_conn_t *conn, ksock_tx_t *tx) break; } + /* Consider the connection alive since we managed to chuck + * more data into it. Really, we'd like to consider it + * alive only when the peer ACKs something, but + * write_space() only gets called back while SOCK_NOSPACE + * is set. Instead, we presume peer death has occurred if + * the socket doesn't drain within a timout */ + conn->ksnc_tx_deadline = jiffies + + ksocknal_data.ksnd_io_timeout * HZ; + conn->ksnc_peer->ksnp_last_alive = jiffies; + if (tx->tx_resid == 0) { /* sent everything */ rc = 0; break; @@ -420,6 +423,24 @@ ksocknal_sendmsg (ksock_conn_t *conn, ksock_tx_t *tx) RETURN (rc); } +void +ksocknal_eager_ack (ksock_conn_t *conn) +{ + int opt = 1; + mm_segment_t oldmm = get_fs(); + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK + * on, introducing delay in completing zero-copy sends in my + * peer. */ + + set_fs(KERNEL_DS); + sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof (opt)); + set_fs(oldmm); +} + int ksocknal_recv_iov (ksock_conn_t *conn) { @@ -453,6 +474,13 @@ ksocknal_recv_iov (ksock_conn_t *conn) if (rc <= 0) return (rc); + /* received something... */ + conn->ksnc_peer->ksnp_last_alive = jiffies; + conn->ksnc_rx_deadline = jiffies + + ksocknal_data.ksnd_io_timeout * HZ; + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + conn->ksnc_rx_nob_wanted -= rc; conn->ksnc_rx_nob_left -= rc; @@ -499,11 +527,19 @@ ksocknal_recv_kiov (ksock_conn_t *conn) rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); /* NB this is just a boolean............................^ */ set_fs (oldmm); + kunmap (page); if (rc <= 0) return (rc); + /* received something... */ + conn->ksnc_peer->ksnp_last_alive = jiffies; + conn->ksnc_rx_deadline = jiffies + + ksocknal_data.ksnd_io_timeout * HZ; + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + conn->ksnc_rx_nob_wanted -= rc; conn->ksnc_rx_nob_left -= rc; @@ -541,20 +577,33 @@ ksocknal_recvmsg (ksock_conn_t *conn) rc = -ESHUTDOWN; break; } - + if (conn->ksnc_rx_niov != 0) rc = ksocknal_recv_iov (conn); else rc = ksocknal_recv_kiov (conn); - + if (rc <= 0) { /* error/EOF or partial receive */ - if (rc == -EAGAIN) + if (rc == -EAGAIN) { rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } break; } + /* Completed a fragment */ + if (conn->ksnc_rx_nob_wanted == 0) { + /* Completed a message segment (header or payload) */ + if (ksocknal_data.ksnd_eager_ack && + (conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) { + /* Remind the socket to ack eagerly... */ + ksocknal_eager_ack(conn); + } rc = 1; break; } @@ -577,7 +626,6 @@ ksocknal_zc_callback (zccd_t *zcd) spin_lock_irqsave (&sched->kss_lock, flags); - list_del (&tx->tx_list); /* remove from kss_zctxpending_list */ list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); if (waitqueue_active (&sched->kss_waitq)) wake_up (&sched->kss_waitq); @@ -628,23 +676,12 @@ ksocknal_tx_launched (ksock_tx_t *tx) { #if SOCKNAL_ZC if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { - unsigned long flags; ksock_conn_t *conn = tx->tx_conn; - ksock_sched_t *sched = conn->ksnc_scheduler; /* zccd skbufs are still in-flight. First take a ref on * conn, so it hangs about for ksocknal_tx_done... */ atomic_inc (&conn->ksnc_refcount); - /* Stash it for timeout... - * NB We have to hold a lock to stash the tx, and we have - * stash it before we zcc_put(), but we have to _not_ hold - * this lock when we zcc_put(), otherwise we could deadlock - * if it turns out to be the last put. Aaaaarrrrggghhh! */ - spin_lock_irqsave (&sched->kss_lock, flags); - list_add_tail (&tx->tx_list, &conn->ksnc_tx_pending); - spin_unlock_irqrestore (&sched->kss_lock, flags); - /* ...then drop the initial ref on zccd, so the zero copy * callback can occur */ zccd_put (&tx->tx_zccd); @@ -659,10 +696,10 @@ ksocknal_tx_launched (ksock_tx_t *tx) void ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) { - ksock_conn_t *conn; - ksock_tx_t *tx; - int rc; - + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + LASSERT (!list_empty (&sched->kss_tx_conns)); conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list); list_del (&conn->ksnc_tx_list); @@ -686,7 +723,7 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); if (rc != 0) { - if (ksocknal_close_conn_unlocked (conn)) { + if (ksocknal_close_conn_unlocked (conn, rc)) { /* I'm the first to close */ CERROR ("[%p] Error %d on write to "LPX64" ip %08x:%d\n", conn, rc, conn->ksnc_peer->ksnp_nid, @@ -696,7 +733,6 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) spin_lock_irqsave (&sched->kss_lock, *irq_flags); } else if (tx->tx_resid == 0) { - /* everything went; assume more can go, and avoid * write_space locking */ conn->ksnc_tx_ready = 1; @@ -763,7 +799,8 @@ ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid) return (NULL); } - rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &target_nid); + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob, + &target_nid); if (rc != 0) { CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); return (NULL); @@ -820,8 +857,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) #endif spin_lock_irqsave (&sched->kss_lock, flags); - - tx->tx_deadline = jiffies_64 + ksocknal_io_timeout; + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); if (conn->ksnc_tx_ready && /* able to send */ @@ -839,7 +875,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) } ksock_route_t * -ksocknal_find_connectable_route_locked (ksock_peer_t *peer) +ksocknal_find_connectable_route_locked (ksock_peer_t *peer, int eager_only) { struct list_head *tmp; ksock_route_t *route; @@ -849,7 +885,8 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) if (route->ksnr_conn == NULL && /* not connected */ !route->ksnr_connecting && /* not connecting */ - route->ksnr_timeout <= jiffies_64) /* OK to retry */ + (!eager_only || route->ksnr_eager) && /* wants to connect */ + time_after_eq (jiffies, route->ksnr_timeout)) /* OK to retry */ return (route); } @@ -880,7 +917,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) ksock_conn_t *conn; ksock_route_t *route; rwlock_t *g_lock; - + /* Ensure the frags we've been given EXACTLY match the number of * bytes we want to send. Many TCP/IP stacks disregard any total * size parameters passed to them and just look at the frags. @@ -907,17 +944,19 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) return (PTL_FAIL); } - /* Any routes need to be connected? (need write lock if so) */ - if (ksocknal_find_connectable_route_locked (peer) == NULL) { + if (ksocknal_find_connectable_route_locked(peer, 1) == NULL) { conn = ksocknal_find_conn_locked (tx, peer); if (conn != NULL) { + /* I've got no unconnected autoconnect routes that + * need to be connected, and I do have an actual + * connection... */ ksocknal_queue_tx_locked (tx, conn); read_unlock (g_lock); return (PTL_OK); } } - /* need a write lock now to change peer state... */ + /* Making one or more connections; I'll need a write lock... */ atomic_inc (&peer->ksnp_refcount); /* +1 ref for me while I unlock */ read_unlock (g_lock); @@ -930,23 +969,37 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) } ksocknal_put_peer (peer); /* drop ref I got above */ - /* I may launch autoconnects, now we're write locked... */ - while ((route = ksocknal_find_connectable_route_locked (peer)) != NULL) + + for (;;) { + /* launch all eager autoconnections */ + route = ksocknal_find_connectable_route_locked (peer, 1); + if (route == NULL) + break; + ksocknal_launch_autoconnect_locked (route); + } conn = ksocknal_find_conn_locked (tx, peer); if (conn != NULL) { + /* Connection exists; queue message on it */ ksocknal_queue_tx_locked (tx, conn); write_unlock_irqrestore (g_lock, flags); return (PTL_OK); } - + if (ksocknal_find_connecting_route_locked (peer) == NULL) { - /* no routes actually connecting now */ - write_unlock_irqrestore (g_lock, flags); - return (PTL_FAIL); + /* no autoconnect routes actually connecting now. Scrape + * the barrel for non-eager autoconnects */ + route = ksocknal_find_connectable_route_locked (peer, 0); + if (route != NULL) { + ksocknal_launch_autoconnect_locked (route); + } else { + write_unlock_irqrestore (g_lock, flags); + return (PTL_FAIL); + } } + /* At least 1 connection is being established; queue the message... */ list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); write_unlock_irqrestore (g_lock, flags); @@ -1279,7 +1332,6 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) conn->ksnc_cookie = fmb; /* stash fmb for later */ conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; /* start timeout */ /* payload is desc's iov-ed buffer, but skipping the hdr */ LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / @@ -1323,7 +1375,7 @@ ksocknal_fwd_parse (ksock_conn_t *conn) dest_nid, body_len); ksocknal_new_packet (conn, 0); /* on to new packet */ - ksocknal_close_conn_unlocked (conn); /* give up on conn */ + ksocknal_close_conn_unlocked (conn, -EINVAL); /* give up on conn */ return; } @@ -1373,6 +1425,9 @@ ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) int skipped; if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + mb (); /* racing with timeout thread */ + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); @@ -1464,7 +1519,7 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) rc = ksocknal_recvmsg(conn); if (rc <= 0) { - if (ksocknal_close_conn_unlocked (conn)) { + if (ksocknal_close_conn_unlocked (conn, rc)) { /* I'm the first to close */ if (rc < 0) CERROR ("[%p] Error %d on read from "LPX64" ip %08x:%d\n", @@ -1478,6 +1533,7 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) goto out; } + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ goto out; /* try again later */ @@ -1506,9 +1562,6 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) /* sets wanted_len, iovs etc */ lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); - /* start timeout (lib is waiting for finalize) */ - conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; - if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ conn->ksnc_rx_state = SOCKNAL_RX_BODY; goto try_read; /* go read the payload */ @@ -1517,7 +1570,6 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) case SOCKNAL_RX_BODY: /* payload all received */ - conn->ksnc_rx_deadline = 0; /* cancel timeout */ lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); /* Fall through */ @@ -1534,9 +1586,6 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) NTOH__u64 (conn->ksnc_hdr.dest_nid), conn->ksnc_rx_nob_left); - /* cancel timeout (only needed it while fmb allocated) */ - conn->ksnc_rx_deadline = 0; - /* forward the packet. NB ksocknal_init_fmb() put fmb into * conn->ksnc_cookie */ fmb = (ksock_fmb_t *)conn->ksnc_cookie; @@ -1631,15 +1680,20 @@ int ksocknal_scheduler (void *arg) int id = sched - ksocknal_data.ksnd_schedulers; char name[16]; - snprintf (name, sizeof (name),"ksocknald[%d]", id); + snprintf (name, sizeof (name),"ksocknald_%02d", id); kportal_daemonize (name); kportal_blockallsigs (); #if (CONFIG_SMP && CPU_AFFINITY) - if ((cpu_online_map & (1 << id)) != 0) + if ((cpu_online_map & (1 << id)) != 0) { +#if 0 current->cpus_allowed = (1 << id); - else +#else + set_cpus_allowed (current, 1<<id); +#endif + } else { CERROR ("Can't set CPU affinity for %s\n", name); + } #endif /* CONFIG_SMP && CPU_AFFINITY */ spin_lock_irqsave (&sched->kss_lock, flags); @@ -1722,7 +1776,10 @@ ksocknal_data_ready (struct sock *sk, int n) if (conn == NULL) { /* raced with ksocknal_close_sock */ LASSERT (sk->sk_data_ready != &ksocknal_data_ready); sk->sk_data_ready (sk, n); - } else if (!conn->ksnc_rx_ready) { /* new news */ + goto out; + } + + if (!conn->ksnc_rx_ready) { /* new news */ /* Set ASAP in case of concurrent calls to me */ conn->ksnc_rx_ready = 1; @@ -1747,6 +1804,7 @@ ksocknal_data_ready (struct sock *sk, int n) spin_unlock_irqrestore (&sched->kss_lock, flags); } + out: read_unlock (&ksocknal_data.ksnd_global_lock); EXIT; @@ -1776,7 +1834,12 @@ ksocknal_write_space (struct sock *sk) if (conn == NULL) { /* raced with ksocknal_close_sock */ LASSERT (sk->sk_write_space != &ksocknal_write_space); sk->sk_write_space (sk); - } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ + + read_unlock (&ksocknal_data.ksnd_global_lock); + return; + } + + if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); if (!conn->ksnc_tx_ready) { /* new news */ @@ -1967,7 +2030,7 @@ ksocknal_exchange_nids (struct socket *sock, ptl_nid_t nid) } int -ksocknal_set_linger (struct socket *sock) +ksocknal_setup_sock (struct socket *sock) { mm_segment_t oldmm = get_fs (); int rc; @@ -1998,7 +2061,51 @@ ksocknal_set_linger (struct socket *sock) CERROR ("Can't set SO_LINGER2: %d\n", rc); return (rc); } + +#if SOCKNAL_USE_KEEPALIVES + /* Keepalives: If 3/4 of the timeout elapses, start probing every + * second until the timeout elapses. */ + + option = (ksocknal_data.ksnd_io_timeout * 3) / 4; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); + return (rc); + } + option = 1; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); + return (rc); + } + + option = ksocknal_data.ksnd_io_timeout / 4; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); + return (rc); + } + + option = 1; + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); + return (rc); + } +#endif return (0); } @@ -2007,7 +2114,6 @@ ksocknal_connect_peer (ksock_route_t *route) { struct sockaddr_in peer_addr; mm_segment_t oldmm = get_fs(); - __u64 n; struct timeval tv; int fd; struct socket *sock; @@ -2020,8 +2126,8 @@ ksocknal_connect_peer (ksock_route_t *route) } /* Ugh; have to map_fd for compatibility with sockets passed in - * from userspace. And we actually need the refcounting that - * this gives you :) */ + * from userspace. And we actually need the sock->file refcounting + * that this gives you :) */ fd = sock_map_fd (sock); if (fd < 0) { @@ -2033,22 +2139,19 @@ ksocknal_connect_peer (ksock_route_t *route) /* NB the fd now owns the ref on sock->file */ LASSERT (sock->file != NULL); LASSERT (file_count(sock->file) == 1); - + /* Set the socket timeouts, so our connection attempt completes in * finite time */ - tv.tv_sec = ksocknal_io_timeout / HZ; - n = ksocknal_io_timeout % HZ; - n = n * 1000000 + HZ - 1; - do_div (n, HZ); - tv.tv_usec = n; + tv.tv_sec = ksocknal_data.ksnd_io_timeout; + tv.tv_usec = 0; set_fs (KERNEL_DS); rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof (tv)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set send timeout %d (in HZ): %d\n", - ksocknal_io_timeout, rc); + CERROR ("Can't set send timeout %d: %d\n", + ksocknal_data.ksnd_io_timeout, rc); goto out; } @@ -2057,8 +2160,8 @@ ksocknal_connect_peer (ksock_route_t *route) (char *)&tv, sizeof (tv)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set receive timeout %d (in HZ): %d\n", - ksocknal_io_timeout, rc); + CERROR ("Can't set receive timeout %d: %d\n", + ksocknal_data.ksnd_io_timeout, rc); goto out; } @@ -2154,7 +2257,7 @@ ksocknal_autoconnect (ksock_route_t *route) route->ksnr_connecting = 0; LASSERT (route->ksnr_retry_interval != 0); - route->ksnr_timeout = jiffies_64 + route->ksnr_retry_interval; + route->ksnr_timeout = jiffies + route->ksnr_retry_interval; route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2, SOCKNAL_MAX_RECONNECT_INTERVAL); @@ -2198,7 +2301,7 @@ ksocknal_autoconnectd (void *arg) ksock_route_t *route; int rc; - snprintf (name, sizeof (name), "ksocknal_ad[%ld]", id); + snprintf (name, sizeof (name), "ksocknal_ad%02ld", id); kportal_daemonize (name); kportal_blockallsigs (); @@ -2239,55 +2342,47 @@ ksock_conn_t * ksocknal_find_timed_out_conn (ksock_peer_t *peer) { /* We're called with a shared lock on ksnd_global_lock */ - unsigned long flags; ksock_conn_t *conn; struct list_head *ctmp; - ksock_tx_t *tx; - struct list_head *ttmp; ksock_sched_t *sched; list_for_each (ctmp, &peer->ksnp_conns) { conn = list_entry (ctmp, ksock_conn_t, ksnc_list); sched = conn->ksnc_scheduler; - - if (conn->ksnc_rx_deadline != 0 && - conn->ksnc_rx_deadline <= jiffies_64) - goto timed_out; - spin_lock_irqsave (&sched->kss_lock, flags); + /* Don't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); - list_for_each (ttmp, &conn->ksnc_tx_queue) { - tx = list_entry (ttmp, ksock_tx_t, tx_list); - LASSERT (tx->tx_deadline != 0); - - if (tx->tx_deadline <= jiffies_64) - goto timed_out_locked; + if (conn->ksnc_rx_started && + time_after_eq (jiffies, conn->ksnc_rx_deadline)) { + /* Timed out incomplete incoming message */ + atomic_inc (&conn->ksnc_refcount); + CERROR ("Timed out RX from "LPX64" %p\n", + peer->ksnp_nid, conn); + return (conn); } -#if SOCKNAL_ZC - list_for_each (ttmp, &conn->ksnc_tx_pending) { - tx = list_entry (ttmp, ksock_tx_t, tx_list); - LASSERT (tx->tx_deadline != 0); - - if (tx->tx_deadline <= jiffies_64) - goto timed_out_locked; + + if ((!list_empty (&conn->ksnc_tx_queue) || + conn->ksnc_sock->sk->wmem_queued != 0) && + time_after_eq (jiffies, conn->ksnc_tx_deadline)) { + /* Timed out messages queued for sending, or + * messages buffered in the socket's send buffer */ + atomic_inc (&conn->ksnc_refcount); + CERROR ("Timed out TX to "LPX64" %s%d %p\n", + peer->ksnp_nid, + list_empty (&conn->ksnc_tx_queue) ? "" : "Q ", + conn->ksnc_sock->sk->wmem_queued, conn); + return (conn); } -#endif - spin_unlock_irqrestore (&sched->kss_lock, flags); - continue; - - timed_out_locked: - spin_unlock_irqrestore (&sched->kss_lock, flags); - timed_out: - atomic_inc (&conn->ksnc_refcount); - return (conn); } return (NULL); } void -ksocknal_check_peer_timeouts (struct list_head *peers) +ksocknal_check_peer_timeouts (int idx) { + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; struct list_head *ptmp; ksock_peer_t *peer; ksock_conn_t *conn; @@ -2305,7 +2400,7 @@ ksocknal_check_peer_timeouts (struct list_head *peers) if (conn != NULL) { read_unlock (&ksocknal_data.ksnd_global_lock); - if (ksocknal_close_conn_unlocked (conn)) { + if (ksocknal_close_conn_unlocked (conn, -ETIMEDOUT)) { /* I actually closed... */ CERROR ("Timeout out conn->"LPX64" ip %x:%d\n", peer->ksnp_nid, conn->ksnc_ipaddr, @@ -2330,8 +2425,9 @@ ksocknal_reaper (void *arg) unsigned long flags; ksock_conn_t *conn; int timeout; + int i; int peer_index = 0; - __u64 deadline = jiffies_64; + unsigned long deadline = jiffies; kportal_daemonize ("ksocknal_reaper"); kportal_blockallsigs (); @@ -2371,12 +2467,32 @@ ksocknal_reaper (void *arg) spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); - while ((timeout = deadline - jiffies_64) <= 0) { - /* Time to check for timeouts on a few more peers */ - ksocknal_check_peer_timeouts (&ksocknal_data.ksnd_peers[peer_index]); + /* careful with the jiffy wrap... */ + while ((timeout = ((int)deadline - (int)jiffies)) <= 0) { + const int n = 4; + const int p = 1; + int chunk = ksocknal_data.ksnd_peer_hash_size; + + /* Time to check for timeouts on a few more peers: I do + * checks every 'p' seconds on a proportion of the peer + * table and I need to check every connection 'n' times + * within a timeout interval, to ensure I detect a + * timeout on any connection within (n+1)/n times the + * timeout interval. */ + + if (ksocknal_data.ksnd_io_timeout > n * p) + chunk = (chunk * n * p) / + ksocknal_data.ksnd_io_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts (peer_index); + peer_index = (peer_index + 1) % + ksocknal_data.ksnd_peer_hash_size; + } - peer_index = (peer_index + 1) % SOCKNAL_PEER_HASH_SIZE; - deadline += HZ; + deadline += p * HZ; } add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); diff --git a/lnet/klnds/toelnd/toenal_cb.c b/lnet/klnds/toelnd/toenal_cb.c index abd07315ce4a3bf58779d28159570187dac1f64c..478f3c1387ed7687adaa710a45b99a960ab42ad3 100644 --- a/lnet/klnds/toelnd/toenal_cb.c +++ b/lnet/klnds/toelnd/toenal_cb.c @@ -431,7 +431,8 @@ ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie, if ((conn = ktoenal_get_conn (nid)) == NULL) { /* It's not a peer; try to find a gateway */ - rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid); + rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, payload_niov, + &gatewaynid); if (rc != 0) { CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); diff --git a/lnet/libcfs/Makefile.am b/lnet/libcfs/Makefile.am index 20d7fbd4f6a37eb7787c2cde3092765d3c905761..cf9220b5a2a2b07dba3165841f1f39854695d682 100644 --- a/lnet/libcfs/Makefile.am +++ b/lnet/libcfs/Makefile.am @@ -20,7 +20,7 @@ link-stamp: echo timestamp > link-stamp DEFS = -portals_SOURCES = $(LINKS) module.c proc.c debug.c +portals_SOURCES = $(LINKS) module.c proc.c debug.c lwt.c # Don't distribute any patched files. dist-hook: diff --git a/lnet/libcfs/debug.c b/lnet/libcfs/debug.c index 63e71eec712f2e284e03aa9bc68b9f29a82cfa3c..90eb185da7243cced18e3d1186c889e0475f7340 100644 --- a/lnet/libcfs/debug.c +++ b/lnet/libcfs/debug.c @@ -790,41 +790,61 @@ void portals_debug_set_level(unsigned int debug_level) portal_debug = debug_level; } +void portals_run_upcall(char **argv) +{ + int rc; + int argc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + ENTRY; + + argv[0] = portals_upcall; + argc = 1; + while (argv[argc] != NULL) + argc++; + + LASSERT(argc >= 2); + + rc = call_usermodehelper(argv[0], argv, envp); + if (rc < 0) { + CERROR("Error %d invoking portals upcall %s %s%s%s%s%s%s%s%s; " + "check /proc/sys/portals/upcall\n", + rc, argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } else { + CERROR("Invoked portals upcall %s %s%s%s%s%s%s%s%s\n", + argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } +} + void portals_run_lbug_upcall(char *file, const char *fn, const int line) { char *argv[6]; - char *envp[3]; char buf[32]; - int rc; ENTRY; snprintf (buf, sizeof buf, "%d", line); - argv[0] = portals_upcall; argv[1] = "LBUG"; argv[2] = file; argv[3] = (char *)fn; argv[4] = buf; argv[5] = NULL; - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - rc = call_usermodehelper(argv[0], argv, envp); - if (rc < 0) { - CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check " - "/proc/sys/portals/upcall\n", - argv[0], argv[1], argv[2], argv[3], argv[4], rc); - - } else { - CERROR("Invoked upcall %s %s %s %s %s\n", - argv[0], argv[1], argv[2], argv[3], argv[4]); - } + portals_run_upcall (argv); } - EXPORT_SYMBOL(portals_debug_dumplog); EXPORT_SYMBOL(portals_debug_msg); EXPORT_SYMBOL(portals_debug_set_level); +EXPORT_SYMBOL(portals_run_upcall); EXPORT_SYMBOL(portals_run_lbug_upcall); diff --git a/lnet/libcfs/lwt.c b/lnet/libcfs/lwt.c new file mode 100644 index 0000000000000000000000000000000000000000..a40a7ed5127e07b7073fb2b8cd31f7e350b8588a --- /dev/null +++ b/lnet/libcfs/lwt.c @@ -0,0 +1,235 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2003 Cluster File Systems, Inc. + * Author: Eric Barton <eeb@clusterfs.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kmod.h> +#include <linux/kernel.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/smp_lock.h> +#include <linux/unistd.h> +#include <linux/interrupt.h> +#include <asm/system.h> +#include <asm/uaccess.h> + +#define DEBUG_SUBSYSTEM S_PORTALS + +#include <linux/kp30.h> + +#if LWT_SUPPORT + +#define LWT_MEMORY (1<<20) /* 1Mb of trace memory */ +#define LWT_MAX_CPUS 4 + +int lwt_enabled; +int lwt_pages_per_cpu; +lwt_cpu_t lwt_cpus[LWT_MAX_CPUS]; + +/* NB only root is allowed to retrieve LWT info; it's an open door into the + * kernel... */ + +int +lwt_lookup_string (int *size, char *knl_ptr, + char *user_ptr, int user_size) +{ + /* knl_ptr was retrieved from an LWT snapshot and the caller wants to + * turn it into a string. NB we can crash with an access violation + * trying to determine the string length, so we're trusting our + * caller... */ + + if (!capable(CAP_SYS_ADMIN)) + return (-EPERM); + + *size = strlen (knl_ptr) + 1; + + if (user_ptr != NULL && + copy_to_user (user_ptr, knl_ptr, *size)) + return (-EFAULT); + + return (0); +} + +int +lwt_control (int enable, int clear) +{ + lwt_page_t *p; + int i; + int j; + + if (!capable(CAP_SYS_ADMIN)) + return (-EPERM); + + if (clear) + for (i = 0; i < num_online_cpus(); i++) { + p = lwt_cpus[i].lwtc_current_page; + + for (j = 0; j < lwt_pages_per_cpu; j++) { + + memset (p->lwtp_events, 0, PAGE_SIZE); + + p = list_entry (p->lwtp_list.next, + lwt_page_t, lwtp_list); + } + } + + lwt_enabled = enable; + mb(); + if (!enable) { + /* give people some time to stop adding traces */ + schedule_timeout(10); + } + + return (0); +} + +int +lwt_snapshot (int *ncpu, int *total_size, + void *user_ptr, int user_size) +{ + const int events_per_page = PAGE_SIZE / sizeof(lwt_event_t); + const int bytes_per_page = events_per_page * sizeof(lwt_event_t); + lwt_page_t *p; + int i; + int j; + + if (!capable(CAP_SYS_ADMIN)) + return (-EPERM); + + *ncpu = num_online_cpus(); + *total_size = num_online_cpus() * lwt_pages_per_cpu * bytes_per_page; + + if (user_ptr == NULL) + return (0); + + for (i = 0; i < num_online_cpus(); i++) { + p = lwt_cpus[i].lwtc_current_page; + + for (j = 0; j < lwt_pages_per_cpu; j++) { + if (copy_to_user(user_ptr, p->lwtp_events, + bytes_per_page)) + return (-EFAULT); + + user_ptr = ((char *)user_ptr) + bytes_per_page; + p = list_entry(p->lwtp_list.next, + lwt_page_t, lwtp_list); + + } + } + + return (0); +} + +int +lwt_init () +{ + int i; + int j; + + if (num_online_cpus() > LWT_MAX_CPUS) { + CERROR ("Too many CPUs\n"); + return (-EINVAL); + } + + /* NULL pointers, zero scalars */ + memset (lwt_cpus, 0, sizeof (lwt_cpus)); + lwt_pages_per_cpu = LWT_MEMORY / (num_online_cpus() * PAGE_SIZE); + + for (i = 0; i < num_online_cpus(); i++) + for (j = 0; j < lwt_pages_per_cpu; j++) { + struct page *page = alloc_page (GFP_KERNEL); + lwt_page_t *lwtp; + + if (page == NULL) { + CERROR ("Can't allocate page\n"); + lwt_fini (); + return (-ENOMEM); + } + + PORTAL_ALLOC(lwtp, sizeof (*lwtp)); + if (lwtp == NULL) { + CERROR ("Can't allocate lwtp\n"); + __free_page(page); + lwt_fini (); + return (-ENOMEM); + } + + lwtp->lwtp_page = page; + lwtp->lwtp_events = page_address(page); + memset (lwtp->lwtp_events, 0, PAGE_SIZE); + + if (j == 0) { + INIT_LIST_HEAD (&lwtp->lwtp_list); + lwt_cpus[i].lwtc_current_page = lwtp; + } else { + list_add (&lwtp->lwtp_list, + &lwt_cpus[i].lwtc_current_page->lwtp_list); + } + } + + lwt_enabled = 1; + mb(); + + return (0); +} + +void +lwt_fini () +{ + int i; + + if (num_online_cpus() > LWT_MAX_CPUS) + return; + + for (i = 0; i < num_online_cpus(); i++) + while (lwt_cpus[i].lwtc_current_page != NULL) { + lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page; + + if (list_empty (&lwtp->lwtp_list)) { + lwt_cpus[i].lwtc_current_page = NULL; + } else { + lwt_cpus[i].lwtc_current_page = + list_entry (lwtp->lwtp_list.next, + lwt_page_t, lwtp_list); + + list_del (&lwtp->lwtp_list); + } + + __free_page (lwtp->lwtp_page); + PORTAL_FREE (lwtp, sizeof (*lwtp)); + } +} + +EXPORT_SYMBOL(lwt_enabled); +EXPORT_SYMBOL(lwt_cpus); + +EXPORT_SYMBOL(lwt_init); +EXPORT_SYMBOL(lwt_fini); +EXPORT_SYMBOL(lwt_lookup_string); +EXPORT_SYMBOL(lwt_control); +EXPORT_SYMBOL(lwt_snapshot); +#endif diff --git a/lnet/libcfs/module.c b/lnet/libcfs/module.c index c72dffc07bdca7cb57fe97876e1673d35caf0681..11e4f4c0e42fdbb613f612a1d07a217d81fa2144 100644 --- a/lnet/libcfs/module.c +++ b/lnet/libcfs/module.c @@ -122,8 +122,8 @@ static inline void freedata(void *data, int len) } static int -kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, - ptl_nid_t hi_nid) +kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid) { int rc; kpr_control_interface_t *ci; @@ -139,7 +139,8 @@ kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, } static int -kportal_del_route(ptl_nid_t target) +kportal_del_route(int gw_nalid, ptl_nid_t gw_nid, + ptl_nid_t lo, ptl_nid_t hi) { int rc; kpr_control_interface_t *ci; @@ -148,7 +149,24 @@ kportal_del_route(ptl_nid_t target) if (ci == NULL) return (-ENODEV); - rc = ci->kprci_del_route (target); + rc = ci->kprci_del_route (gw_nalid, gw_nid, lo, hi); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +static int +kportal_notify_router (int gw_nalid, ptl_nid_t gw_nid, + int alive, time_t when) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_notify (gw_nalid, gw_nid, alive, when); PORTAL_SYMBOL_PUT(kpr_control_interface); return (rc); @@ -156,12 +174,13 @@ kportal_del_route(ptl_nid_t target) static int kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, - ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp) + ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp, int *alivep) { int gateway_nalid; ptl_nid_t gateway_nid; ptl_nid_t lo_nid; ptl_nid_t hi_nid; + int alive; int rc; kpr_control_interface_t *ci; @@ -169,17 +188,19 @@ kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, if (ci == NULL) return (-ENODEV); - rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid, - &hi_nid); + rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, + &lo_nid, &hi_nid, &alive); if (rc == 0) { - CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n", - index, gateway_nalid, gateway_nid, lo_nid, hi_nid); + CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64", %s\n", + index, gateway_nalid, gateway_nid, lo_nid, hi_nid, + alive ? "up" : "down"); *gateway_nalidp = (__u32)gateway_nalid; - *gateway_nidp = (__u32)gateway_nid; - *lo_nidp = (__u32)lo_nid; - *hi_nidp = (__u32)hi_nid; + *gateway_nidp = gateway_nid; + *lo_nidp = lo_nid; + *hi_nidp = hi_nid; + *alivep = alive; } PORTAL_SYMBOL_PUT (kpr_control_interface); @@ -367,23 +388,38 @@ static int kportal_ioctl(struct inode *inode, struct file *file, case IOC_PORTAL_ADD_ROUTE: CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n", - data->ioc_nal, data->ioc_nid, data->ioc_nid2, - data->ioc_nid3); + data->ioc_nal, data->ioc_nid, + data->ioc_nid2, data->ioc_nid3); err = kportal_add_route(data->ioc_nal, data->ioc_nid, - MIN (data->ioc_nid2, data->ioc_nid3), - MAX (data->ioc_nid2, data->ioc_nid3)); + data->ioc_nid2, data->ioc_nid3); break; case IOC_PORTAL_DEL_ROUTE: - CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid); - err = kportal_del_route (data->ioc_nid); + CDEBUG (D_IOCTL, "Removing routes via [%d] "LPU64" : "LPU64" - "LPU64"\n", + data->ioc_nal, data->ioc_nid, + data->ioc_nid2, data->ioc_nid3); + err = kportal_del_route (data->ioc_nal, data->ioc_nid, + data->ioc_nid2, data->ioc_nid3); break; + case IOC_PORTAL_NOTIFY_ROUTER: { + CDEBUG (D_IOCTL, "Notifying peer [%d] "LPU64" %s @ %ld\n", + data->ioc_nal, data->ioc_nid, + data->ioc_flags ? "Enabling" : "Disabling", + (time_t)data->ioc_nid3); + + err = kportal_notify_router (data->ioc_nal, data->ioc_nid, + data->ioc_flags, + (time_t)data->ioc_nid3); + break; + } + case IOC_PORTAL_GET_ROUTE: CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count); err = kportal_get_route(data->ioc_count, &data->ioc_nal, - &data->ioc_nid, &data->ioc_nid2, - &data->ioc_nid3); + &data->ioc_nid, + &data->ioc_nid2, &data->ioc_nid3, + &data->ioc_flags); if (err == 0) if (copy_to_user((char *)arg, data, sizeof (*data))) err = -EFAULT; @@ -432,7 +468,27 @@ static int kportal_ioctl(struct inode *inode, struct file *file, kportal_put_ni (data->ioc_nal); break; } - +#if LWT_SUPPORT + case IOC_PORTAL_LWT_CONTROL: + err = lwt_control (data->ioc_flags, data->ioc_misc); + break; + + case IOC_PORTAL_LWT_SNAPSHOT: + err = lwt_snapshot (&data->ioc_count, &data->ioc_misc, + data->ioc_pbuf1, data->ioc_plen1); + if (err == 0 && + copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_LWT_LOOKUP_STRING: + err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, + data->ioc_pbuf2, data->ioc_plen2); + if (err == 0 && + copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; +#endif default: err = -EINVAL; break; @@ -471,12 +527,19 @@ static int init_kportals_module(void) return (rc); } +#if LWT_SUPPORT + rc = lwt_init(); + if (rc != 0) { + CERROR("lwt_init: error %d\n", rc); + goto cleanup_debug; + } +#endif sema_init(&nal_cmd_sem, 1); rc = misc_register(&portal_dev); if (rc) { CERROR("misc_register: error %d\n", rc); - goto cleanup_debug; + goto cleanup_lwt; } rc = PtlInit(); @@ -498,6 +561,10 @@ static int init_kportals_module(void) PtlFini(); cleanup_deregister: misc_deregister(&portal_dev); + cleanup_lwt: +#if LWT_SUPPORT + lwt_fini(); +#endif cleanup_debug: portals_debug_cleanup(); return rc; @@ -518,6 +585,10 @@ static void exit_kportals_module(void) if (rc) CERROR("misc_deregister error %d\n", rc); +#if LWT_SUPPORT + lwt_fini(); +#endif + if (atomic_read(&portal_kmemory) != 0) CERROR("Portals memory leaked: %d bytes\n", atomic_read(&portal_kmemory)); diff --git a/lnet/router/router.c b/lnet/router/router.c index 27a7fbae5f2e0183c636a7aba63d233e033e580e..a03fb423bd70069e4ae5ddcd3aff44baedbd284c 100644 --- a/lnet/router/router.c +++ b/lnet/router/router.c @@ -24,6 +24,7 @@ #include "router.h" LIST_HEAD(kpr_routes); +LIST_HEAD(kpr_gateways); LIST_HEAD(kpr_nals); unsigned long long kpr_fwd_bytes; @@ -42,6 +43,7 @@ kpr_router_interface_t kpr_router_interface = { kprri_lookup: kpr_lookup_target, kprri_fwd_start: kpr_forward_packet, kprri_fwd_done: kpr_complete_packet, + kprri_notify: kpr_nal_notify, kprri_shutdown: kpr_shutdown_nal, kprri_deregister: kpr_deregister_nal, }; @@ -50,6 +52,7 @@ kpr_control_interface_t kpr_control_interface = { kprci_add_route: kpr_add_route, kprci_del_route: kpr_del_route, kprci_get_route: kpr_get_route, + kprci_notify: kpr_sys_notify, }; int @@ -59,7 +62,7 @@ kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) struct list_head *e; kpr_nal_entry_t *ne; - CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid); + CDEBUG (D_NET, "Registering NAL %d\n", nalif->kprni_nalid); PORTAL_ALLOC (ne, sizeof (*ne)); if (ne == NULL) @@ -95,13 +98,182 @@ kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) return (0); } +void +kpr_do_upcall (void *arg) +{ + kpr_upcall_t *u = (kpr_upcall_t *)arg; + char nalstr[10]; + char nidstr[36]; + char whenstr[36]; + char *argv[] = { + NULL, + "ROUTER_NOTIFY", + nalstr, + nidstr, + u->kpru_alive ? "up" : "down", + whenstr, + NULL}; + + snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id); + snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid); + snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when); + + portals_run_upcall (argv); +} + +void +kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when) +{ + /* May be in arbitrary context */ + kpr_upcall_t *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC); + + if (u == NULL) { + CERROR ("Upcall out of memory: nal %d nid "LPX64" %s\n", + gw_nalid, gw_nid, alive ? "up" : "down"); + return; + } + + u->kpru_nal_id = gw_nalid; + u->kpru_nid = gw_nid; + u->kpru_alive = alive; + u->kpru_when = when; + + prepare_work (&u->kpru_tq, kpr_do_upcall, u); + schedule_work (&u->kpru_tq); +} + +int +kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, + int alive, time_t when) +{ + unsigned long flags; + int rc = -ENOENT; + kpr_nal_entry_t *ne = NULL; + kpr_gateway_entry_t *ge = NULL; + struct timeval now; + struct list_head *e; + struct list_head *n; + + CDEBUG (D_ERROR, "%s notifying [%d] "LPX64": %s\n", + byNal ? "NAL" : "userspace", + gateway_nalid, gateway_nid, alive ? "up" : "down"); + + /* can't do predictions... */ + do_gettimeofday (&now); + if (when > now.tv_sec) { + CERROR ("Ignoring prediction from %s of [%d] "LPX64" %s " + "%ld seconds in the future\n", + byNal ? "NAL" : "userspace", + gateway_nalid, gateway_nid, alive ? "up" : "down", + when - now.tv_sec); + return (EINVAL); + } + + LASSERT (when <= now.tv_sec); + + /* Serialise with lookups (i.e. write lock) */ + write_lock_irqsave(&kpr_rwlock, flags); + + list_for_each_safe (e, n, &kpr_gateways) { + + ge = list_entry(e, kpr_gateway_entry_t, kpge_list); + if ((gateway_nalid != 0 && + ge->kpge_nalid != gateway_nalid) || + ge->kpge_nid != gateway_nid) + continue; + + rc = 0; + break; + } + + if (rc != 0) { + /* gateway not found */ + write_unlock_irqrestore(&kpr_rwlock, flags); + CERROR ("Gateway not found\n"); + return (rc); + } + + if (when < ge->kpge_timestamp) { + /* out of date information */ + write_unlock_irqrestore (&kpr_rwlock, flags); + CERROR ("Out of date\n"); + return (0); + } + + /* update timestamp */ + ge->kpge_timestamp = when; + + if ((!ge->kpge_alive) == (!alive)) { + /* new date for old news */ + write_unlock_irqrestore (&kpr_rwlock, flags); + CERROR ("Old news\n"); + return (0); + } + + ge->kpge_alive = alive; + CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive); + + if (alive) { + /* Reset all gateway weights so the newly-enabled gateway + * doesn't have to play catch-up */ + list_for_each_safe (e, n, &kpr_gateways) { + kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t, + kpge_list); + atomic_set (&ge->kpge_weight, 0); + } + } + + if (!byNal) { + /* userland notified me: notify NAL? */ + ne = kpr_find_nal_entry_locked (ge->kpge_nalid); + if (ne != NULL) { + if (ne->kpne_shutdown || + ne->kpne_interface.kprni_notify == NULL) { + /* no need to notify */ + ne = NULL; + } else { + /* take a ref on this NAL until notifying + * it has completed... */ + atomic_inc (&ne->kpne_refcount); + } + } + } + + write_unlock_irqrestore(&kpr_rwlock, flags); + + if (ne != NULL) { + ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg, + gateway_nid, alive); + /* 'ne' can disappear now... */ + atomic_dec (&ne->kpne_refcount); + } + + if (byNal) { + /* It wasn't userland that notified me... */ + CERROR ("Doing upcall\n"); + kpr_upcall (gateway_nalid, gateway_nid, alive, when); + } else { + CERROR (" NOT Doing upcall\n"); + } + + return (0); +} + +void +kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when) +{ + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when); +} + void kpr_shutdown_nal (void *arg) { unsigned long flags; kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); LASSERT (!ne->kpne_shutdown); LASSERT (!in_interrupt()); @@ -126,7 +298,7 @@ kpr_deregister_nal (void *arg) unsigned long flags; kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */ @@ -142,15 +314,58 @@ kpr_deregister_nal (void *arg) PORTAL_MODULE_UNUSE; } +int +kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2) +{ + const int significant_bits = 0x00ffffff; + /* We use atomic_t to record/compare route weights for + * load-balancing. Here we limit ourselves to only using + * 'significant_bits' when we do an 'after' comparison */ + + int diff = (atomic_read (&ge1->kpge_weight) - + atomic_read (&ge2->kpge_weight)) & significant_bits; + int rc = (diff > (significant_bits >> 1)); + + CDEBUG(D_NET, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n", + ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight), + rc ? ">" : "<", + ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight)); + + return (rc); +} + +void +kpr_update_weight (kpr_gateway_entry_t *ge, int nob) +{ + int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t); + + /* We've chosen this route entry (i.e. gateway) to forward payload + * of length 'nob'; update the route's weight to make it less + * favoured. Note that the weight is 1 plus the payload size + * rounded and scaled to the portals header size, so we get better + * use of the significant bits in kpge_weight. */ + + CDEBUG(D_NET, "gateway [%p]"LPX64" += %d\n", ge, + ge->kpge_nid, weight); + + atomic_add (weight, &ge->kpge_weight); +} int -kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) +kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, + ptl_nid_t *gateway_nidp) { - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - struct list_head *e; - int rc = -ENOENT; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + struct list_head *e; + kpr_route_entry_t *re; + kpr_gateway_entry_t *ge = NULL; + int rc = -ENOENT; - CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid); + /* Caller wants to know if 'target_nid' can be reached via a gateway + * ON HER OWN NETWORK */ + + CDEBUG (D_NET, "lookup "LPX64" from NAL %d\n", target_nid, + ne->kpne_interface.kprni_nalid); if (ne->kpne_shutdown) /* caller is shutting down */ return (-ENOENT); @@ -159,9 +374,8 @@ kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) /* Search routes for one that has a gateway to target_nid on the callers network */ - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) - { - kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + list_for_each (e, &kpr_routes) { + re = list_entry (e, kpr_route_entry_t, kpre_list); if (re->kpre_lo_nid > target_nid || re->kpre_hi_nid < target_nid) @@ -169,33 +383,66 @@ kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) /* found table entry */ - if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */ - rc = -EHOSTUNREACH; - else - { - rc = 0; - *gateway_nidp = re->kpre_gateway_nid; - } - break; + if (re->kpre_gateway->kpge_nalid != ne->kpne_interface.kprni_nalid || + !re->kpre_gateway->kpge_alive) { + /* different NAL or gateway down */ + rc = -EHOSTUNREACH; + continue; + } + + if (ge == NULL || + kpr_ge_isbetter (re->kpre_gateway, ge)) + ge = re->kpre_gateway; } + if (ge != NULL) { + kpr_update_weight (ge, nob); + *gateway_nidp = ge->kpge_nid; + rc = 0; + } + read_unlock (&kpr_rwlock); - CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", + /* NB can't deref 're' now; it might have been removed! */ + + CDEBUG (D_NET, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", target_nid, ne->kpne_interface.kprni_nalid, rc, (rc == 0) ? *gateway_nidp : (ptl_nid_t)0); return (rc); } +kpr_nal_entry_t * +kpr_find_nal_entry_locked (int nal_id) +{ + struct list_head *e; + + /* Called with kpr_rwlock held */ + + list_for_each (e, &kpr_nals) { + kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */ + continue; + + return (ne); + } + + return (NULL); +} + void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) { - kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; - ptl_nid_t target_nid = fwd->kprfd_target_nid; - int nob = fwd->kprfd_nob; - struct list_head *e; - - CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd, + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; + ptl_nid_t target_nid = fwd->kprfd_target_nid; + int nob = fwd->kprfd_nob; + kpr_gateway_entry_t *ge = NULL; + kpr_nal_entry_t *dst_ne = NULL; + struct list_head *e; + kpr_route_entry_t *re; + kpr_nal_entry_t *tmp_ne; + + CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d\n", fwd, target_nid, src_ne->kpne_interface.kprni_nalid); LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */ @@ -216,53 +463,58 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) - { - kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + list_for_each (e, &kpr_routes) { + re = list_entry (e, kpr_route_entry_t, kpre_list); if (re->kpre_lo_nid > target_nid || /* no match */ re->kpre_hi_nid < target_nid) continue; - CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd, - target_nid, src_ne->kpne_interface.kprni_nalid, - re->kpre_gateway_nid, re->kpre_gateway_nalid); - - if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid) - break; /* don't route to same NAL */ + if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid) + continue; /* don't route to same NAL */ - /* Search for gateway's NAL's entry */ - - for (e = kpr_nals.next; e != &kpr_nals; e = e->next) - { - kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list); + if (!re->kpre_gateway->kpge_alive) + continue; /* gateway is dead */ + + tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid); - if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */ - continue; + if (tmp_ne == NULL || + tmp_ne->kpne_shutdown) { + /* NAL must be registered and not shutting down */ + continue; + } - if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */ - break; + if (ge == NULL || + kpr_ge_isbetter (re->kpre_gateway, ge)) { + ge = re->kpre_gateway; + dst_ne = tmp_ne; + } + } + + if (ge != NULL) { + LASSERT (dst_ne != NULL); + + kpr_update_weight (ge, nob); - fwd->kprfd_gateway_nid = re->kpre_gateway_nid; - atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ + fwd->kprfd_gateway_nid = ge->kpge_nid; + atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ - read_unlock (&kpr_rwlock); + read_unlock (&kpr_rwlock); - CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd, - target_nid, src_ne->kpne_interface.kprni_nalid, - fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d: " + "to "LPX64" on NAL %d\n", + fwd, target_nid, src_ne->kpne_interface.kprni_nalid, + fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); - dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); - return; - } - break; + dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); + return; } - read_unlock (&kpr_rwlock); + read_unlock (&kpr_rwlock); out: kpr_fwd_errors++; - CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, + CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, target_nid, src_ne->kpne_interface.kprni_nalid); /* Can't find anywhere to forward to */ @@ -278,14 +530,14 @@ kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg; kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg; - CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, + CDEBUG (D_NET, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error); atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! */ (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error); - CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd, + CDEBUG (D_NET, "complete(2) [%p] from NAL %d: %d\n", fwd, src_ne->kpne_interface.kprni_nalid, error); atomic_dec (&kpr_queue_depth); @@ -293,49 +545,76 @@ kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) } int -kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, - ptl_nid_t hi_nid) +kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid) { - unsigned long flags; - struct list_head *e; - kpr_route_entry_t *re; + unsigned long flags; + struct list_head *e; + kpr_route_entry_t *re; + kpr_gateway_entry_t *ge; + int dup = 0; - CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", + CDEBUG(D_NET, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", gateway_nalid, gateway_nid, lo_nid, hi_nid); - LASSERT(lo_nid <= hi_nid); + if (gateway_nalid == PTL_NID_ANY || + lo_nid == PTL_NID_ANY || + hi_nid == PTL_NID_ANY || + lo_nid > hi_nid) + return (-EINVAL); + + PORTAL_ALLOC (ge, sizeof (*ge)); + if (ge == NULL) + return (-ENOMEM); + + ge->kpge_nalid = gateway_nalid; + ge->kpge_nid = gateway_nid; + ge->kpge_alive = 1; + ge->kpge_timestamp = 0; + ge->kpge_refcount = 0; + atomic_set (&ge->kpge_weight, 0); PORTAL_ALLOC (re, sizeof (*re)); if (re == NULL) return (-ENOMEM); - re->kpre_gateway_nalid = gateway_nalid; - re->kpre_gateway_nid = gateway_nid; re->kpre_lo_nid = lo_nid; re->kpre_hi_nid = hi_nid; LASSERT(!in_interrupt()); write_lock_irqsave (&kpr_rwlock, flags); - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t, - kpre_list); - - if (re->kpre_lo_nid > re2->kpre_hi_nid || - re->kpre_hi_nid < re2->kpre_lo_nid) - continue; + list_for_each (e, &kpr_gateways) { + kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, + kpge_list); + + if (ge2->kpge_nalid == gateway_nalid && + ge2->kpge_nid == gateway_nid) { + PORTAL_FREE (ge, sizeof (*ge)); + ge = ge2; + dup = 1; + break; + } + } - CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]" - "to ["LPX64" - "LPX64"]\n", - re->kpre_lo_nid, re->kpre_hi_nid, - re2->kpre_lo_nid, re2->kpre_hi_nid); + if (!dup) { + /* Adding a new gateway... */ + + list_add (&ge->kpge_list, &kpr_gateways); - write_unlock_irqrestore (&kpr_rwlock, flags); + /* ...zero all gateway weights so this one doesn't have to + * play catch-up */ - PORTAL_FREE (re, sizeof (*re)); - return (-EINVAL); + list_for_each (e, &kpr_gateways) { + kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, + kpge_list); + atomic_set (&ge2->kpge_weight, 0); + } + } + re->kpre_gateway = ge; + ge->kpge_refcount++; list_add (&re->kpre_list, &kpr_routes); write_unlock_irqrestore (&kpr_rwlock, flags); @@ -343,49 +622,82 @@ kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, } int -kpr_del_route (ptl_nid_t nid) +kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid, + int alive, time_t when) +{ + return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when)); +} + +int +kpr_del_route (int gw_nalid, ptl_nid_t gw_nid, + ptl_nid_t lo, ptl_nid_t hi) { + int specific = (lo != PTL_NID_ANY); unsigned long flags; + int rc = -ENOENT; struct list_head *e; + struct list_head *n; - CDEBUG(D_OTHER, "Del route "LPX64"\n", nid); + CDEBUG(D_NET, "Del route [%d] "LPX64" : "LPX64" - "LPX64"\n", + gw_nalid, gw_nid, lo, hi); LASSERT(!in_interrupt()); + + /* NB Caller may specify either all routes via the given gateway + * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are + * actual NIDs) */ + + if (specific ? (hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY)) + return (-EINVAL); + write_lock_irqsave(&kpr_rwlock, flags); - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + list_for_each_safe (e, n, &kpr_routes) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, kpre_list); - - if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid) + kpr_gateway_entry_t *ge = re->kpre_gateway; + + if (ge->kpge_nalid != gw_nalid || + ge->kpge_nid != gw_nid || + (specific && + (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid))) continue; - list_del (&re->kpre_list); - write_unlock_irqrestore(&kpr_rwlock, flags); + rc = 0; + if (--ge->kpge_refcount == 0) { + list_del (&ge->kpge_list); + PORTAL_FREE (ge, sizeof (*ge)); + } + + list_del (&re->kpre_list); PORTAL_FREE(re, sizeof (*re)); - return (0); + + if (specific) + break; } write_unlock_irqrestore(&kpr_rwlock, flags); - return (-ENOENT); + return (rc); } int -kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid) +kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive) { struct list_head *e; read_lock(&kpr_rwlock); for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, - kpre_list); - + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + kpr_gateway_entry_t *ge = re->kpre_gateway; + if (idx-- == 0) { - *gateway_nalid = re->kpre_gateway_nalid; - *gateway_nid = re->kpre_gateway_nid; + *gateway_nalid = ge->kpge_nalid; + *gateway_nid = ge->kpge_nid; + *alive = ge->kpge_alive; *lo_nid = re->kpre_lo_nid; *hi_nid = re->kpre_hi_nid; diff --git a/lnet/router/router.h b/lnet/router/router.h index b8c3bec4e83d001aa2888109092a4101917bf892..c5cc1d3efdfb4a9f8595b9f68db6d4a0942f7b34 100644 --- a/lnet/router/router.h +++ b/lnet/router/router.h @@ -48,19 +48,42 @@ typedef struct int kpne_shutdown; } kpr_nal_entry_t; +typedef struct +{ + struct list_head kpge_list; + atomic_t kpge_weight; + time_t kpge_timestamp; + int kpge_alive; + int kpge_nalid; + int kpge_refcount; + ptl_nid_t kpge_nid; +} kpr_gateway_entry_t; + typedef struct { struct list_head kpre_list; - int kpre_gateway_nalid; - ptl_nid_t kpre_gateway_nid; + kpr_gateway_entry_t *kpre_gateway; ptl_nid_t kpre_lo_nid; ptl_nid_t kpre_hi_nid; } kpr_route_entry_t; +typedef struct +{ + struct tq_struct kpru_tq; + int kpru_nal_id; + ptl_nid_t kpru_nid; + int kpru_alive; + time_t kpru_when; +} kpr_upcall_t; + extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp); -extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp); +extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, + ptl_nid_t *gateway_nidp); +extern kpr_nal_entry_t *kpr_find_nal_entry_locked (int nal_id); extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd); extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error); +extern void kpr_nal_notify (void *arg, ptl_nid_t peer, + int alive, time_t when); extern void kpr_shutdown_nal (void *arg); extern void kpr_deregister_nal (void *arg); @@ -69,9 +92,12 @@ extern void kpr_proc_fini (void); extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, ptl_nid_t hi_nid); -extern int kpr_del_route (ptl_nid_t nid); +extern int kpr_del_route (int gw_nal, ptl_nid_t gw_nid, + ptl_nid_t lo, ptl_nid_t hi); extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid); + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive); +extern int kpr_sys_notify (int gw_nalid, ptl_nid_t gw_nid, + int alive, time_t when); extern unsigned long long kpr_fwd_bytes; extern unsigned long kpr_fwd_packets; diff --git a/lnet/utils/parser.c b/lnet/utils/parser.c index 4d93645b164d3931285e29a87b83e0b43b3658eb..eccf50704800952aaf7a9f14e3bdabf41d952997 100644 --- a/lnet/utils/parser.c +++ b/lnet/utils/parser.c @@ -676,6 +676,7 @@ int Parser_bool (int *b, char *str) { if (!strcasecmp (str, "no") || !strcasecmp (str, "n") || !strcasecmp (str, "off") || + !strcasecmp (str, "down") || !strcasecmp (str, "disable")) { *b = 0; @@ -685,6 +686,7 @@ int Parser_bool (int *b, char *str) { if (!strcasecmp (str, "yes") || !strcasecmp (str, "y") || !strcasecmp (str, "on") || + !strcasecmp (str, "up") || !strcasecmp (str, "enable")) { *b = 1; diff --git a/lnet/utils/portals.c b/lnet/utils/portals.c index 4a05234e9a1838b827ca22baf08763a6415c1eeb..5fe1777d64dba9647c188ee5f3e48b1a8f5223ab 100644 --- a/lnet/utils/portals.c +++ b/lnet/utils/portals.c @@ -60,6 +60,7 @@ typedef struct } name2num_t; static name2num_t nalnames[] = { + {"any", 0}, {"tcp", SOCKNAL}, {"toe", TOENAL}, {"elan", QSWNAL}, @@ -95,7 +96,7 @@ ptl_name2nal (char *str) { name2num_t *e = name2num_lookup_name (nalnames, str); - return ((e == NULL) ? 0 : e->num); + return ((e == NULL) ? -1 : e->num); } static char * @@ -127,6 +128,49 @@ ptl_gethostbyname(char * hname) { return he; } +int +ptl_parse_port (int *port, char *str) +{ + char *end; + + *port = strtol (str, &end, 0); + + if (*end == 0 && /* parsed whole string */ + *port > 0 && *port < 65536) /* minimal sanity check */ + return (0); + + return (-1); +} + +int +ptl_parse_time (time_t *t, char *str) +{ + char *end; + int n; + struct tm tm; + + *t = strtol (str, &end, 0); + if (*end == 0) /* parsed whole string */ + return (0); + + memset (&tm, 0, sizeof (tm)); + n = sscanf (str, "%d-%d-%d %d:%d:%d", + &tm.tm_year, &tm.tm_mon, &tm.tm_mday, + &tm.tm_hour, &tm.tm_min, &tm.tm_sec); + if (n != 6) + return (-1); + + tm.tm_mon--; /* convert to 0 == Jan */ + tm.tm_year -= 1900; /* y2k quirk */ + tm.tm_isdst = -1; /* dunno if it's daylight savings... */ + + *t = mktime (&tm); + if (*t == (time_t)-1) + return (-1); + + return (0); +} + int ptl_parse_ipaddr (__u32 *ipaddrp, char *str) { @@ -220,21 +264,29 @@ ptl_nid2str (char *buffer, ptl_nid_t nid) if (he != NULL) strcpy (buffer, he->h_name); else - sprintf (buffer, "0x"LPX64, nid); + sprintf (buffer, LPX64, nid); return (buffer); } -int g_nal_is_compatible (char *cmd, ...) +int g_nal_is_set () { - va_list ap; - int nal; - if (g_nal == 0) { fprintf (stderr, "Error: you must run the 'network' command first.\n"); return (0); } - + + return (1); +} + +int g_nal_is_compatible (char *cmd, ...) +{ + va_list ap; + int nal; + + if (!g_nal_is_set ()) + return (0); + va_start (ap, cmd); do { @@ -320,7 +372,7 @@ int jt_ptl_network(int argc, char **argv) int nal; if (argc == 2 && - (nal = ptl_name2nal (argv[1])) != 0) { + (nal = ptl_name2nal (argv[1])) >= 0) { g_nal = nal; return (0); } @@ -353,12 +405,14 @@ jt_ptl_print_autoconnects (int argc, char **argv) if (rc != 0) break; - printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s affinity %s share %d\n", + printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s " + "affinity %s eager %s share %d\n", data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer), data.ioc_misc, data.ioc_count, data.ioc_size, (data.ioc_flags & 1) ? "on" : "off", (data.ioc_flags & 2) ? "on" : "off", (data.ioc_flags & 4) ? "on" : "off", + (data.ioc_flags & 8) ? "on" : "off", data.ioc_wait); } @@ -377,10 +431,11 @@ jt_ptl_add_autoconnect (int argc, char **argv) int xchange_nids = 0; int irq_affinity = 0; int share = 0; + int eager = 0; int rc; if (argc < 4 || argc > 5) { - fprintf (stderr, "usage: %s nid ipaddr port [ixs]\n", argv[0]); + fprintf (stderr, "usage: %s nid ipaddr port [ixse]\n", argv[0]); return 0; } @@ -398,8 +453,11 @@ jt_ptl_add_autoconnect (int argc, char **argv) return -1; } - port = atol (argv[3]); - + if (ptl_parse_port (&port, argv[3]) != 0) { + fprintf (stderr, "Can't parse port: %s\n", argv[3]); + return -1; + } + if (argc > 4) { char *opts = argv[4]; @@ -414,6 +472,9 @@ jt_ptl_add_autoconnect (int argc, char **argv) case 's': share = 1; break; + case 'e': + eager = 1; + break; default: fprintf (stderr, "Can't parse options: %s\n", argv[4]); @@ -429,10 +490,11 @@ jt_ptl_add_autoconnect (int argc, char **argv) data.ioc_misc = port; /* only passing one buffer size! */ data.ioc_size = MAX (g_socket_rxmem, g_socket_txmem); - data.ioc_flags = (g_socket_nonagle ? 1 : 0) | - (xchange_nids ? 2 : 0) | - (irq_affinity ? 4 : 0) | - (share ? 8 : 0); + data.ioc_flags = (g_socket_nonagle ? 0x01 : 0) | + (xchange_nids ? 0x02 : 0) | + (irq_affinity ? 0x04 : 0) | + (share ? 0x08 : 0) | + (eager ? 0x10 : 0); rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); if (rc != 0) { @@ -532,7 +594,7 @@ jt_ptl_print_connections (int argc, char **argv) if (rc != 0) break; - printf (LPD64"@%s:%d\n", + printf (LPX64"@%s:%d\n", data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer), data.ioc_misc); @@ -646,7 +708,11 @@ int jt_ptl_connect(int argc, char **argv) return -1; } - port = atol(argv[2]); + if (ptl_parse_port (&port, argv[2]) != 0) { + fprintf (stderr, "Can't parse port: %s\n", argv[2]); + return -1; + } + if (argc > 3) for (flag = argv[3]; *flag != 0; flag++) switch (*flag) @@ -902,11 +968,8 @@ int jt_ptl_ping(int argc, char **argv) return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return -1; - } if (ptl_parse_nid (&nid, argv[1]) != 0) { @@ -957,11 +1020,9 @@ int jt_ptl_shownid(int argc, char **argv) return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command first\n"); + if (!g_nal_is_set()) return -1; - } - + PORTAL_IOC_INIT (data); data.ioc_nal = g_nal; rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); @@ -987,11 +1048,8 @@ int jt_ptl_mynid(int argc, char **argv) return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return -1; - } if (argc >= 2) nidstr = argv[1]; @@ -1037,11 +1095,8 @@ jt_ptl_fail_nid (int argc, char **argv) return (0); } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return (-1); - } if (!strcmp (argv[1], "_all_")) nid = PTL_NID_ANY; @@ -1120,7 +1175,7 @@ jt_ptl_nagle (int argc, char **argv) if (Parser_bool (&enable, argv[1]) != 0) { fprintf (stderr, "Can't parse boolean %s\n", argv[1]); - return (0); + return (-1); } g_socket_nonagle = !enable; } @@ -1143,11 +1198,8 @@ jt_ptl_add_route (int argc, char **argv) return (0); } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return (-1); - } if (ptl_parse_nid (&gateway_nid, argv[1]) != 0) { @@ -1190,6 +1242,8 @@ jt_ptl_del_route (int argc, char **argv) { struct portal_ioctl_data data; ptl_nid_t nid; + ptl_nid_t nid1 = PTL_NID_ANY; + ptl_nid_t nid2 = PTL_NID_ANY; int rc; if (argc < 2) @@ -1198,14 +1252,43 @@ jt_ptl_del_route (int argc, char **argv) return (0); } + if (!g_nal_is_set()) + return (-1); + if (ptl_parse_nid (&nid, argv[1]) != 0) { - fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); + fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); return (-1); } + if (argc >= 3 && + ptl_parse_nid (&nid1, argv[2]) != 0) + { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[2]); + return (-1); + } + + if (argc < 4) { + nid2 = nid1; + } else { + if (ptl_parse_nid (&nid2, argv[3]) != 0) { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[3]); + return (-1); + } + + if (nid1 > nid2) { + ptl_nid_t tmp = nid1; + + nid1 = nid2; + nid2 = tmp; + } + } + PORTAL_IOC_INIT(data); + data.ioc_nal = g_nal; data.ioc_nid = nid; + data.ioc_nid2 = nid1; + data.ioc_nid3 = nid2; rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data); if (rc != 0) @@ -1217,6 +1300,67 @@ jt_ptl_del_route (int argc, char **argv) return (0); } +int +jt_ptl_notify_router (int argc, char **argv) +{ + struct portal_ioctl_data data; + int enable; + ptl_nid_t nid; + int rc; + struct timeval now; + time_t when; + + if (argc < 3) + { + fprintf (stderr, "usage: %s targetNID <up/down> [<time>]\n", + argv[0]); + return (0); + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); + return (-1); + } + + if (Parser_bool (&enable, argv[2]) != 0) { + fprintf (stderr, "Can't parse boolean %s\n", argv[2]); + return (-1); + } + + gettimeofday(&now, NULL); + + if (argc < 4) { + when = now.tv_sec; + } else if (ptl_parse_time (&when, argv[3]) != 0) { + fprintf(stderr, "Can't parse time %s\n" + "Please specify either 'YYYY-MM-DD HH:MM:SS'\n" + "or an absolute unix time in seconds\n", argv[3]); + return (-1); + } else if (when > now.tv_sec) { + fprintf (stderr, "%s specifies a time in the future\n", + argv[3]); + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nal = g_nal; + data.ioc_nid = nid; + data.ioc_flags = enable; + /* Yeuch; 'cept I need a __u64 on 64 bit machines... */ + data.ioc_nid3 = (__u64)when; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NOTIFY_ROUTER, &data); + if (rc != 0) + { + fprintf (stderr, "IOC_PORTAL_NOTIFY_ROUTER ("LPX64") failed: %s\n", + nid, strerror (errno)); + return (-1); + } + + return (0); +} + int jt_ptl_print_routes (int argc, char **argv) { @@ -1228,8 +1372,8 @@ jt_ptl_print_routes (int argc, char **argv) ptl_nid_t gateway_nid; ptl_nid_t nid1; ptl_nid_t nid2; - - + int alive; + for (index = 0;;index++) { PORTAL_IOC_INIT(data); @@ -1243,13 +1387,332 @@ jt_ptl_print_routes (int argc, char **argv) gateway_nid = data.ioc_nid; nid1 = data.ioc_nid2; nid2 = data.ioc_nid3; - - printf ("%8s %18s : %s - %s\n", + alive = data.ioc_flags; + + printf ("%8s %18s : %s - %s, %s\n", nal2name (gateway_nal), ptl_nid2str (buffer[0], gateway_nid), ptl_nid2str (buffer[1], nid1), - ptl_nid2str (buffer[2], nid2)); + ptl_nid2str (buffer[2], nid2), + alive ? "up" : "down"); } return (0); } +static int +lwt_control(int enable, int clear) +{ + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + data.ioc_flags = enable; + data.ioc_misc = clear; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_CONTROL, &data); + if (rc == 0) + return (0); + + fprintf(stderr, "IOC_PORTAL_LWT_CONTROL failed: %s\n", + strerror(errno)); + return (-1); +} + +static int +lwt_snapshot(int *ncpu, int *totalsize, lwt_event_t *events, int size) +{ + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + data.ioc_pbuf1 = (char *)events; + data.ioc_plen1 = size; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_SNAPSHOT, &data); + if (rc != 0) { + fprintf(stderr, "IOC_PORTAL_LWT_SNAPSHOT failed: %s\n", + strerror(errno)); + return (-1); + } + + LASSERT (data.ioc_count != 0); + LASSERT (data.ioc_misc != 0); + + if (ncpu != NULL) + *ncpu = data.ioc_count; + + if (totalsize != NULL) + *totalsize = data.ioc_misc; + + return (0); +} + +static char * +lwt_get_string(char *kstr) +{ + char *ustr; + struct portal_ioctl_data data; + int size; + int rc; + + /* FIXME: this could maintain a symbol table since we expect to be + * looking up the same strings all the time... */ + + PORTAL_IOC_INIT(data); + data.ioc_pbuf1 = kstr; + data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */ + data.ioc_pbuf2 = NULL; + data.ioc_plen2 = 0; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data); + if (rc != 0) { + fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n", + strerror(errno)); + return (NULL); + } + + size = data.ioc_count; + ustr = (char *)malloc(size); + if (ustr == NULL) { + fprintf(stderr, "Can't allocate string storage of size %d\n", + size); + return (NULL); + } + + PORTAL_IOC_INIT(data); + data.ioc_pbuf1 = kstr; + data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */ + data.ioc_pbuf2 = ustr; + data.ioc_plen2 = size; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data); + if (rc != 0) { + fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n", + strerror(errno)); + return (NULL); + } + + LASSERT(strlen(ustr) == size - 1); + return (ustr); +} + +static void +lwt_put_string(char *ustr) +{ + free(ustr); +} + +static int +lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e) +{ + char whenstr[32]; + char *where = lwt_get_string(e->lwte_where); + + if (where == NULL) + return (-1); + + sprintf(whenstr, LPD64, e->lwte_when - t0); + + fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n", + e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4, + (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0), + (t0 == e->lwte_when) ? 0.0 : (e->lwte_when - tlast) / mhz, + where); + + lwt_put_string(where); + + return (0); +} + +double +get_cycles_per_usec () +{ + FILE *f = fopen ("/proc/cpuinfo", "r"); + double mhz; + char line[64]; + + if (f != NULL) { + while (fgets (line, sizeof (line), f) != NULL) + if (sscanf (line, "cpu MHz : %lf", &mhz) == 1) { + fclose (f); + return (mhz); + } + fclose (f); + } + + fprintf (stderr, "Can't read/parse /proc/cpuinfo\n"); + return (1000.0); +} + +int +jt_ptl_lwt(int argc, char **argv) +{ +#define MAX_CPUS 8 + int ncpus; + int totalspace; + int nevents_per_cpu; + lwt_event_t *events; + lwt_event_t *cpu_event[MAX_CPUS + 1]; + lwt_event_t *next_event[MAX_CPUS]; + lwt_event_t *first_event[MAX_CPUS]; + int cpu; + lwt_event_t *e; + int rc; + int i; + double mhz; + cycles_t t0; + cycles_t tlast; + FILE *f = stdout; + + if (argc < 2 || + (strcmp(argv[1], "start") && + strcmp(argv[1], "stop"))) { + fprintf(stderr, + "usage: %s start\n" + " %s stop [fname]\n", argv[0], argv[0]); + return (-1); + } + + if (!strcmp(argv[1], "start")) { + /* disable */ + if (lwt_control(0, 0) != 0) + return (-1); + + /* clear */ + if (lwt_control(0, 1) != 0) + return (-1); + + /* enable */ + if (lwt_control(1, 0) != 0) + return (-1); + + return (0); + } + + if (lwt_snapshot(&ncpus, &totalspace, NULL, 0) != 0) + return (-1); + + if (ncpus > MAX_CPUS) { + fprintf(stderr, "Too many cpus: %d (%d)\n", ncpus, MAX_CPUS); + return (-1); + } + + events = (lwt_event_t *)malloc(totalspace); + if (events == NULL) { + fprintf(stderr, "Can't allocate %d\n", totalspace); + return (-1); + } + + if (lwt_control(0, 0) != 0) { /* disable */ + free(events); + return (-1); + } + + if (lwt_snapshot(NULL, NULL, events, totalspace)) { + free(events); + return (-1); + } + + if (argc > 2) { + f = fopen (argv[2], "w"); + if (f == NULL) { + fprintf(stderr, "Can't open %s for writing: %s\n", argv[2], strerror (errno)); + free(events); + return (-1); + } + } + + mhz = get_cycles_per_usec(); + + /* carve events into per-cpu slices */ + nevents_per_cpu = totalspace / (ncpus * sizeof(lwt_event_t)); + for (cpu = 0; cpu <= ncpus; cpu++) + cpu_event[cpu] = &events[cpu * nevents_per_cpu]; + + /* find the earliest event on each cpu */ + for (cpu = 0; cpu < ncpus; cpu++) { + first_event[cpu] = NULL; + + for (e = cpu_event[cpu]; e < cpu_event[cpu + 1]; e++) { + + if (e->lwte_where == NULL) /* not an event */ + continue; + + if (first_event[cpu] == NULL || + first_event[cpu]->lwte_when > e->lwte_when) + first_event[cpu] = e; + } + + next_event[cpu] = first_event[cpu]; + } + + t0 = tlast = 0; + for (cpu = 0; cpu < ncpus; cpu++) { + e = first_event[cpu]; + if (e == NULL) /* no events this cpu */ + continue; + + if (e == cpu_event[cpu]) + e = cpu_event[cpu + 1] - 1; + else + e = e - 1; + + /* If there's an event immediately before the first one, this + * cpu wrapped its event buffer */ + if (e->lwte_where == NULL) + continue; + + /* We should only start outputting events from the most recent + * first event in any wrapped cpu. Events before this time on + * other cpus won't have any events from this CPU to interleave + * with. */ + if (t0 < first_event[cpu]->lwte_when) + t0 = first_event[cpu]->lwte_when; + } + + for (;;) { + /* find which cpu has the next event */ + cpu = -1; + for (i = 0; i < ncpus; i++) { + + if (next_event[i] == NULL) /* this cpu exhausted */ + continue; + + if (cpu < 0 || + next_event[i]->lwte_when < next_event[cpu]->lwte_when) + cpu = i; + } + + if (cpu < 0) /* all cpus exhausted */ + break; + + if (t0 == 0) { + /* no wrapped cpus and this is he first ever event */ + t0 = next_event[cpu]->lwte_when; + } + + if (t0 <= next_event[cpu]->lwte_when) { + /* on or after the first event */ + rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]); + if (rc != 0) + break; + } + + tlast = next_event[cpu]->lwte_when; + + next_event[cpu]++; + if (next_event[cpu] == cpu_event[cpu + 1]) + next_event[cpu] = cpu_event[cpu]; + + if (next_event[cpu]->lwte_where == NULL || + next_event[cpu] == first_event[cpu]) + next_event[cpu] = NULL; + } + + if (f != stdout) + fclose(f); + + free(events); + return (0); +#undef MAX_CPUS +} diff --git a/lnet/utils/ptlctl.c b/lnet/utils/ptlctl.c index c083e483366f1dddedd3e30f97d7a02c963604ab..1a8e637291ee307f49aa58533029d5da234f7f51 100644 --- a/lnet/utils/ptlctl.c +++ b/lnet/utils/ptlctl.c @@ -31,7 +31,7 @@ command_t list[] = { {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, {"print_autoconns", jt_ptl_print_autoconnects, 0, "print autoconnect entries (no args)"}, - {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixs])"}, + {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixse])"}, {"del_autoconn", jt_ptl_del_autoconnect, 0, "delete autoconnect entry (args: [nid] [host] [ks])"}, {"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"}, {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [xi])"}, @@ -41,8 +41,12 @@ command_t list[] = { {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, {"shownid", jt_ptl_shownid, 0, "print the local NID"}, {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, - {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, - {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID"}, + {"add_route", jt_ptl_add_route, 0, + "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, + {"del_route", jt_ptl_del_route, 0, + "delete all routes via a gateway from the routing table (args: gatewayNID"}, + {"set_route", jt_ptl_notify_router, 0, + "enable/disable a route in the routing table (args: gatewayNID up/down [time]"}, {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"}, {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"}, {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"}, diff --git a/lustre/portals/include/linux/kp30.h b/lustre/portals/include/linux/kp30.h index 750d16c06800cc390ef04200e6a1070aa810cea9..0a1cc57a78b4e8bec72c552a828ae9583d294ad9 100644 --- a/lustre/portals/include/linux/kp30.h +++ b/lustre/portals/include/linux/kp30.h @@ -371,12 +371,14 @@ typedef struct { } kpr_fwd_desc_t; typedef void (*kpr_fwd_t)(void *arg, kpr_fwd_desc_t *fwd); +typedef void (*kpr_notify_t)(void *arg, ptl_nid_t peer, int alive); /* NAL's routing interface (Kernel Portals Routing Nal Interface) */ typedef const struct { int kprni_nalid; /* NAL's id */ void *kprni_arg; /* Arg to pass when calling into NAL */ kpr_fwd_t kprni_fwd; /* NAL's forwarding entrypoint */ + kpr_notify_t kprni_notify; /* NAL's notification entrypoint */ } kpr_nal_interface_t; /* Router's routing interface (Kernel Portals Routing Router Interface) */ @@ -386,9 +388,10 @@ typedef const struct { int (*kprri_register) (kpr_nal_interface_t *nal_interface, void **router_arg); - /* ask the router to find a gateway that forwards to 'nid' and is a peer - * of the calling NAL */ - int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, + /* ask the router to find a gateway that forwards to 'nid' and is a + * peer of the calling NAL; assume caller will send 'nob' bytes of + * payload there */ + int (*kprri_lookup) (void *router_arg, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid); /* hand a packet over to the router for forwarding */ @@ -398,6 +401,10 @@ typedef const struct { void (*kprri_fwd_done) (void *router_arg, kpr_fwd_desc_t *fwd, int error); + /* notify the router about peer state */ + void (*kprri_notify) (void *router_arg, ptl_nid_t peer, + int alive, time_t when); + /* the calling NAL is shutting down */ void (*kprri_shutdown) (void *router_arg); @@ -416,10 +423,14 @@ typedef struct { typedef const struct { int (*kprci_add_route)(int gateway_nal, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, ptl_nid_t hi_nid); - int (*kprci_del_route)(ptl_nid_t nid); + int (*kprci_del_route)(int gateway_nal, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid); int (*kprci_get_route)(int index, int *gateway_nal, - ptl_nid_t *gateway, ptl_nid_t *lo_nid, - ptl_nid_t *hi_nid); + ptl_nid_t *gateway, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, + int *alive); + int (*kprci_notify)(int gateway_nal, ptl_nid_t gateway_nid, + int alive, time_t when); } kpr_control_interface_t; extern kpr_control_interface_t kpr_control_interface; @@ -449,12 +460,12 @@ kpr_routing (kpr_router_t *router) } static inline int -kpr_lookup (kpr_router_t *router, ptl_nid_t nid, ptl_nid_t *gateway_nid) +kpr_lookup (kpr_router_t *router, ptl_nid_t nid, int nob, ptl_nid_t *gateway_nid) { if (!kpr_routing (router)) - return (-EHOSTUNREACH); + return (-ENETUNREACH); - return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, + return (router->kpr_interface->kprri_lookup(router->kpr_arg, nid, nob, gateway_nid)); } @@ -476,7 +487,7 @@ static inline void kpr_fwd_start (kpr_router_t *router, kpr_fwd_desc_t *fwd) { if (!kpr_routing (router)) - fwd->kprfd_callback (fwd->kprfd_callback_arg, -EHOSTUNREACH); + fwd->kprfd_callback (fwd->kprfd_callback_arg, -ENETUNREACH); else router->kpr_interface->kprri_fwd_start (router->kpr_arg, fwd); } @@ -488,6 +499,16 @@ kpr_fwd_done (kpr_router_t *router, kpr_fwd_desc_t *fwd, int error) router->kpr_interface->kprri_fwd_done (router->kpr_arg, fwd, error); } +static inline void +kpr_notify (kpr_router_t *router, + ptl_nid_t peer, int alive, time_t when) +{ + if (!kpr_routing (router)) + return; + + router->kpr_interface->kprri_notify(router->kpr_arg, peer, alive, when); +} + static inline void kpr_shutdown (kpr_router_t *router) { @@ -554,6 +575,7 @@ extern struct prof_ent prof_ents[MAX_PROFS]; #endif /* PORTALS_PROFILING */ /* debug.c */ +void portals_run_upcall(char **argv); void portals_run_lbug_upcall(char * file, const char *fn, const int line); void portals_debug_dumplog(void); int portals_debug_init(unsigned long bufsize); @@ -622,6 +644,92 @@ extern void kportal_blockallsigs (void); # define CURRENT_TIME time(0) #endif +/******************************************************************************/ +/* Light-weight trace + * Support for temporary event tracing with minimal Heisenberg effect. */ +#define LWT_SUPPORT 1 + +typedef struct { + cycles_t lwte_when; + char *lwte_where; + void *lwte_task; + long lwte_p1; + long lwte_p2; + long lwte_p3; + long lwte_p4; +} lwt_event_t; + +#if LWT_SUPPORT +#ifdef __KERNEL__ +#define LWT_EVENTS_PER_PAGE (PAGE_SIZE / sizeof (lwt_event_t)) + +typedef struct _lwt_page { + struct list_head lwtp_list; + struct page *lwtp_page; + lwt_event_t *lwtp_events; +} lwt_page_t; + +typedef struct { + int lwtc_current_index; + lwt_page_t *lwtc_current_page; +} lwt_cpu_t; + +extern int lwt_enabled; +extern lwt_cpu_t lwt_cpus[]; + +extern int lwt_init (void); +extern void lwt_fini (void); +extern int lwt_lookup_string (int *size, char *knlptr, + char *usrptr, int usrsize); +extern int lwt_control (int enable, int clear); +extern int lwt_snapshot (int *ncpu, int *total_size, + void *user_ptr, int user_size); + +/* Note that we _don't_ define LWT_EVENT at all if LWT_SUPPORT isn't set. + * This stuff is meant for finding specific problems; it never stays in + * production code... */ + +#define LWTSTR(n) #n +#define LWTWHERE(f,l) f ":" LWTSTR(l) + +#define LWT_EVENT(p1, p2, p3, p4) \ +do { \ + unsigned long flags; \ + lwt_cpu_t *cpu; \ + lwt_page_t *p; \ + lwt_event_t *e; \ + \ + local_irq_save (flags); \ + \ + if (lwt_enabled) { \ + cpu = &lwt_cpus[smp_processor_id()]; \ + p = cpu->lwtc_current_page; \ + e = &p->lwtp_events[cpu->lwtc_current_index++]; \ + \ + if (cpu->lwtc_current_index >= LWT_EVENTS_PER_PAGE) { \ + cpu->lwtc_current_page = \ + list_entry (p->lwtp_list.next, \ + lwt_page_t, lwtp_list); \ + cpu->lwtc_current_index = 0; \ + } \ + \ + e->lwte_when = get_cycles(); \ + e->lwte_where = LWTWHERE(__FILE__,__LINE__); \ + e->lwte_task = current; \ + e->lwte_p1 = (long)(p1); \ + e->lwte_p2 = (long)(p2); \ + e->lwte_p3 = (long)(p3); \ + e->lwte_p4 = (long)(p4); \ + } \ + \ + local_irq_restore (flags); \ +} while (0) +#else /* __KERNEL__ */ +#define LWT_EVENT(p1,p2,p3,p4) /* no userland implementation yet */ +#endif /* __KERNEL__ */ +#endif /* LWT_SUPPORT */ + + #include <linux/portals_lib.h> /* @@ -856,8 +964,11 @@ static inline int portal_ioctl_getdata(char *buf, char *end, void *arg) #define IOC_PORTAL_GET_NID _IOWR('e', 39, long) #define IOC_PORTAL_FAIL_NID _IOWR('e', 40, long) #define IOC_PORTAL_SET_DAEMON _IOWR('e', 41, long) - -#define IOC_PORTAL_MAX_NR 41 +#define IOC_PORTAL_NOTIFY_ROUTER _IOWR('e', 42, long) +#define IOC_PORTAL_LWT_CONTROL _IOWR('e', 43, long) +#define IOC_PORTAL_LWT_SNAPSHOT _IOWR('e', 44, long) +#define IOC_PORTAL_LWT_LOOKUP_STRING _IOWR('e', 45, long) +#define IOC_PORTAL_MAX_NR 45 enum { QSWNAL = 1, diff --git a/lustre/portals/include/portals/ptlctl.h b/lustre/portals/include/portals/ptlctl.h index 827811127a21cc8a68e4f5073b3e50ed793b5a94..7763f1b548be7333ab117ae02733f89d8bc34ddc 100644 --- a/lustre/portals/include/portals/ptlctl.h +++ b/lustre/portals/include/portals/ptlctl.h @@ -54,8 +54,10 @@ int jt_ptl_txmem (int argc, char **argv); int jt_ptl_nagle (int argc, char **argv); int jt_ptl_add_route (int argc, char **argv); int jt_ptl_del_route (int argc, char **argv); +int jt_ptl_notify_router (int argc, char **argv); int jt_ptl_print_routes (int argc, char **argv); int jt_ptl_fail_nid (int argc, char **argv); +int jt_ptl_lwt(int argc, char **argv); int dbg_initialize(int argc, char **argv); int jt_dbg_filter(int argc, char **argv); diff --git a/lustre/portals/knals/qswnal/qswnal.c b/lustre/portals/knals/qswnal/qswnal.c index b5e1e3986b30664e6a4a7eff09da997288e32f76..0841d641dc3134709c69e280a35b9feb46f74913 100644 --- a/lustre/portals/knals/qswnal/qswnal.c +++ b/lustre/portals/knals/qswnal/qswnal.c @@ -32,6 +32,7 @@ kpr_nal_interface_t kqswnal_router_interface = { kprni_nalid: QSWNAL, kprni_arg: NULL, kprni_fwd: kqswnal_fwd_packet, + kprni_notify: NULL, /* we're connectionless */ }; diff --git a/lustre/portals/knals/qswnal/qswnal.h b/lustre/portals/knals/qswnal/qswnal.h index 294f3e5e6696d9610eae745583b026f4a187d931..0d8e2fa61d59de54787c60beab6f12e81d5aaef1 100644 --- a/lustre/portals/knals/qswnal/qswnal.h +++ b/lustre/portals/knals/qswnal/qswnal.h @@ -164,6 +164,7 @@ typedef struct void *ktx_args[2]; /* completion passthru */ E3_Addr ktx_ebuffer; /* elan address of ktx_buffer */ char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */ + unsigned long ktx_launchtime; /* when (in jiffies) the transmit was launched */ /* debug/info fields */ pid_t ktx_launcher; /* pid of launching process */ diff --git a/lustre/portals/knals/qswnal/qswnal_cb.c b/lustre/portals/knals/qswnal/qswnal_cb.c index 8d5f70b02b39f24145011850ef9cc8cc3846e456..99f299f47446e78cb5392de5d83116053a31ae4f 100644 --- a/lustre/portals/knals/qswnal/qswnal_cb.c +++ b/lustre/portals/knals/qswnal/qswnal_cb.c @@ -421,7 +421,9 @@ static void kqswnal_txhandler(EP_TXD *txd, void *arg, int status) { kqswnal_tx_t *ktx = (kqswnal_tx_t *)arg; - + struct timeval now; + time_t then; + LASSERT (txd != NULL); LASSERT (ktx != NULL); @@ -432,7 +434,15 @@ kqswnal_txhandler(EP_TXD *txd, void *arg, int status) if (status != EP_SUCCESS) { - CERROR ("kqswnal: Transmit failed with %d\n", status); + CERROR ("Tx completion to "LPX64" failed: %d\n", + ktx->ktx_nid, status); + + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - ktx->ktx_launchtime)/HZ; + + kpr_notify (&kqswnal_data.kqn_router, + ktx->ktx_nid, 0, then); + status = -EIO; } @@ -447,33 +457,40 @@ kqswnal_launch (kqswnal_tx_t *ktx) int dest = kqswnal_nid2elanid (ktx->ktx_nid); long flags; int rc; - + + ktx->ktx_launchtime = jiffies; + LASSERT (dest >= 0); /* must be a peer */ rc = ep_transmit_large(kqswnal_data.kqn_eptx, dest, ktx->ktx_port, attr, kqswnal_txhandler, ktx, ktx->ktx_iov, ktx->ktx_niov); - if (rc == 0) + switch (rc) { + case 0: /* success */ atomic_inc (&kqswnal_packets_launched); + return (0); - if (rc != ENOMEM) - return (rc); + case ENOMEM: /* can't allocate ep txd => queue for later */ + LASSERT (in_interrupt()); - /* can't allocate ep txd => queue for later */ + spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); - LASSERT (in_interrupt()); /* not called by thread (not looping) */ + list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); + if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) + wake_up (&kqswnal_data.kqn_sched_waitq); - spin_lock_irqsave (&kqswnal_data.kqn_sched_lock, flags); + spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + return (0); - list_add_tail (&ktx->ktx_delayed_list, &kqswnal_data.kqn_delayedtxds); - if (waitqueue_active (&kqswnal_data.kqn_sched_waitq)) - wake_up (&kqswnal_data.kqn_sched_waitq); - - spin_unlock_irqrestore (&kqswnal_data.kqn_sched_lock, flags); + default: /* fatal error */ + CERROR ("Tx to "LPX64" failed: %d\n", ktx->ktx_nid, rc); - return (0); + /* Tell router I think a node is down */ + kpr_notify (&kqswnal_data.kqn_router, ktx->ktx_nid, + 0, ktx->ktx_launchtime); + return (rc); + } } - static char * hdr_type_string (ptl_hdr_t *hdr) { @@ -584,7 +601,8 @@ kqswnal_sendmsg (nal_cb_t *nal, } if (kqswnal_nid2elanid (nid) < 0) { /* Can't send direct: find gateway? */ - rc = kpr_lookup (&kqswnal_data.kqn_router, nid, &gatewaynid); + rc = kpr_lookup (&kqswnal_data.kqn_router, nid, + sizeof (ptl_hdr_t) + payload_nob, &gatewaynid); if (rc != 0) { CERROR("Can't route to "LPX64": router error %d\n", nid, rc); @@ -678,7 +696,7 @@ kqswnal_sendmsg (nal_cb_t *nal, return (PTL_FAIL); } - CDEBUG(D_NET, "send to "LPSZ" bytes to "LPX64"\n", payload_nob, nid); + CDEBUG(D_NET, "sent "LPSZ" bytes to "LPX64"\n", payload_nob, nid); return (PTL_OK); } diff --git a/lustre/portals/knals/socknal/socknal.c b/lustre/portals/knals/socknal/socknal.c index e7232a05a91b9eec42339d1039a394f6cb7273c8..c844dd6547550597e2a0fd466d1ef3000568f6eb 100644 --- a/lustre/portals/knals/socknal/socknal.c +++ b/lustre/portals/knals/socknal/socknal.c @@ -37,8 +37,34 @@ kpr_nal_interface_t ksocknal_router_interface = { kprni_nalid: SOCKNAL, kprni_arg: &ksocknal_data, kprni_fwd: ksocknal_fwd_packet, + kprni_notify: ksocknal_notify, }; +#define SOCKNAL_SYSCTL 200 + +#define SOCKNAL_SYSCTL_TIMEOUT 1 +#define SOCKNAL_SYSCTL_EAGER_ACK 2 +#define SOCKNAL_SYSCTL_ZERO_COPY 3 + +static ctl_table ksocknal_ctl_table[] = { + {SOCKNAL_SYSCTL_TIMEOUT, "timeout", + &ksocknal_data.ksnd_io_timeout, sizeof (int), + 0644, NULL, &proc_dointvec}, + {SOCKNAL_SYSCTL_EAGER_ACK, "eager_ack", + &ksocknal_data.ksnd_eager_ack, sizeof (int), + 0644, NULL, &proc_dointvec}, +#if SOCKNAL_ZC + {SOCKNAL_SYSCTL_EAGER_ACK, "zero_copy", + &ksocknal_data.ksnd_zc_min_frag, sizeof (int), + 0644, NULL, &proc_dointvec}, +#endif + { 0 } +}; + +static ctl_table ksocknal_top_ctl_table[] = { + {SOCKNAL_SYSCTL, "socknal", NULL, 0, 0555, ksocknal_ctl_table}, + { 0 } +}; int ksocknal_api_forward(nal_t *nal, int id, void *args, size_t args_len, @@ -172,7 +198,7 @@ ksocknal_bind_irq (unsigned int irq) ksock_route_t * ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, - int irq_affinity, int xchange_nids, int nonagel) + int nonagel, int xchange_nids, int irq_affinity, int eager) { ksock_route_t *route; @@ -183,7 +209,7 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, atomic_set (&route->ksnr_refcount, 1); route->ksnr_sharecount = 0; route->ksnr_peer = NULL; - route->ksnr_timeout = jiffies_64; + route->ksnr_timeout = jiffies; route->ksnr_retry_interval = SOCKNAL_MIN_RECONNECT_INTERVAL; route->ksnr_ipaddr = ipaddr; route->ksnr_port = port; @@ -191,6 +217,7 @@ ksocknal_create_route (__u32 ipaddr, int port, int buffer_size, route->ksnr_irq_affinity = irq_affinity; route->ksnr_xchange_nids = xchange_nids; route->ksnr_nonagel = nonagel; + route->ksnr_eager = eager; route->ksnr_connecting = 0; route->ksnr_deleted = 0; route->ksnr_generation = 0; @@ -371,7 +398,8 @@ ksocknal_get_route_by_idx (int index) int ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, - int nonagle, int xchange_nids, int bind_irq, int share) + int nonagle, int xchange_nids, int bind_irq, + int share, int eager) { unsigned long flags; ksock_peer_t *peer; @@ -388,8 +416,8 @@ ksocknal_add_route (ptl_nid_t nid, __u32 ipaddr, int port, int bufnob, if (peer == NULL) return (-ENOMEM); - route = ksocknal_create_route (ipaddr, port, bufnob, - nonagle, xchange_nids, bind_irq); + route = ksocknal_create_route (ipaddr, port, bufnob, nonagle, + xchange_nids, bind_irq, eager); if (route == NULL) { ksocknal_put_peer (peer); return (-ENOMEM); @@ -454,7 +482,7 @@ ksocknal_del_route_locked (ksock_route_t *route, int share, int keep_conn) if (conn != NULL) { if (!keep_conn) - ksocknal_close_conn_locked (conn); + ksocknal_close_conn_locked (conn, 0); else { /* keeping the conn; just dissociate it and route... */ conn->ksnc_route = NULL; @@ -568,14 +596,12 @@ ksocknal_get_peer_addr (ksock_conn_t *conn) struct sockaddr_in sin; int len = sizeof (sin); int rc; - - rc = ksocknal_getconnsock (conn); - LASSERT (rc == 0); rc = conn->ksnc_sock->ops->getname (conn->ksnc_sock, (struct sockaddr *)&sin, &len, 2); + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); LASSERT (len <= sizeof (sin)); - ksocknal_putconnsock (conn); if (rc != 0) { CERROR ("Error %d getting sock peer IP\n", rc); @@ -590,12 +616,8 @@ unsigned int ksocknal_conn_irq (ksock_conn_t *conn) { int irq = 0; - int rc; struct dst_entry *dst; - rc = ksocknal_getconnsock (conn); - LASSERT (rc == 0); - dst = sk_dst_get (conn->ksnc_sock->sk); if (dst != NULL) { if (dst->dev != NULL) { @@ -608,7 +630,8 @@ ksocknal_conn_irq (ksock_conn_t *conn) dst_release (dst); } - ksocknal_putconnsock (conn); + /* Didn't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); return (irq); } @@ -660,11 +683,13 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, int rc; /* NB, sock has an associated file since (a) this connection might - * have been created in userland and (b) we need the refcounting so - * that we don't close the socket while I/O is being done on it. */ + * have been created in userland and (b) we need to refcount the + * socket so that we don't close it while I/O is being done on + * it, and sock->file has that pre-cooked... */ LASSERT (sock->file != NULL); + LASSERT (file_count(sock->file) > 0); - rc = ksocknal_set_linger (sock); + rc = ksocknal_setup_sock (sock); if (rc != 0) return (rc); @@ -696,9 +721,6 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, ksocknal_new_packet (conn, 0); INIT_LIST_HEAD (&conn->ksnc_tx_queue); -#if SOCKNAL_ZC - INIT_LIST_HEAD (&conn->ksnc_tx_pending); -#endif conn->ksnc_tx_ready = 0; conn->ksnc_tx_scheduled = 0; atomic_set (&conn->ksnc_tx_nob, 0); @@ -753,6 +775,8 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, conn->ksnc_peer = peer; atomic_inc (&peer->ksnp_refcount); + peer->ksnp_last_alive = jiffies; + peer->ksnp_error = 0; list_add (&conn->ksnc_list, &peer->ksnp_conns); atomic_inc (&conn->ksnc_refcount); @@ -797,7 +821,7 @@ ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, } void -ksocknal_close_conn_locked (ksock_conn_t *conn) +ksocknal_close_conn_locked (ksock_conn_t *conn, int error) { /* This just does the immmediate housekeeping, and queues the * connection for the reaper to terminate. @@ -805,6 +829,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) ksock_peer_t *peer = conn->ksnc_peer; ksock_route_t *route; + LASSERT (peer->ksnp_error == 0); LASSERT (!conn->ksnc_closing); conn->ksnc_closing = 1; atomic_inc (&ksocknal_data.ksnd_nclosing_conns); @@ -825,11 +850,16 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) /* ksnd_deathrow_conns takes over peer's ref */ list_del (&conn->ksnc_list); - if (list_empty (&peer->ksnp_conns) && - list_empty (&peer->ksnp_routes)) { - /* I've just closed last conn belonging to a - * non-autoconnecting peer */ - ksocknal_unlink_peer_locked (peer); + if (list_empty (&peer->ksnp_conns)) { + /* No more connections to this peer */ + + peer->ksnp_error = error; /* stash last conn close reason */ + + if (list_empty (&peer->ksnp_routes)) { + /* I've just closed last conn belonging to a + * non-autoconnecting peer */ + ksocknal_unlink_peer_locked (peer); + } } spin_lock (&ksocknal_data.ksnd_reaper_lock); @@ -842,7 +872,7 @@ ksocknal_close_conn_locked (ksock_conn_t *conn) } int -ksocknal_close_conn_unlocked (ksock_conn_t *conn) +ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why) { unsigned long flags; int did_it = 0; @@ -851,7 +881,7 @@ ksocknal_close_conn_unlocked (ksock_conn_t *conn) if (!conn->ksnc_closing) { did_it = 1; - ksocknal_close_conn_locked (conn); + ksocknal_close_conn_locked (conn, why); } write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); @@ -867,6 +897,10 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * ksnc_refcount will eventually hit zero, and then the reaper will * destroy it. */ unsigned long flags; + ksock_peer_t *peer = conn->ksnc_peer; + struct timeval now; + time_t then = 0; + int notify = 0; /* serialise with callbacks */ write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); @@ -886,6 +920,16 @@ ksocknal_terminate_conn (ksock_conn_t *conn) conn->ksnc_scheduler->kss_nconns--; + if (peer->ksnp_error != 0) { + /* peer's last conn closed in error */ + LASSERT (list_empty (&peer->ksnp_conns)); + + /* convert peer's last-known-alive timestamp from jiffies */ + do_gettimeofday (&now); + then = now.tv_sec - (jiffies - peer->ksnp_last_alive)/HZ; + notify = 1; + } + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); /* The socket is closed on the final put; either here, or in @@ -894,6 +938,10 @@ ksocknal_terminate_conn (ksock_conn_t *conn) * immediately, aborting anything buffered in it. Any hung * zero-copy transmits will therefore complete in finite time. */ ksocknal_putconnsock (conn); + + if (notify) + kpr_notify (&ksocknal_data.ksnd_router, peer->ksnp_nid, + 0, then); } void @@ -906,9 +954,7 @@ ksocknal_destroy_conn (ksock_conn_t *conn) LASSERT (conn->ksnc_route == NULL); LASSERT (!conn->ksnc_tx_scheduled); LASSERT (!conn->ksnc_rx_scheduled); -#if SOCKNAL_ZC - LASSERT (list_empty (&conn->ksnc_tx_pending)); -#endif + /* complete queued packets */ while (!list_empty (&conn->ksnc_tx_queue)) { ksock_tx_t *tx = list_entry (conn->ksnc_tx_queue.next, @@ -1010,7 +1056,7 @@ ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) continue; rc = 0; - ksocknal_close_conn_locked (conn); + ksocknal_close_conn_locked (conn, 0); } } } @@ -1020,6 +1066,24 @@ ksocknal_close_conn (ptl_nid_t nid, __u32 ipaddr) return (rc); } +void +ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive) +{ + /* The router is telling me she's been notified of a change in + * gateway state.... */ + + CDEBUG (D_NET, "gw "LPX64" %s\n", gw_nid, alive ? "up" : "down"); + + if (!alive) { + /* If the gateway crashed, close all open connections... */ + ksocknal_close_conn (gw_nid, 0); + return; + } + + /* ...otherwise do nothing. We can only establish new connections + * if we have autroutes, and these connect on demand. */ +} + #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) struct tcp_opt *sock2tcp_opt(struct sock *sk) { @@ -1177,7 +1241,8 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) data->ioc_wait = route->ksnr_sharecount; data->ioc_flags = (route->ksnr_nonagel ? 1 : 0) | (route->ksnr_xchange_nids ? 2 : 0) | - (route->ksnr_irq_affinity ? 4 : 0); + (route->ksnr_irq_affinity ? 4 : 0) | + (route->ksnr_eager ? 8 : 0); ksocknal_put_route (route); } break; @@ -1185,10 +1250,11 @@ ksocknal_cmd(struct portal_ioctl_data * data, void * private) case NAL_CMD_ADD_AUTOCONN: { rc = ksocknal_add_route (data->ioc_nid, data->ioc_id, data->ioc_misc, data->ioc_size, - (data->ioc_flags & 1) != 0, - (data->ioc_flags & 2) != 0, - (data->ioc_flags & 4) != 0, - (data->ioc_flags & 8) != 0); + (data->ioc_flags & 0x01) != 0, + (data->ioc_flags & 0x02) != 0, + (data->ioc_flags & 0x04) != 0, + (data->ioc_flags & 0x08) != 0, + (data->ioc_flags & 0x10) != 0); break; } case NAL_CMD_DEL_AUTOCONN: { @@ -1287,6 +1353,10 @@ ksocknal_module_fini (void) LASSERT (0); case SOCKNAL_INIT_ALL: +#if CONFIG_SYSCTL + if (ksocknal_data.ksnd_sysctl != NULL) + unregister_sysctl_table (ksocknal_data.ksnd_sysctl); +#endif kportal_nal_unregister(SOCKNAL); PORTAL_SYMBOL_UNREGISTER (ksocknal_ni); /* fall through */ @@ -1364,6 +1434,9 @@ ksocknal_module_init (void) /* packet descriptor must fit in a router descriptor's scratchpad */ LASSERT(sizeof (ksock_tx_t) <= sizeof (kprfd_scratch_t)); + /* the following must be sizeof(int) for proc_dointvec() */ + LASSERT(sizeof (ksocknal_data.ksnd_io_timeout) == sizeof (int)); + LASSERT(sizeof (ksocknal_data.ksnd_eager_ack) == sizeof (int)); LASSERT (ksocknal_data.ksnd_init == SOCKNAL_INIT_NOTHING); @@ -1379,6 +1452,12 @@ ksocknal_module_init (void) memset (&ksocknal_data, 0, sizeof (ksocknal_data)); /* zero pointers */ + ksocknal_data.ksnd_io_timeout = SOCKNAL_IO_TIMEOUT; + ksocknal_data.ksnd_eager_ack = SOCKNAL_EAGER_ACK; +#if SOCKNAL_ZC + ksocknal_data.ksnd_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; +#endif + ksocknal_data.ksnd_peer_hash_size = SOCKNAL_PEER_HASH_SIZE; PORTAL_ALLOC (ksocknal_data.ksnd_peers, sizeof (struct list_head) * ksocknal_data.ksnd_peer_hash_size); @@ -1561,9 +1640,13 @@ ksocknal_module_init (void) PORTAL_SYMBOL_REGISTER(ksocknal_ni); +#ifdef CONFIG_SYSCTL + /* Press on regardless even if registering sysctl doesn't work */ + ksocknal_data.ksnd_sysctl = register_sysctl_table (ksocknal_top_ctl_table, 0); +#endif /* flag everything initialised */ ksocknal_data.ksnd_init = SOCKNAL_INIT_ALL; - + printk(KERN_INFO "Routing socket NAL loaded (Routing %s, initial " "mem %d)\n", kpr_routing (&ksocknal_data.ksnd_router) ? diff --git a/lustre/portals/knals/socknal/socknal.h b/lustre/portals/knals/socknal/socknal.h index 69daa027f1870d289edda839f0bcffb9327047ab..abd8e7b718c211e15b35b6f23aafb48f82c2fd2c 100644 --- a/lustre/portals/knals/socknal/socknal.h +++ b/lustre/portals/knals/socknal/socknal.h @@ -48,6 +48,7 @@ #include <linux/stat.h> #include <linux/list.h> #include <linux/kmod.h> +#include <linux/sysctl.h> #include <asm/uaccess.h> #include <asm/segment.h> #include <asm/div64.h> @@ -68,7 +69,12 @@ #define SOCKNAL_MIN_RECONNECT_INTERVAL HZ /* first failed connection retry... */ #define SOCKNAL_MAX_RECONNECT_INTERVAL (60*HZ) /* ...exponentially increasing to this */ -#define SOCKNAL_IO_TIMEOUT (60*HZ) /* default comms timeout */ +/* default vals for runtime tunables */ +#define SOCKNAL_IO_TIMEOUT 50 /* default comms timeout (seconds) */ +#define SOCKNAL_EAGER_ACK 1 /* default eager ack (boolean) */ +#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ + +#define SOCKNAL_USE_KEEPALIVES 0 /* use tcp/ip keepalive? */ #define SOCKNAL_PEER_HASH_SIZE 101 /* # peer lists */ @@ -78,8 +84,6 @@ # define SOCKNAL_MAX_FWD_PAYLOAD (64<<10) /* biggest payload I can forward */ #endif -#define SOCKNAL_ZC_MIN_FRAG (2<<10) /* default smallest zerocopy fragment */ - #define SOCKNAL_NLTXS 128 /* # normal transmit messages */ #define SOCKNAL_NNBLK_LTXS 128 /* # transmit messages reserved if can't block */ @@ -95,10 +99,6 @@ #define SOCKNAL_TX_LOW_WATER(sk) (((sk)->sk_sndbuf*8)/10) -#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) -# define jiffies_64 jiffies -#endif - #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,72)) # define sk_data_ready data_ready # define sk_write_space write_space @@ -136,6 +136,12 @@ typedef struct { typedef struct { int ksnd_init; /* initialisation state */ + int ksnd_io_timeout; /* "stuck" socket timeout (seconds) */ + int ksnd_eager_ack; /* make TCP ack eagerly? */ +#if SOCKNAL_ZC + unsigned int ksnd_zc_min_frag; /* minimum zero copy frag size */ +#endif + struct ctl_table_header *ksnd_sysctl; /* sysctl interface */ rwlock_t ksnd_global_lock; /* stabilize peer/conn ops */ struct list_head *ksnd_peers; /* hash table of all my known peers */ @@ -205,7 +211,6 @@ struct ksock_route; /* forward ref */ typedef struct /* transmit packet */ { struct list_head tx_list; /* queue on conn for transmission etc */ - __u64 tx_deadline; /* when (in jiffies) tx times out */ char tx_isfwd; /* forwarding / sourced here */ int tx_nob; /* # packet bytes */ int tx_resid; /* residual bytes */ @@ -291,10 +296,11 @@ typedef struct ksock_conn __u32 ksnc_ipaddr; /* peer's IP */ int ksnc_port; /* peer's port */ int ksnc_closing; /* being shut down */ - + /* READER */ struct list_head ksnc_rx_list; /* where I enq waiting input or a forwarding descriptor */ - __u64 ksnc_rx_deadline; /* when receive times out */ + unsigned long ksnc_rx_deadline; /* when (in jiffies) receive times out */ + int ksnc_rx_started; /* started receiving a message */ int ksnc_rx_ready; /* data ready to read */ int ksnc_rx_scheduled; /* being progressed */ int ksnc_rx_state; /* what is being read */ @@ -311,9 +317,7 @@ typedef struct ksock_conn /* WRITER */ struct list_head ksnc_tx_list; /* where I enq waiting for output space */ struct list_head ksnc_tx_queue; /* packets waiting to be sent */ -#if SOCKNAL_ZC - struct list_head ksnc_tx_pending; /* zc packets pending callback */ -#endif + unsigned long ksnc_tx_deadline; /* when (in jiffies) tx times out */ atomic_t ksnc_tx_nob; /* # bytes queued */ int ksnc_tx_ready; /* write space */ int ksnc_tx_scheduled; /* being progressed */ @@ -326,7 +330,7 @@ typedef struct ksock_route struct ksock_peer *ksnr_peer; /* owning peer */ atomic_t ksnr_refcount; /* # users */ int ksnr_sharecount; /* lconf usage counter */ - __u64 ksnr_timeout; /* when reconnection can happen next */ + unsigned long ksnr_timeout; /* when (in jiffies) reconnection can happen next */ unsigned int ksnr_retry_interval; /* how long between retries */ __u32 ksnr_ipaddr; /* an IP address for this peer */ int ksnr_port; /* port to connect to */ @@ -334,8 +338,9 @@ typedef struct ksock_route unsigned int ksnr_irq_affinity:1; /* set affinity? */ unsigned int ksnr_xchange_nids:1; /* do hello protocol? */ unsigned int ksnr_nonagel:1; /* disable nagle? */ - unsigned int ksnr_connecting; /* autoconnect in progress? */ - unsigned int ksnr_deleted; /* been removed from peer? */ + unsigned int ksnr_eager:1; /* connect eagery? */ + unsigned int ksnr_connecting:1; /* autoconnect in progress? */ + unsigned int ksnr_deleted:1; /* been removed from peer? */ int ksnr_generation; /* connection incarnation # */ ksock_conn_t *ksnr_conn; /* NULL/active connection */ } ksock_route_t; @@ -346,13 +351,14 @@ typedef struct ksock_peer ptl_nid_t ksnp_nid; /* who's on the other end(s) */ atomic_t ksnp_refcount; /* # users */ int ksnp_closing; /* being closed */ + int ksnp_error; /* errno on closing last conn */ struct list_head ksnp_conns; /* all active connections */ struct list_head ksnp_routes; /* routes */ struct list_head ksnp_tx_queue; /* waiting packets */ + unsigned long ksnp_last_alive; /* when (in jiffies) I was last alive */ } ksock_peer_t; - extern nal_cb_t ksocknal_lib; extern ksock_nal_data_t ksocknal_data; @@ -393,8 +399,8 @@ extern int ksocknal_del_route (ptl_nid_t nid, __u32 ipaddr, int single, int keep_conn); extern int ksocknal_create_conn (ptl_nid_t nid, ksock_route_t *route, struct socket *sock, int bind_irq); -extern void ksocknal_close_conn_locked (ksock_conn_t *conn); -extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn); +extern void ksocknal_close_conn_locked (ksock_conn_t *conn, int why); +extern int ksocknal_close_conn_unlocked (ksock_conn_t *conn, int why); extern void ksocknal_terminate_conn (ksock_conn_t *conn); extern void ksocknal_destroy_conn (ksock_conn_t *conn); extern void ksocknal_put_conn (ksock_conn_t *conn); @@ -404,6 +410,7 @@ extern void ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn); extern void ksocknal_tx_done (ksock_tx_t *tx, int asynch); extern void ksocknal_fwd_packet (void *arg, kpr_fwd_desc_t *fwd); extern void ksocknal_fmb_callback (void *arg, int error); +extern void ksocknal_notify (void *arg, ptl_nid_t gw_nid, int alive); extern int ksocknal_thread_start (int (*fn)(void *arg), void *arg); extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); extern int ksocknal_scheduler (void *arg); @@ -411,4 +418,4 @@ extern void ksocknal_data_ready(struct sock *sk, int n); extern void ksocknal_write_space(struct sock *sk); extern int ksocknal_autoconnectd (void *arg); extern int ksocknal_reaper (void *arg); -extern int ksocknal_set_linger (struct socket *sock); +extern int ksocknal_setup_sock (struct socket *sock); diff --git a/lustre/portals/knals/socknal/socknal_cb.c b/lustre/portals/knals/socknal/socknal_cb.c index 656a0c5cce68f2c17a1312712182b1dd681595a2..f946d87438d40ae2c2df4f3b967389ed3b97e349 100644 --- a/lustre/portals/knals/socknal/socknal_cb.c +++ b/lustre/portals/knals/socknal/socknal_cb.c @@ -25,12 +25,6 @@ #include "socknal.h" -int ksocknal_io_timeout = SOCKNAL_IO_TIMEOUT; -#if SOCKNAL_ZC -int ksocknal_do_zc = 1; -int ksocknal_zc_min_frag = SOCKNAL_ZC_MIN_FRAG; -#endif - /* * LIB functions follow * @@ -225,7 +219,7 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) struct iovec *iov = tx->tx_iov; int fragsize = iov->iov_len; unsigned long vaddr = (unsigned long)iov->iov_base; - int more = !list_empty (&conn->ksnc_tx_queue) | + int more = (!list_empty (&conn->ksnc_tx_queue)) | (tx->tx_niov > 1) | (tx->tx_nkiov > 1); #if SOCKNAL_ZC @@ -241,10 +235,9 @@ ksocknal_send_iov (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (tx->tx_niov > 0); #if SOCKNAL_ZC - if (ksocknal_do_zc && + if (zcsize >= ksocknal_data.ksnd_zc_min_frag && (sock->sk->route_caps & NETIF_F_SG) && (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && - zcsize >= ksocknal_zc_min_frag && (page = ksocknal_kvaddr_to_page (vaddr)) != NULL) { CDEBUG(D_NET, "vaddr %p, page %p->%p + offset %x for %d\n", @@ -306,7 +299,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) int fragsize = kiov->kiov_len; struct page *page = kiov->kiov_page; int offset = kiov->kiov_offset; - int more = !list_empty (&conn->ksnc_tx_queue) | + int more = (!list_empty (&conn->ksnc_tx_queue)) | (tx->tx_nkiov > 1); int rc; @@ -318,10 +311,9 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) LASSERT (tx->tx_nkiov > 0); #if SOCKNAL_ZC - if (ksocknal_do_zc && + if (fragsize >= ksocknal_data.ksnd_zc_min_frag && (sock->sk->route_caps & NETIF_F_SG) && - (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)) && - fragsize >= ksocknal_zc_min_frag) { + (sock->sk->route_caps & (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM))) { CDEBUG(D_NET, "page %p + offset %x for %d\n", page, offset, fragsize); @@ -349,6 +341,7 @@ ksocknal_send_kiov (ksock_conn_t *conn, ksock_tx_t *tx) set_fs (KERNEL_DS); rc = sock_sendmsg(sock, &msg, fragsize); set_fs (oldmm); + kunmap (page); } @@ -410,6 +403,16 @@ ksocknal_sendmsg (ksock_conn_t *conn, ksock_tx_t *tx) break; } + /* Consider the connection alive since we managed to chuck + * more data into it. Really, we'd like to consider it + * alive only when the peer ACKs something, but + * write_space() only gets called back while SOCK_NOSPACE + * is set. Instead, we presume peer death has occurred if + * the socket doesn't drain within a timout */ + conn->ksnc_tx_deadline = jiffies + + ksocknal_data.ksnd_io_timeout * HZ; + conn->ksnc_peer->ksnp_last_alive = jiffies; + if (tx->tx_resid == 0) { /* sent everything */ rc = 0; break; @@ -420,6 +423,24 @@ ksocknal_sendmsg (ksock_conn_t *conn, ksock_tx_t *tx) RETURN (rc); } +void +ksocknal_eager_ack (ksock_conn_t *conn) +{ + int opt = 1; + mm_segment_t oldmm = get_fs(); + struct socket *sock = conn->ksnc_sock; + + /* Remind the socket to ACK eagerly. If I don't, the socket might + * think I'm about to send something it could piggy-back the ACK + * on, introducing delay in completing zero-copy sends in my + * peer. */ + + set_fs(KERNEL_DS); + sock->ops->setsockopt (sock, SOL_TCP, TCP_QUICKACK, + (char *)&opt, sizeof (opt)); + set_fs(oldmm); +} + int ksocknal_recv_iov (ksock_conn_t *conn) { @@ -453,6 +474,13 @@ ksocknal_recv_iov (ksock_conn_t *conn) if (rc <= 0) return (rc); + /* received something... */ + conn->ksnc_peer->ksnp_last_alive = jiffies; + conn->ksnc_rx_deadline = jiffies + + ksocknal_data.ksnd_io_timeout * HZ; + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + conn->ksnc_rx_nob_wanted -= rc; conn->ksnc_rx_nob_left -= rc; @@ -499,11 +527,19 @@ ksocknal_recv_kiov (ksock_conn_t *conn) rc = sock_recvmsg (conn->ksnc_sock, &msg, fragsize, MSG_DONTWAIT); /* NB this is just a boolean............................^ */ set_fs (oldmm); + kunmap (page); if (rc <= 0) return (rc); + /* received something... */ + conn->ksnc_peer->ksnp_last_alive = jiffies; + conn->ksnc_rx_deadline = jiffies + + ksocknal_data.ksnd_io_timeout * HZ; + mb(); /* order with setting rx_started */ + conn->ksnc_rx_started = 1; + conn->ksnc_rx_nob_wanted -= rc; conn->ksnc_rx_nob_left -= rc; @@ -541,20 +577,33 @@ ksocknal_recvmsg (ksock_conn_t *conn) rc = -ESHUTDOWN; break; } - + if (conn->ksnc_rx_niov != 0) rc = ksocknal_recv_iov (conn); else rc = ksocknal_recv_kiov (conn); - + if (rc <= 0) { /* error/EOF or partial receive */ - if (rc == -EAGAIN) + if (rc == -EAGAIN) { rc = 1; + } else if (rc == 0 && conn->ksnc_rx_started) { + /* EOF in the middle of a message */ + rc = -EPROTO; + } break; } + /* Completed a fragment */ + if (conn->ksnc_rx_nob_wanted == 0) { + /* Completed a message segment (header or payload) */ + if (ksocknal_data.ksnd_eager_ack && + (conn->ksnc_rx_state == SOCKNAL_RX_BODY || + conn->ksnc_rx_state == SOCKNAL_RX_BODY_FWD)) { + /* Remind the socket to ack eagerly... */ + ksocknal_eager_ack(conn); + } rc = 1; break; } @@ -577,7 +626,6 @@ ksocknal_zc_callback (zccd_t *zcd) spin_lock_irqsave (&sched->kss_lock, flags); - list_del (&tx->tx_list); /* remove from kss_zctxpending_list */ list_add_tail (&tx->tx_list, &sched->kss_zctxdone_list); if (waitqueue_active (&sched->kss_waitq)) wake_up (&sched->kss_waitq); @@ -628,23 +676,12 @@ ksocknal_tx_launched (ksock_tx_t *tx) { #if SOCKNAL_ZC if (atomic_read (&tx->tx_zccd.zccd_count) != 1) { - unsigned long flags; ksock_conn_t *conn = tx->tx_conn; - ksock_sched_t *sched = conn->ksnc_scheduler; /* zccd skbufs are still in-flight. First take a ref on * conn, so it hangs about for ksocknal_tx_done... */ atomic_inc (&conn->ksnc_refcount); - /* Stash it for timeout... - * NB We have to hold a lock to stash the tx, and we have - * stash it before we zcc_put(), but we have to _not_ hold - * this lock when we zcc_put(), otherwise we could deadlock - * if it turns out to be the last put. Aaaaarrrrggghhh! */ - spin_lock_irqsave (&sched->kss_lock, flags); - list_add_tail (&tx->tx_list, &conn->ksnc_tx_pending); - spin_unlock_irqrestore (&sched->kss_lock, flags); - /* ...then drop the initial ref on zccd, so the zero copy * callback can occur */ zccd_put (&tx->tx_zccd); @@ -659,10 +696,10 @@ ksocknal_tx_launched (ksock_tx_t *tx) void ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) { - ksock_conn_t *conn; - ksock_tx_t *tx; - int rc; - + ksock_conn_t *conn; + ksock_tx_t *tx; + int rc; + LASSERT (!list_empty (&sched->kss_tx_conns)); conn = list_entry(sched->kss_tx_conns.next, ksock_conn_t, ksnc_tx_list); list_del (&conn->ksnc_tx_list); @@ -686,7 +723,7 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) CDEBUG (D_NET, "send(%d) %d\n", tx->tx_resid, rc); if (rc != 0) { - if (ksocknal_close_conn_unlocked (conn)) { + if (ksocknal_close_conn_unlocked (conn, rc)) { /* I'm the first to close */ CERROR ("[%p] Error %d on write to "LPX64" ip %08x:%d\n", conn, rc, conn->ksnc_peer->ksnp_nid, @@ -696,7 +733,6 @@ ksocknal_process_transmit (ksock_sched_t *sched, unsigned long *irq_flags) spin_lock_irqsave (&sched->kss_lock, *irq_flags); } else if (tx->tx_resid == 0) { - /* everything went; assume more can go, and avoid * write_space locking */ conn->ksnc_tx_ready = 1; @@ -763,7 +799,8 @@ ksocknal_find_target_peer_locked (ksock_tx_t *tx, ptl_nid_t nid) return (NULL); } - rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, &target_nid); + rc = kpr_lookup (&ksocknal_data.ksnd_router, nid, tx->tx_nob, + &target_nid); if (rc != 0) { CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); return (NULL); @@ -820,8 +857,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) #endif spin_lock_irqsave (&sched->kss_lock, flags); - - tx->tx_deadline = jiffies_64 + ksocknal_io_timeout; + list_add_tail (&tx->tx_list, &conn->ksnc_tx_queue); if (conn->ksnc_tx_ready && /* able to send */ @@ -839,7 +875,7 @@ ksocknal_queue_tx_locked (ksock_tx_t *tx, ksock_conn_t *conn) } ksock_route_t * -ksocknal_find_connectable_route_locked (ksock_peer_t *peer) +ksocknal_find_connectable_route_locked (ksock_peer_t *peer, int eager_only) { struct list_head *tmp; ksock_route_t *route; @@ -849,7 +885,8 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) if (route->ksnr_conn == NULL && /* not connected */ !route->ksnr_connecting && /* not connecting */ - route->ksnr_timeout <= jiffies_64) /* OK to retry */ + (!eager_only || route->ksnr_eager) && /* wants to connect */ + time_after_eq (jiffies, route->ksnr_timeout)) /* OK to retry */ return (route); } @@ -880,7 +917,7 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) ksock_conn_t *conn; ksock_route_t *route; rwlock_t *g_lock; - + /* Ensure the frags we've been given EXACTLY match the number of * bytes we want to send. Many TCP/IP stacks disregard any total * size parameters passed to them and just look at the frags. @@ -907,17 +944,19 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) return (PTL_FAIL); } - /* Any routes need to be connected? (need write lock if so) */ - if (ksocknal_find_connectable_route_locked (peer) == NULL) { + if (ksocknal_find_connectable_route_locked(peer, 1) == NULL) { conn = ksocknal_find_conn_locked (tx, peer); if (conn != NULL) { + /* I've got no unconnected autoconnect routes that + * need to be connected, and I do have an actual + * connection... */ ksocknal_queue_tx_locked (tx, conn); read_unlock (g_lock); return (PTL_OK); } } - /* need a write lock now to change peer state... */ + /* Making one or more connections; I'll need a write lock... */ atomic_inc (&peer->ksnp_refcount); /* +1 ref for me while I unlock */ read_unlock (g_lock); @@ -930,23 +969,37 @@ ksocknal_launch_packet (ksock_tx_t *tx, ptl_nid_t nid) } ksocknal_put_peer (peer); /* drop ref I got above */ - /* I may launch autoconnects, now we're write locked... */ - while ((route = ksocknal_find_connectable_route_locked (peer)) != NULL) + + for (;;) { + /* launch all eager autoconnections */ + route = ksocknal_find_connectable_route_locked (peer, 1); + if (route == NULL) + break; + ksocknal_launch_autoconnect_locked (route); + } conn = ksocknal_find_conn_locked (tx, peer); if (conn != NULL) { + /* Connection exists; queue message on it */ ksocknal_queue_tx_locked (tx, conn); write_unlock_irqrestore (g_lock, flags); return (PTL_OK); } - + if (ksocknal_find_connecting_route_locked (peer) == NULL) { - /* no routes actually connecting now */ - write_unlock_irqrestore (g_lock, flags); - return (PTL_FAIL); + /* no autoconnect routes actually connecting now. Scrape + * the barrel for non-eager autoconnects */ + route = ksocknal_find_connectable_route_locked (peer, 0); + if (route != NULL) { + ksocknal_launch_autoconnect_locked (route); + } else { + write_unlock_irqrestore (g_lock, flags); + return (PTL_FAIL); + } } + /* At least 1 connection is being established; queue the message... */ list_add_tail (&tx->tx_list, &peer->ksnp_tx_queue); write_unlock_irqrestore (g_lock, flags); @@ -1279,7 +1332,6 @@ ksocknal_init_fmb (ksock_conn_t *conn, ksock_fmb_t *fmb) conn->ksnc_cookie = fmb; /* stash fmb for later */ conn->ksnc_rx_state = SOCKNAL_RX_BODY_FWD; /* read in the payload */ - conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; /* start timeout */ /* payload is desc's iov-ed buffer, but skipping the hdr */ LASSERT (niov <= sizeof (conn->ksnc_rx_iov_space) / @@ -1323,7 +1375,7 @@ ksocknal_fwd_parse (ksock_conn_t *conn) dest_nid, body_len); ksocknal_new_packet (conn, 0); /* on to new packet */ - ksocknal_close_conn_unlocked (conn); /* give up on conn */ + ksocknal_close_conn_unlocked (conn, -EINVAL); /* give up on conn */ return; } @@ -1373,6 +1425,9 @@ ksocknal_new_packet (ksock_conn_t *conn, int nob_to_skip) int skipped; if (nob_to_skip == 0) { /* right at next packet boundary now */ + conn->ksnc_rx_started = 0; + mb (); /* racing with timeout thread */ + conn->ksnc_rx_state = SOCKNAL_RX_HEADER; conn->ksnc_rx_nob_wanted = sizeof (ptl_hdr_t); conn->ksnc_rx_nob_left = sizeof (ptl_hdr_t); @@ -1464,7 +1519,7 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) rc = ksocknal_recvmsg(conn); if (rc <= 0) { - if (ksocknal_close_conn_unlocked (conn)) { + if (ksocknal_close_conn_unlocked (conn, rc)) { /* I'm the first to close */ if (rc < 0) CERROR ("[%p] Error %d on read from "LPX64" ip %08x:%d\n", @@ -1478,6 +1533,7 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) goto out; } + if (conn->ksnc_rx_nob_wanted != 0) /* short read */ goto out; /* try again later */ @@ -1506,9 +1562,6 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) /* sets wanted_len, iovs etc */ lib_parse(&ksocknal_lib, &conn->ksnc_hdr, conn); - /* start timeout (lib is waiting for finalize) */ - conn->ksnc_rx_deadline = jiffies_64 + ksocknal_io_timeout; - if (conn->ksnc_rx_nob_wanted != 0) { /* need to get payload? */ conn->ksnc_rx_state = SOCKNAL_RX_BODY; goto try_read; /* go read the payload */ @@ -1517,7 +1570,6 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) case SOCKNAL_RX_BODY: /* payload all received */ - conn->ksnc_rx_deadline = 0; /* cancel timeout */ lib_finalize(&ksocknal_lib, NULL, conn->ksnc_cookie); /* Fall through */ @@ -1534,9 +1586,6 @@ ksocknal_process_receive (ksock_sched_t *sched, unsigned long *irq_flags) NTOH__u64 (conn->ksnc_hdr.dest_nid), conn->ksnc_rx_nob_left); - /* cancel timeout (only needed it while fmb allocated) */ - conn->ksnc_rx_deadline = 0; - /* forward the packet. NB ksocknal_init_fmb() put fmb into * conn->ksnc_cookie */ fmb = (ksock_fmb_t *)conn->ksnc_cookie; @@ -1631,15 +1680,20 @@ int ksocknal_scheduler (void *arg) int id = sched - ksocknal_data.ksnd_schedulers; char name[16]; - snprintf (name, sizeof (name),"ksocknald[%d]", id); + snprintf (name, sizeof (name),"ksocknald_%02d", id); kportal_daemonize (name); kportal_blockallsigs (); #if (CONFIG_SMP && CPU_AFFINITY) - if ((cpu_online_map & (1 << id)) != 0) + if ((cpu_online_map & (1 << id)) != 0) { +#if 0 current->cpus_allowed = (1 << id); - else +#else + set_cpus_allowed (current, 1<<id); +#endif + } else { CERROR ("Can't set CPU affinity for %s\n", name); + } #endif /* CONFIG_SMP && CPU_AFFINITY */ spin_lock_irqsave (&sched->kss_lock, flags); @@ -1722,7 +1776,10 @@ ksocknal_data_ready (struct sock *sk, int n) if (conn == NULL) { /* raced with ksocknal_close_sock */ LASSERT (sk->sk_data_ready != &ksocknal_data_ready); sk->sk_data_ready (sk, n); - } else if (!conn->ksnc_rx_ready) { /* new news */ + goto out; + } + + if (!conn->ksnc_rx_ready) { /* new news */ /* Set ASAP in case of concurrent calls to me */ conn->ksnc_rx_ready = 1; @@ -1747,6 +1804,7 @@ ksocknal_data_ready (struct sock *sk, int n) spin_unlock_irqrestore (&sched->kss_lock, flags); } + out: read_unlock (&ksocknal_data.ksnd_global_lock); EXIT; @@ -1776,7 +1834,12 @@ ksocknal_write_space (struct sock *sk) if (conn == NULL) { /* raced with ksocknal_close_sock */ LASSERT (sk->sk_write_space != &ksocknal_write_space); sk->sk_write_space (sk); - } else if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ + + read_unlock (&ksocknal_data.ksnd_global_lock); + return; + } + + if (tcp_wspace(sk) >= SOCKNAL_TX_LOW_WATER(sk)) { /* got enough space */ clear_bit (SOCK_NOSPACE, &sk->sk_socket->flags); if (!conn->ksnc_tx_ready) { /* new news */ @@ -1967,7 +2030,7 @@ ksocknal_exchange_nids (struct socket *sock, ptl_nid_t nid) } int -ksocknal_set_linger (struct socket *sock) +ksocknal_setup_sock (struct socket *sock) { mm_segment_t oldmm = get_fs (); int rc; @@ -1998,7 +2061,51 @@ ksocknal_set_linger (struct socket *sock) CERROR ("Can't set SO_LINGER2: %d\n", rc); return (rc); } + +#if SOCKNAL_USE_KEEPALIVES + /* Keepalives: If 3/4 of the timeout elapses, start probing every + * second until the timeout elapses. */ + + option = (ksocknal_data.ksnd_io_timeout * 3) / 4; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPIDLE, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPIDLE: %d\n", rc); + return (rc); + } + option = 1; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPINTVL, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); + return (rc); + } + + option = ksocknal_data.ksnd_io_timeout / 4; + set_fs (KERNEL_DS); + rc = sock->ops->setsockopt (sock, SOL_TCP, TCP_KEEPCNT, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set TCP_KEEPINTVL: %d\n", rc); + return (rc); + } + + option = 1; + set_fs (KERNEL_DS); + rc = sock_setsockopt (sock, SOL_SOCKET, SO_KEEPALIVE, + (char *)&option, sizeof (option)); + set_fs (oldmm); + if (rc != 0) { + CERROR ("Can't set SO_KEEPALIVE: %d\n", rc); + return (rc); + } +#endif return (0); } @@ -2007,7 +2114,6 @@ ksocknal_connect_peer (ksock_route_t *route) { struct sockaddr_in peer_addr; mm_segment_t oldmm = get_fs(); - __u64 n; struct timeval tv; int fd; struct socket *sock; @@ -2020,8 +2126,8 @@ ksocknal_connect_peer (ksock_route_t *route) } /* Ugh; have to map_fd for compatibility with sockets passed in - * from userspace. And we actually need the refcounting that - * this gives you :) */ + * from userspace. And we actually need the sock->file refcounting + * that this gives you :) */ fd = sock_map_fd (sock); if (fd < 0) { @@ -2033,22 +2139,19 @@ ksocknal_connect_peer (ksock_route_t *route) /* NB the fd now owns the ref on sock->file */ LASSERT (sock->file != NULL); LASSERT (file_count(sock->file) == 1); - + /* Set the socket timeouts, so our connection attempt completes in * finite time */ - tv.tv_sec = ksocknal_io_timeout / HZ; - n = ksocknal_io_timeout % HZ; - n = n * 1000000 + HZ - 1; - do_div (n, HZ); - tv.tv_usec = n; + tv.tv_sec = ksocknal_data.ksnd_io_timeout; + tv.tv_usec = 0; set_fs (KERNEL_DS); rc = sock_setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof (tv)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set send timeout %d (in HZ): %d\n", - ksocknal_io_timeout, rc); + CERROR ("Can't set send timeout %d: %d\n", + ksocknal_data.ksnd_io_timeout, rc); goto out; } @@ -2057,8 +2160,8 @@ ksocknal_connect_peer (ksock_route_t *route) (char *)&tv, sizeof (tv)); set_fs (oldmm); if (rc != 0) { - CERROR ("Can't set receive timeout %d (in HZ): %d\n", - ksocknal_io_timeout, rc); + CERROR ("Can't set receive timeout %d: %d\n", + ksocknal_data.ksnd_io_timeout, rc); goto out; } @@ -2154,7 +2257,7 @@ ksocknal_autoconnect (ksock_route_t *route) route->ksnr_connecting = 0; LASSERT (route->ksnr_retry_interval != 0); - route->ksnr_timeout = jiffies_64 + route->ksnr_retry_interval; + route->ksnr_timeout = jiffies + route->ksnr_retry_interval; route->ksnr_retry_interval = MIN (route->ksnr_retry_interval * 2, SOCKNAL_MAX_RECONNECT_INTERVAL); @@ -2198,7 +2301,7 @@ ksocknal_autoconnectd (void *arg) ksock_route_t *route; int rc; - snprintf (name, sizeof (name), "ksocknal_ad[%ld]", id); + snprintf (name, sizeof (name), "ksocknal_ad%02ld", id); kportal_daemonize (name); kportal_blockallsigs (); @@ -2239,55 +2342,47 @@ ksock_conn_t * ksocknal_find_timed_out_conn (ksock_peer_t *peer) { /* We're called with a shared lock on ksnd_global_lock */ - unsigned long flags; ksock_conn_t *conn; struct list_head *ctmp; - ksock_tx_t *tx; - struct list_head *ttmp; ksock_sched_t *sched; list_for_each (ctmp, &peer->ksnp_conns) { conn = list_entry (ctmp, ksock_conn_t, ksnc_list); sched = conn->ksnc_scheduler; - - if (conn->ksnc_rx_deadline != 0 && - conn->ksnc_rx_deadline <= jiffies_64) - goto timed_out; - spin_lock_irqsave (&sched->kss_lock, flags); + /* Don't need the {get,put}connsock dance to deref ksnc_sock... */ + LASSERT (!conn->ksnc_closing); - list_for_each (ttmp, &conn->ksnc_tx_queue) { - tx = list_entry (ttmp, ksock_tx_t, tx_list); - LASSERT (tx->tx_deadline != 0); - - if (tx->tx_deadline <= jiffies_64) - goto timed_out_locked; + if (conn->ksnc_rx_started && + time_after_eq (jiffies, conn->ksnc_rx_deadline)) { + /* Timed out incomplete incoming message */ + atomic_inc (&conn->ksnc_refcount); + CERROR ("Timed out RX from "LPX64" %p\n", + peer->ksnp_nid, conn); + return (conn); } -#if SOCKNAL_ZC - list_for_each (ttmp, &conn->ksnc_tx_pending) { - tx = list_entry (ttmp, ksock_tx_t, tx_list); - LASSERT (tx->tx_deadline != 0); - - if (tx->tx_deadline <= jiffies_64) - goto timed_out_locked; + + if ((!list_empty (&conn->ksnc_tx_queue) || + conn->ksnc_sock->sk->wmem_queued != 0) && + time_after_eq (jiffies, conn->ksnc_tx_deadline)) { + /* Timed out messages queued for sending, or + * messages buffered in the socket's send buffer */ + atomic_inc (&conn->ksnc_refcount); + CERROR ("Timed out TX to "LPX64" %s%d %p\n", + peer->ksnp_nid, + list_empty (&conn->ksnc_tx_queue) ? "" : "Q ", + conn->ksnc_sock->sk->wmem_queued, conn); + return (conn); } -#endif - spin_unlock_irqrestore (&sched->kss_lock, flags); - continue; - - timed_out_locked: - spin_unlock_irqrestore (&sched->kss_lock, flags); - timed_out: - atomic_inc (&conn->ksnc_refcount); - return (conn); } return (NULL); } void -ksocknal_check_peer_timeouts (struct list_head *peers) +ksocknal_check_peer_timeouts (int idx) { + struct list_head *peers = &ksocknal_data.ksnd_peers[idx]; struct list_head *ptmp; ksock_peer_t *peer; ksock_conn_t *conn; @@ -2305,7 +2400,7 @@ ksocknal_check_peer_timeouts (struct list_head *peers) if (conn != NULL) { read_unlock (&ksocknal_data.ksnd_global_lock); - if (ksocknal_close_conn_unlocked (conn)) { + if (ksocknal_close_conn_unlocked (conn, -ETIMEDOUT)) { /* I actually closed... */ CERROR ("Timeout out conn->"LPX64" ip %x:%d\n", peer->ksnp_nid, conn->ksnc_ipaddr, @@ -2330,8 +2425,9 @@ ksocknal_reaper (void *arg) unsigned long flags; ksock_conn_t *conn; int timeout; + int i; int peer_index = 0; - __u64 deadline = jiffies_64; + unsigned long deadline = jiffies; kportal_daemonize ("ksocknal_reaper"); kportal_blockallsigs (); @@ -2371,12 +2467,32 @@ ksocknal_reaper (void *arg) spin_unlock_irqrestore (&ksocknal_data.ksnd_reaper_lock, flags); - while ((timeout = deadline - jiffies_64) <= 0) { - /* Time to check for timeouts on a few more peers */ - ksocknal_check_peer_timeouts (&ksocknal_data.ksnd_peers[peer_index]); + /* careful with the jiffy wrap... */ + while ((timeout = ((int)deadline - (int)jiffies)) <= 0) { + const int n = 4; + const int p = 1; + int chunk = ksocknal_data.ksnd_peer_hash_size; + + /* Time to check for timeouts on a few more peers: I do + * checks every 'p' seconds on a proportion of the peer + * table and I need to check every connection 'n' times + * within a timeout interval, to ensure I detect a + * timeout on any connection within (n+1)/n times the + * timeout interval. */ + + if (ksocknal_data.ksnd_io_timeout > n * p) + chunk = (chunk * n * p) / + ksocknal_data.ksnd_io_timeout; + if (chunk == 0) + chunk = 1; + + for (i = 0; i < chunk; i++) { + ksocknal_check_peer_timeouts (peer_index); + peer_index = (peer_index + 1) % + ksocknal_data.ksnd_peer_hash_size; + } - peer_index = (peer_index + 1) % SOCKNAL_PEER_HASH_SIZE; - deadline += HZ; + deadline += p * HZ; } add_wait_queue (&ksocknal_data.ksnd_reaper_waitq, &wait); diff --git a/lustre/portals/knals/toenal/toenal_cb.c b/lustre/portals/knals/toenal/toenal_cb.c index abd07315ce4a3bf58779d28159570187dac1f64c..478f3c1387ed7687adaa710a45b99a960ab42ad3 100644 --- a/lustre/portals/knals/toenal/toenal_cb.c +++ b/lustre/portals/knals/toenal/toenal_cb.c @@ -431,7 +431,8 @@ ktoenal_send(nal_cb_t *nal, void *private, lib_msg_t *cookie, if ((conn = ktoenal_get_conn (nid)) == NULL) { /* It's not a peer; try to find a gateway */ - rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, &gatewaynid); + rc = kpr_lookup (&ktoenal_data.ksnd_router, nid, payload_niov, + &gatewaynid); if (rc != 0) { CERROR ("Can't route to "LPX64": router error %d\n", nid, rc); diff --git a/lustre/portals/libcfs/Makefile.am b/lustre/portals/libcfs/Makefile.am index 20d7fbd4f6a37eb7787c2cde3092765d3c905761..cf9220b5a2a2b07dba3165841f1f39854695d682 100644 --- a/lustre/portals/libcfs/Makefile.am +++ b/lustre/portals/libcfs/Makefile.am @@ -20,7 +20,7 @@ link-stamp: echo timestamp > link-stamp DEFS = -portals_SOURCES = $(LINKS) module.c proc.c debug.c +portals_SOURCES = $(LINKS) module.c proc.c debug.c lwt.c # Don't distribute any patched files. dist-hook: diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c index 63e71eec712f2e284e03aa9bc68b9f29a82cfa3c..90eb185da7243cced18e3d1186c889e0475f7340 100644 --- a/lustre/portals/libcfs/debug.c +++ b/lustre/portals/libcfs/debug.c @@ -790,41 +790,61 @@ void portals_debug_set_level(unsigned int debug_level) portal_debug = debug_level; } +void portals_run_upcall(char **argv) +{ + int rc; + int argc; + char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL}; + ENTRY; + + argv[0] = portals_upcall; + argc = 1; + while (argv[argc] != NULL) + argc++; + + LASSERT(argc >= 2); + + rc = call_usermodehelper(argv[0], argv, envp); + if (rc < 0) { + CERROR("Error %d invoking portals upcall %s %s%s%s%s%s%s%s%s; " + "check /proc/sys/portals/upcall\n", + rc, argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } else { + CERROR("Invoked portals upcall %s %s%s%s%s%s%s%s%s\n", + argv[0], argv[1], + argc < 3 ? "" : ",", argc < 3 ? "" : argv[2], + argc < 4 ? "" : ",", argc < 4 ? "" : argv[3], + argc < 5 ? "" : ",", argc < 5 ? "" : argv[4], + argc < 6 ? "" : ",..."); + } +} + void portals_run_lbug_upcall(char *file, const char *fn, const int line) { char *argv[6]; - char *envp[3]; char buf[32]; - int rc; ENTRY; snprintf (buf, sizeof buf, "%d", line); - argv[0] = portals_upcall; argv[1] = "LBUG"; argv[2] = file; argv[3] = (char *)fn; argv[4] = buf; argv[5] = NULL; - envp[0] = "HOME=/"; - envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; - envp[2] = NULL; - - rc = call_usermodehelper(argv[0], argv, envp); - if (rc < 0) { - CERROR("Error invoking lbug upcall %s %s %s %s %s: %d; check " - "/proc/sys/portals/upcall\n", - argv[0], argv[1], argv[2], argv[3], argv[4], rc); - - } else { - CERROR("Invoked upcall %s %s %s %s %s\n", - argv[0], argv[1], argv[2], argv[3], argv[4]); - } + portals_run_upcall (argv); } - EXPORT_SYMBOL(portals_debug_dumplog); EXPORT_SYMBOL(portals_debug_msg); EXPORT_SYMBOL(portals_debug_set_level); +EXPORT_SYMBOL(portals_run_upcall); EXPORT_SYMBOL(portals_run_lbug_upcall); diff --git a/lustre/portals/libcfs/lwt.c b/lustre/portals/libcfs/lwt.c new file mode 100644 index 0000000000000000000000000000000000000000..a40a7ed5127e07b7073fb2b8cd31f7e350b8588a --- /dev/null +++ b/lustre/portals/libcfs/lwt.c @@ -0,0 +1,235 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (C) 2003 Cluster File Systems, Inc. + * Author: Eric Barton <eeb@clusterfs.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define EXPORT_SYMTAB + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/kmod.h> +#include <linux/kernel.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/string.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/smp_lock.h> +#include <linux/unistd.h> +#include <linux/interrupt.h> +#include <asm/system.h> +#include <asm/uaccess.h> + +#define DEBUG_SUBSYSTEM S_PORTALS + +#include <linux/kp30.h> + +#if LWT_SUPPORT + +#define LWT_MEMORY (1<<20) /* 1Mb of trace memory */ +#define LWT_MAX_CPUS 4 + +int lwt_enabled; +int lwt_pages_per_cpu; +lwt_cpu_t lwt_cpus[LWT_MAX_CPUS]; + +/* NB only root is allowed to retrieve LWT info; it's an open door into the + * kernel... */ + +int +lwt_lookup_string (int *size, char *knl_ptr, + char *user_ptr, int user_size) +{ + /* knl_ptr was retrieved from an LWT snapshot and the caller wants to + * turn it into a string. NB we can crash with an access violation + * trying to determine the string length, so we're trusting our + * caller... */ + + if (!capable(CAP_SYS_ADMIN)) + return (-EPERM); + + *size = strlen (knl_ptr) + 1; + + if (user_ptr != NULL && + copy_to_user (user_ptr, knl_ptr, *size)) + return (-EFAULT); + + return (0); +} + +int +lwt_control (int enable, int clear) +{ + lwt_page_t *p; + int i; + int j; + + if (!capable(CAP_SYS_ADMIN)) + return (-EPERM); + + if (clear) + for (i = 0; i < num_online_cpus(); i++) { + p = lwt_cpus[i].lwtc_current_page; + + for (j = 0; j < lwt_pages_per_cpu; j++) { + + memset (p->lwtp_events, 0, PAGE_SIZE); + + p = list_entry (p->lwtp_list.next, + lwt_page_t, lwtp_list); + } + } + + lwt_enabled = enable; + mb(); + if (!enable) { + /* give people some time to stop adding traces */ + schedule_timeout(10); + } + + return (0); +} + +int +lwt_snapshot (int *ncpu, int *total_size, + void *user_ptr, int user_size) +{ + const int events_per_page = PAGE_SIZE / sizeof(lwt_event_t); + const int bytes_per_page = events_per_page * sizeof(lwt_event_t); + lwt_page_t *p; + int i; + int j; + + if (!capable(CAP_SYS_ADMIN)) + return (-EPERM); + + *ncpu = num_online_cpus(); + *total_size = num_online_cpus() * lwt_pages_per_cpu * bytes_per_page; + + if (user_ptr == NULL) + return (0); + + for (i = 0; i < num_online_cpus(); i++) { + p = lwt_cpus[i].lwtc_current_page; + + for (j = 0; j < lwt_pages_per_cpu; j++) { + if (copy_to_user(user_ptr, p->lwtp_events, + bytes_per_page)) + return (-EFAULT); + + user_ptr = ((char *)user_ptr) + bytes_per_page; + p = list_entry(p->lwtp_list.next, + lwt_page_t, lwtp_list); + + } + } + + return (0); +} + +int +lwt_init () +{ + int i; + int j; + + if (num_online_cpus() > LWT_MAX_CPUS) { + CERROR ("Too many CPUs\n"); + return (-EINVAL); + } + + /* NULL pointers, zero scalars */ + memset (lwt_cpus, 0, sizeof (lwt_cpus)); + lwt_pages_per_cpu = LWT_MEMORY / (num_online_cpus() * PAGE_SIZE); + + for (i = 0; i < num_online_cpus(); i++) + for (j = 0; j < lwt_pages_per_cpu; j++) { + struct page *page = alloc_page (GFP_KERNEL); + lwt_page_t *lwtp; + + if (page == NULL) { + CERROR ("Can't allocate page\n"); + lwt_fini (); + return (-ENOMEM); + } + + PORTAL_ALLOC(lwtp, sizeof (*lwtp)); + if (lwtp == NULL) { + CERROR ("Can't allocate lwtp\n"); + __free_page(page); + lwt_fini (); + return (-ENOMEM); + } + + lwtp->lwtp_page = page; + lwtp->lwtp_events = page_address(page); + memset (lwtp->lwtp_events, 0, PAGE_SIZE); + + if (j == 0) { + INIT_LIST_HEAD (&lwtp->lwtp_list); + lwt_cpus[i].lwtc_current_page = lwtp; + } else { + list_add (&lwtp->lwtp_list, + &lwt_cpus[i].lwtc_current_page->lwtp_list); + } + } + + lwt_enabled = 1; + mb(); + + return (0); +} + +void +lwt_fini () +{ + int i; + + if (num_online_cpus() > LWT_MAX_CPUS) + return; + + for (i = 0; i < num_online_cpus(); i++) + while (lwt_cpus[i].lwtc_current_page != NULL) { + lwt_page_t *lwtp = lwt_cpus[i].lwtc_current_page; + + if (list_empty (&lwtp->lwtp_list)) { + lwt_cpus[i].lwtc_current_page = NULL; + } else { + lwt_cpus[i].lwtc_current_page = + list_entry (lwtp->lwtp_list.next, + lwt_page_t, lwtp_list); + + list_del (&lwtp->lwtp_list); + } + + __free_page (lwtp->lwtp_page); + PORTAL_FREE (lwtp, sizeof (*lwtp)); + } +} + +EXPORT_SYMBOL(lwt_enabled); +EXPORT_SYMBOL(lwt_cpus); + +EXPORT_SYMBOL(lwt_init); +EXPORT_SYMBOL(lwt_fini); +EXPORT_SYMBOL(lwt_lookup_string); +EXPORT_SYMBOL(lwt_control); +EXPORT_SYMBOL(lwt_snapshot); +#endif diff --git a/lustre/portals/libcfs/module.c b/lustre/portals/libcfs/module.c index c72dffc07bdca7cb57fe97876e1673d35caf0681..11e4f4c0e42fdbb613f612a1d07a217d81fa2144 100644 --- a/lustre/portals/libcfs/module.c +++ b/lustre/portals/libcfs/module.c @@ -122,8 +122,8 @@ static inline void freedata(void *data, int len) } static int -kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, - ptl_nid_t hi_nid) +kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid) { int rc; kpr_control_interface_t *ci; @@ -139,7 +139,8 @@ kportal_add_route(int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, } static int -kportal_del_route(ptl_nid_t target) +kportal_del_route(int gw_nalid, ptl_nid_t gw_nid, + ptl_nid_t lo, ptl_nid_t hi) { int rc; kpr_control_interface_t *ci; @@ -148,7 +149,24 @@ kportal_del_route(ptl_nid_t target) if (ci == NULL) return (-ENODEV); - rc = ci->kprci_del_route (target); + rc = ci->kprci_del_route (gw_nalid, gw_nid, lo, hi); + + PORTAL_SYMBOL_PUT(kpr_control_interface); + return (rc); +} + +static int +kportal_notify_router (int gw_nalid, ptl_nid_t gw_nid, + int alive, time_t when) +{ + int rc; + kpr_control_interface_t *ci; + + ci = (kpr_control_interface_t *)PORTAL_SYMBOL_GET(kpr_control_interface); + if (ci == NULL) + return (-ENODEV); + + rc = ci->kprci_notify (gw_nalid, gw_nid, alive, when); PORTAL_SYMBOL_PUT(kpr_control_interface); return (rc); @@ -156,12 +174,13 @@ kportal_del_route(ptl_nid_t target) static int kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, - ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp) + ptl_nid_t *lo_nidp, ptl_nid_t *hi_nidp, int *alivep) { int gateway_nalid; ptl_nid_t gateway_nid; ptl_nid_t lo_nid; ptl_nid_t hi_nid; + int alive; int rc; kpr_control_interface_t *ci; @@ -169,17 +188,19 @@ kportal_get_route(int index, __u32 *gateway_nalidp, ptl_nid_t *gateway_nidp, if (ci == NULL) return (-ENODEV); - rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, &lo_nid, - &hi_nid); + rc = ci->kprci_get_route(index, &gateway_nalid, &gateway_nid, + &lo_nid, &hi_nid, &alive); if (rc == 0) { - CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64"\n", - index, gateway_nalid, gateway_nid, lo_nid, hi_nid); + CDEBUG(D_IOCTL, "got route [%d] %d "LPX64":"LPX64" - "LPX64", %s\n", + index, gateway_nalid, gateway_nid, lo_nid, hi_nid, + alive ? "up" : "down"); *gateway_nalidp = (__u32)gateway_nalid; - *gateway_nidp = (__u32)gateway_nid; - *lo_nidp = (__u32)lo_nid; - *hi_nidp = (__u32)hi_nid; + *gateway_nidp = gateway_nid; + *lo_nidp = lo_nid; + *hi_nidp = hi_nid; + *alivep = alive; } PORTAL_SYMBOL_PUT (kpr_control_interface); @@ -367,23 +388,38 @@ static int kportal_ioctl(struct inode *inode, struct file *file, case IOC_PORTAL_ADD_ROUTE: CDEBUG(D_IOCTL, "Adding route: [%d] "LPU64" : "LPU64" - "LPU64"\n", - data->ioc_nal, data->ioc_nid, data->ioc_nid2, - data->ioc_nid3); + data->ioc_nal, data->ioc_nid, + data->ioc_nid2, data->ioc_nid3); err = kportal_add_route(data->ioc_nal, data->ioc_nid, - MIN (data->ioc_nid2, data->ioc_nid3), - MAX (data->ioc_nid2, data->ioc_nid3)); + data->ioc_nid2, data->ioc_nid3); break; case IOC_PORTAL_DEL_ROUTE: - CDEBUG (D_IOCTL, "Removing route to "LPU64"\n", data->ioc_nid); - err = kportal_del_route (data->ioc_nid); + CDEBUG (D_IOCTL, "Removing routes via [%d] "LPU64" : "LPU64" - "LPU64"\n", + data->ioc_nal, data->ioc_nid, + data->ioc_nid2, data->ioc_nid3); + err = kportal_del_route (data->ioc_nal, data->ioc_nid, + data->ioc_nid2, data->ioc_nid3); break; + case IOC_PORTAL_NOTIFY_ROUTER: { + CDEBUG (D_IOCTL, "Notifying peer [%d] "LPU64" %s @ %ld\n", + data->ioc_nal, data->ioc_nid, + data->ioc_flags ? "Enabling" : "Disabling", + (time_t)data->ioc_nid3); + + err = kportal_notify_router (data->ioc_nal, data->ioc_nid, + data->ioc_flags, + (time_t)data->ioc_nid3); + break; + } + case IOC_PORTAL_GET_ROUTE: CDEBUG (D_IOCTL, "Getting route [%d]\n", data->ioc_count); err = kportal_get_route(data->ioc_count, &data->ioc_nal, - &data->ioc_nid, &data->ioc_nid2, - &data->ioc_nid3); + &data->ioc_nid, + &data->ioc_nid2, &data->ioc_nid3, + &data->ioc_flags); if (err == 0) if (copy_to_user((char *)arg, data, sizeof (*data))) err = -EFAULT; @@ -432,7 +468,27 @@ static int kportal_ioctl(struct inode *inode, struct file *file, kportal_put_ni (data->ioc_nal); break; } - +#if LWT_SUPPORT + case IOC_PORTAL_LWT_CONTROL: + err = lwt_control (data->ioc_flags, data->ioc_misc); + break; + + case IOC_PORTAL_LWT_SNAPSHOT: + err = lwt_snapshot (&data->ioc_count, &data->ioc_misc, + data->ioc_pbuf1, data->ioc_plen1); + if (err == 0 && + copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; + + case IOC_PORTAL_LWT_LOOKUP_STRING: + err = lwt_lookup_string (&data->ioc_count, data->ioc_pbuf1, + data->ioc_pbuf2, data->ioc_plen2); + if (err == 0 && + copy_to_user((char *)arg, data, sizeof (*data))) + err = -EFAULT; + break; +#endif default: err = -EINVAL; break; @@ -471,12 +527,19 @@ static int init_kportals_module(void) return (rc); } +#if LWT_SUPPORT + rc = lwt_init(); + if (rc != 0) { + CERROR("lwt_init: error %d\n", rc); + goto cleanup_debug; + } +#endif sema_init(&nal_cmd_sem, 1); rc = misc_register(&portal_dev); if (rc) { CERROR("misc_register: error %d\n", rc); - goto cleanup_debug; + goto cleanup_lwt; } rc = PtlInit(); @@ -498,6 +561,10 @@ static int init_kportals_module(void) PtlFini(); cleanup_deregister: misc_deregister(&portal_dev); + cleanup_lwt: +#if LWT_SUPPORT + lwt_fini(); +#endif cleanup_debug: portals_debug_cleanup(); return rc; @@ -518,6 +585,10 @@ static void exit_kportals_module(void) if (rc) CERROR("misc_deregister error %d\n", rc); +#if LWT_SUPPORT + lwt_fini(); +#endif + if (atomic_read(&portal_kmemory) != 0) CERROR("Portals memory leaked: %d bytes\n", atomic_read(&portal_kmemory)); diff --git a/lustre/portals/router/router.c b/lustre/portals/router/router.c index 27a7fbae5f2e0183c636a7aba63d233e033e580e..a03fb423bd70069e4ae5ddcd3aff44baedbd284c 100644 --- a/lustre/portals/router/router.c +++ b/lustre/portals/router/router.c @@ -24,6 +24,7 @@ #include "router.h" LIST_HEAD(kpr_routes); +LIST_HEAD(kpr_gateways); LIST_HEAD(kpr_nals); unsigned long long kpr_fwd_bytes; @@ -42,6 +43,7 @@ kpr_router_interface_t kpr_router_interface = { kprri_lookup: kpr_lookup_target, kprri_fwd_start: kpr_forward_packet, kprri_fwd_done: kpr_complete_packet, + kprri_notify: kpr_nal_notify, kprri_shutdown: kpr_shutdown_nal, kprri_deregister: kpr_deregister_nal, }; @@ -50,6 +52,7 @@ kpr_control_interface_t kpr_control_interface = { kprci_add_route: kpr_add_route, kprci_del_route: kpr_del_route, kprci_get_route: kpr_get_route, + kprci_notify: kpr_sys_notify, }; int @@ -59,7 +62,7 @@ kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) struct list_head *e; kpr_nal_entry_t *ne; - CDEBUG (D_OTHER, "Registering NAL %d\n", nalif->kprni_nalid); + CDEBUG (D_NET, "Registering NAL %d\n", nalif->kprni_nalid); PORTAL_ALLOC (ne, sizeof (*ne)); if (ne == NULL) @@ -95,13 +98,182 @@ kpr_register_nal (kpr_nal_interface_t *nalif, void **argp) return (0); } +void +kpr_do_upcall (void *arg) +{ + kpr_upcall_t *u = (kpr_upcall_t *)arg; + char nalstr[10]; + char nidstr[36]; + char whenstr[36]; + char *argv[] = { + NULL, + "ROUTER_NOTIFY", + nalstr, + nidstr, + u->kpru_alive ? "up" : "down", + whenstr, + NULL}; + + snprintf (nalstr, sizeof(nalstr), "%d", u->kpru_nal_id); + snprintf (nidstr, sizeof(nidstr), LPX64, u->kpru_nid); + snprintf (whenstr, sizeof(whenstr), "%ld", u->kpru_when); + + portals_run_upcall (argv); +} + +void +kpr_upcall (int gw_nalid, ptl_nid_t gw_nid, int alive, time_t when) +{ + /* May be in arbitrary context */ + kpr_upcall_t *u = kmalloc (sizeof (kpr_upcall_t), GFP_ATOMIC); + + if (u == NULL) { + CERROR ("Upcall out of memory: nal %d nid "LPX64" %s\n", + gw_nalid, gw_nid, alive ? "up" : "down"); + return; + } + + u->kpru_nal_id = gw_nalid; + u->kpru_nid = gw_nid; + u->kpru_alive = alive; + u->kpru_when = when; + + prepare_work (&u->kpru_tq, kpr_do_upcall, u); + schedule_work (&u->kpru_tq); +} + +int +kpr_do_notify (int byNal, int gateway_nalid, ptl_nid_t gateway_nid, + int alive, time_t when) +{ + unsigned long flags; + int rc = -ENOENT; + kpr_nal_entry_t *ne = NULL; + kpr_gateway_entry_t *ge = NULL; + struct timeval now; + struct list_head *e; + struct list_head *n; + + CDEBUG (D_ERROR, "%s notifying [%d] "LPX64": %s\n", + byNal ? "NAL" : "userspace", + gateway_nalid, gateway_nid, alive ? "up" : "down"); + + /* can't do predictions... */ + do_gettimeofday (&now); + if (when > now.tv_sec) { + CERROR ("Ignoring prediction from %s of [%d] "LPX64" %s " + "%ld seconds in the future\n", + byNal ? "NAL" : "userspace", + gateway_nalid, gateway_nid, alive ? "up" : "down", + when - now.tv_sec); + return (EINVAL); + } + + LASSERT (when <= now.tv_sec); + + /* Serialise with lookups (i.e. write lock) */ + write_lock_irqsave(&kpr_rwlock, flags); + + list_for_each_safe (e, n, &kpr_gateways) { + + ge = list_entry(e, kpr_gateway_entry_t, kpge_list); + if ((gateway_nalid != 0 && + ge->kpge_nalid != gateway_nalid) || + ge->kpge_nid != gateway_nid) + continue; + + rc = 0; + break; + } + + if (rc != 0) { + /* gateway not found */ + write_unlock_irqrestore(&kpr_rwlock, flags); + CERROR ("Gateway not found\n"); + return (rc); + } + + if (when < ge->kpge_timestamp) { + /* out of date information */ + write_unlock_irqrestore (&kpr_rwlock, flags); + CERROR ("Out of date\n"); + return (0); + } + + /* update timestamp */ + ge->kpge_timestamp = when; + + if ((!ge->kpge_alive) == (!alive)) { + /* new date for old news */ + write_unlock_irqrestore (&kpr_rwlock, flags); + CERROR ("Old news\n"); + return (0); + } + + ge->kpge_alive = alive; + CDEBUG(D_NET, "set "LPX64" [%p] %d\n", gateway_nid, ge, alive); + + if (alive) { + /* Reset all gateway weights so the newly-enabled gateway + * doesn't have to play catch-up */ + list_for_each_safe (e, n, &kpr_gateways) { + kpr_gateway_entry_t *ge = list_entry(e, kpr_gateway_entry_t, + kpge_list); + atomic_set (&ge->kpge_weight, 0); + } + } + + if (!byNal) { + /* userland notified me: notify NAL? */ + ne = kpr_find_nal_entry_locked (ge->kpge_nalid); + if (ne != NULL) { + if (ne->kpne_shutdown || + ne->kpne_interface.kprni_notify == NULL) { + /* no need to notify */ + ne = NULL; + } else { + /* take a ref on this NAL until notifying + * it has completed... */ + atomic_inc (&ne->kpne_refcount); + } + } + } + + write_unlock_irqrestore(&kpr_rwlock, flags); + + if (ne != NULL) { + ne->kpne_interface.kprni_notify (ne->kpne_interface.kprni_arg, + gateway_nid, alive); + /* 'ne' can disappear now... */ + atomic_dec (&ne->kpne_refcount); + } + + if (byNal) { + /* It wasn't userland that notified me... */ + CERROR ("Doing upcall\n"); + kpr_upcall (gateway_nalid, gateway_nid, alive, when); + } else { + CERROR (" NOT Doing upcall\n"); + } + + return (0); +} + +void +kpr_nal_notify (void *arg, ptl_nid_t peer, int alive, time_t when) +{ + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + + kpr_do_notify (1, ne->kpne_interface.kprni_nalid, peer, alive, when); +} + void kpr_shutdown_nal (void *arg) { unsigned long flags; kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - CDEBUG (D_OTHER, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "Shutting down NAL %d\n", ne->kpne_interface.kprni_nalid); LASSERT (!ne->kpne_shutdown); LASSERT (!in_interrupt()); @@ -126,7 +298,7 @@ kpr_deregister_nal (void *arg) unsigned long flags; kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - CDEBUG (D_OTHER, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "Deregister NAL %d\n", ne->kpne_interface.kprni_nalid); LASSERT (ne->kpne_shutdown); /* caller must have issued shutdown already */ LASSERT (atomic_read (&ne->kpne_refcount) == 0); /* can't be busy */ @@ -142,15 +314,58 @@ kpr_deregister_nal (void *arg) PORTAL_MODULE_UNUSE; } +int +kpr_ge_isbetter (kpr_gateway_entry_t *ge1, kpr_gateway_entry_t *ge2) +{ + const int significant_bits = 0x00ffffff; + /* We use atomic_t to record/compare route weights for + * load-balancing. Here we limit ourselves to only using + * 'significant_bits' when we do an 'after' comparison */ + + int diff = (atomic_read (&ge1->kpge_weight) - + atomic_read (&ge2->kpge_weight)) & significant_bits; + int rc = (diff > (significant_bits >> 1)); + + CDEBUG(D_NET, "[%p]"LPX64"=%d %s [%p]"LPX64"=%d\n", + ge1, ge1->kpge_nid, atomic_read (&ge1->kpge_weight), + rc ? ">" : "<", + ge2, ge2->kpge_nid, atomic_read (&ge2->kpge_weight)); + + return (rc); +} + +void +kpr_update_weight (kpr_gateway_entry_t *ge, int nob) +{ + int weight = 1 + (nob + sizeof (ptl_hdr_t)/2)/sizeof (ptl_hdr_t); + + /* We've chosen this route entry (i.e. gateway) to forward payload + * of length 'nob'; update the route's weight to make it less + * favoured. Note that the weight is 1 plus the payload size + * rounded and scaled to the portals header size, so we get better + * use of the significant bits in kpge_weight. */ + + CDEBUG(D_NET, "gateway [%p]"LPX64" += %d\n", ge, + ge->kpge_nid, weight); + + atomic_add (weight, &ge->kpge_weight); +} int -kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) +kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, + ptl_nid_t *gateway_nidp) { - kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; - struct list_head *e; - int rc = -ENOENT; + kpr_nal_entry_t *ne = (kpr_nal_entry_t *)arg; + struct list_head *e; + kpr_route_entry_t *re; + kpr_gateway_entry_t *ge = NULL; + int rc = -ENOENT; - CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d\n", target_nid, ne->kpne_interface.kprni_nalid); + /* Caller wants to know if 'target_nid' can be reached via a gateway + * ON HER OWN NETWORK */ + + CDEBUG (D_NET, "lookup "LPX64" from NAL %d\n", target_nid, + ne->kpne_interface.kprni_nalid); if (ne->kpne_shutdown) /* caller is shutting down */ return (-ENOENT); @@ -159,9 +374,8 @@ kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) /* Search routes for one that has a gateway to target_nid on the callers network */ - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) - { - kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + list_for_each (e, &kpr_routes) { + re = list_entry (e, kpr_route_entry_t, kpre_list); if (re->kpre_lo_nid > target_nid || re->kpre_hi_nid < target_nid) @@ -169,33 +383,66 @@ kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp) /* found table entry */ - if (re->kpre_gateway_nalid != ne->kpne_interface.kprni_nalid) /* different NAL */ - rc = -EHOSTUNREACH; - else - { - rc = 0; - *gateway_nidp = re->kpre_gateway_nid; - } - break; + if (re->kpre_gateway->kpge_nalid != ne->kpne_interface.kprni_nalid || + !re->kpre_gateway->kpge_alive) { + /* different NAL or gateway down */ + rc = -EHOSTUNREACH; + continue; + } + + if (ge == NULL || + kpr_ge_isbetter (re->kpre_gateway, ge)) + ge = re->kpre_gateway; } + if (ge != NULL) { + kpr_update_weight (ge, nob); + *gateway_nidp = ge->kpge_nid; + rc = 0; + } + read_unlock (&kpr_rwlock); - CDEBUG (D_OTHER, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", + /* NB can't deref 're' now; it might have been removed! */ + + CDEBUG (D_NET, "lookup "LPX64" from NAL %d: %d ("LPX64")\n", target_nid, ne->kpne_interface.kprni_nalid, rc, (rc == 0) ? *gateway_nidp : (ptl_nid_t)0); return (rc); } +kpr_nal_entry_t * +kpr_find_nal_entry_locked (int nal_id) +{ + struct list_head *e; + + /* Called with kpr_rwlock held */ + + list_for_each (e, &kpr_nals) { + kpr_nal_entry_t *ne = list_entry (e, kpr_nal_entry_t, kpne_list); + + if (nal_id != ne->kpne_interface.kprni_nalid) /* no match */ + continue; + + return (ne); + } + + return (NULL); +} + void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) { - kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; - ptl_nid_t target_nid = fwd->kprfd_target_nid; - int nob = fwd->kprfd_nob; - struct list_head *e; - - CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d\n", fwd, + kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)arg; + ptl_nid_t target_nid = fwd->kprfd_target_nid; + int nob = fwd->kprfd_nob; + kpr_gateway_entry_t *ge = NULL; + kpr_nal_entry_t *dst_ne = NULL; + struct list_head *e; + kpr_route_entry_t *re; + kpr_nal_entry_t *tmp_ne; + + CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d\n", fwd, target_nid, src_ne->kpne_interface.kprni_nalid); LASSERT (nob >= sizeof (ptl_hdr_t)); /* at least got a packet header */ @@ -216,53 +463,58 @@ kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd) /* Search routes for one that has a gateway to target_nid NOT on the caller's network */ - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) - { - kpr_route_entry_t *re = list_entry (e, kpr_route_entry_t, kpre_list); + list_for_each (e, &kpr_routes) { + re = list_entry (e, kpr_route_entry_t, kpre_list); if (re->kpre_lo_nid > target_nid || /* no match */ re->kpre_hi_nid < target_nid) continue; - CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: match "LPX64" on NAL %d\n", fwd, - target_nid, src_ne->kpne_interface.kprni_nalid, - re->kpre_gateway_nid, re->kpre_gateway_nalid); - - if (re->kpre_gateway_nalid == src_ne->kpne_interface.kprni_nalid) - break; /* don't route to same NAL */ + if (re->kpre_gateway->kpge_nalid == src_ne->kpne_interface.kprni_nalid) + continue; /* don't route to same NAL */ - /* Search for gateway's NAL's entry */ - - for (e = kpr_nals.next; e != &kpr_nals; e = e->next) - { - kpr_nal_entry_t *dst_ne = list_entry (e, kpr_nal_entry_t, kpne_list); + if (!re->kpre_gateway->kpge_alive) + continue; /* gateway is dead */ + + tmp_ne = kpr_find_nal_entry_locked (re->kpre_gateway->kpge_nalid); - if (re->kpre_gateway_nalid != dst_ne->kpne_interface.kprni_nalid) /* no match */ - continue; + if (tmp_ne == NULL || + tmp_ne->kpne_shutdown) { + /* NAL must be registered and not shutting down */ + continue; + } - if (dst_ne->kpne_shutdown) /* don't route if NAL is shutting down */ - break; + if (ge == NULL || + kpr_ge_isbetter (re->kpre_gateway, ge)) { + ge = re->kpre_gateway; + dst_ne = tmp_ne; + } + } + + if (ge != NULL) { + LASSERT (dst_ne != NULL); + + kpr_update_weight (ge, nob); - fwd->kprfd_gateway_nid = re->kpre_gateway_nid; - atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ + fwd->kprfd_gateway_nid = ge->kpge_nid; + atomic_inc (&dst_ne->kpne_refcount); /* dest nal is busy until fwd completes */ - read_unlock (&kpr_rwlock); + read_unlock (&kpr_rwlock); - CDEBUG (D_OTHER, "forward [%p] "LPX64" from NAL %d: "LPX64" on NAL %d\n", fwd, - target_nid, src_ne->kpne_interface.kprni_nalid, - fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); + CDEBUG (D_NET, "forward [%p] "LPX64" from NAL %d: " + "to "LPX64" on NAL %d\n", + fwd, target_nid, src_ne->kpne_interface.kprni_nalid, + fwd->kprfd_gateway_nid, dst_ne->kpne_interface.kprni_nalid); - dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); - return; - } - break; + dst_ne->kpne_interface.kprni_fwd (dst_ne->kpne_interface.kprni_arg, fwd); + return; } - read_unlock (&kpr_rwlock); + read_unlock (&kpr_rwlock); out: kpr_fwd_errors++; - CDEBUG (D_OTHER, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, + CDEBUG (D_NET, "Failed to forward [%p] "LPX64" from NAL %d\n", fwd, target_nid, src_ne->kpne_interface.kprni_nalid); /* Can't find anywhere to forward to */ @@ -278,14 +530,14 @@ kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) kpr_nal_entry_t *dst_ne = (kpr_nal_entry_t *)arg; kpr_nal_entry_t *src_ne = (kpr_nal_entry_t *)fwd->kprfd_router_arg; - CDEBUG (D_OTHER, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, + CDEBUG (D_NET, "complete(1) [%p] from NAL %d to NAL %d: %d\n", fwd, src_ne->kpne_interface.kprni_nalid, dst_ne->kpne_interface.kprni_nalid, error); atomic_dec (&dst_ne->kpne_refcount); /* CAVEAT EMPTOR dst_ne can disappear now!!! */ (fwd->kprfd_callback)(fwd->kprfd_callback_arg, error); - CDEBUG (D_OTHER, "complete(2) [%p] from NAL %d: %d\n", fwd, + CDEBUG (D_NET, "complete(2) [%p] from NAL %d: %d\n", fwd, src_ne->kpne_interface.kprni_nalid, error); atomic_dec (&kpr_queue_depth); @@ -293,49 +545,76 @@ kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error) } int -kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, - ptl_nid_t hi_nid) +kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, + ptl_nid_t lo_nid, ptl_nid_t hi_nid) { - unsigned long flags; - struct list_head *e; - kpr_route_entry_t *re; + unsigned long flags; + struct list_head *e; + kpr_route_entry_t *re; + kpr_gateway_entry_t *ge; + int dup = 0; - CDEBUG(D_OTHER, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", + CDEBUG(D_NET, "Add route: %d "LPX64" : "LPX64" - "LPX64"\n", gateway_nalid, gateway_nid, lo_nid, hi_nid); - LASSERT(lo_nid <= hi_nid); + if (gateway_nalid == PTL_NID_ANY || + lo_nid == PTL_NID_ANY || + hi_nid == PTL_NID_ANY || + lo_nid > hi_nid) + return (-EINVAL); + + PORTAL_ALLOC (ge, sizeof (*ge)); + if (ge == NULL) + return (-ENOMEM); + + ge->kpge_nalid = gateway_nalid; + ge->kpge_nid = gateway_nid; + ge->kpge_alive = 1; + ge->kpge_timestamp = 0; + ge->kpge_refcount = 0; + atomic_set (&ge->kpge_weight, 0); PORTAL_ALLOC (re, sizeof (*re)); if (re == NULL) return (-ENOMEM); - re->kpre_gateway_nalid = gateway_nalid; - re->kpre_gateway_nid = gateway_nid; re->kpre_lo_nid = lo_nid; re->kpre_hi_nid = hi_nid; LASSERT(!in_interrupt()); write_lock_irqsave (&kpr_rwlock, flags); - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re2 = list_entry(e, kpr_route_entry_t, - kpre_list); - - if (re->kpre_lo_nid > re2->kpre_hi_nid || - re->kpre_hi_nid < re2->kpre_lo_nid) - continue; + list_for_each (e, &kpr_gateways) { + kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, + kpge_list); + + if (ge2->kpge_nalid == gateway_nalid && + ge2->kpge_nid == gateway_nid) { + PORTAL_FREE (ge, sizeof (*ge)); + ge = ge2; + dup = 1; + break; + } + } - CERROR ("Attempt to add duplicate routes ["LPX64" - "LPX64"]" - "to ["LPX64" - "LPX64"]\n", - re->kpre_lo_nid, re->kpre_hi_nid, - re2->kpre_lo_nid, re2->kpre_hi_nid); + if (!dup) { + /* Adding a new gateway... */ + + list_add (&ge->kpge_list, &kpr_gateways); - write_unlock_irqrestore (&kpr_rwlock, flags); + /* ...zero all gateway weights so this one doesn't have to + * play catch-up */ - PORTAL_FREE (re, sizeof (*re)); - return (-EINVAL); + list_for_each (e, &kpr_gateways) { + kpr_gateway_entry_t *ge2 = list_entry(e, kpr_gateway_entry_t, + kpge_list); + atomic_set (&ge2->kpge_weight, 0); + } + } + re->kpre_gateway = ge; + ge->kpge_refcount++; list_add (&re->kpre_list, &kpr_routes); write_unlock_irqrestore (&kpr_rwlock, flags); @@ -343,49 +622,82 @@ kpr_add_route (int gateway_nalid, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, } int -kpr_del_route (ptl_nid_t nid) +kpr_sys_notify (int gateway_nalid, ptl_nid_t gateway_nid, + int alive, time_t when) +{ + return (kpr_do_notify (0, gateway_nalid, gateway_nid, alive, when)); +} + +int +kpr_del_route (int gw_nalid, ptl_nid_t gw_nid, + ptl_nid_t lo, ptl_nid_t hi) { + int specific = (lo != PTL_NID_ANY); unsigned long flags; + int rc = -ENOENT; struct list_head *e; + struct list_head *n; - CDEBUG(D_OTHER, "Del route "LPX64"\n", nid); + CDEBUG(D_NET, "Del route [%d] "LPX64" : "LPX64" - "LPX64"\n", + gw_nalid, gw_nid, lo, hi); LASSERT(!in_interrupt()); + + /* NB Caller may specify either all routes via the given gateway + * (lo/hi == PTL_NID_ANY) or a specific route entry (lo/hi are + * actual NIDs) */ + + if (specific ? (hi == PTL_NID_ANY || hi < lo) : (hi != PTL_NID_ANY)) + return (-EINVAL); + write_lock_irqsave(&kpr_rwlock, flags); - for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + list_for_each_safe (e, n, &kpr_routes) { + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, kpre_list); - - if (re->kpre_lo_nid > nid || re->kpre_hi_nid < nid) + kpr_gateway_entry_t *ge = re->kpre_gateway; + + if (ge->kpge_nalid != gw_nalid || + ge->kpge_nid != gw_nid || + (specific && + (lo != re->kpre_lo_nid || hi != re->kpre_hi_nid))) continue; - list_del (&re->kpre_list); - write_unlock_irqrestore(&kpr_rwlock, flags); + rc = 0; + if (--ge->kpge_refcount == 0) { + list_del (&ge->kpge_list); + PORTAL_FREE (ge, sizeof (*ge)); + } + + list_del (&re->kpre_list); PORTAL_FREE(re, sizeof (*re)); - return (0); + + if (specific) + break; } write_unlock_irqrestore(&kpr_rwlock, flags); - return (-ENOENT); + return (rc); } int -kpr_get_route(int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid) +kpr_get_route (int idx, int *gateway_nalid, ptl_nid_t *gateway_nid, + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive) { struct list_head *e; read_lock(&kpr_rwlock); for (e = kpr_routes.next; e != &kpr_routes; e = e->next) { - kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, - kpre_list); - + kpr_route_entry_t *re = list_entry(e, kpr_route_entry_t, + kpre_list); + kpr_gateway_entry_t *ge = re->kpre_gateway; + if (idx-- == 0) { - *gateway_nalid = re->kpre_gateway_nalid; - *gateway_nid = re->kpre_gateway_nid; + *gateway_nalid = ge->kpge_nalid; + *gateway_nid = ge->kpge_nid; + *alive = ge->kpge_alive; *lo_nid = re->kpre_lo_nid; *hi_nid = re->kpre_hi_nid; diff --git a/lustre/portals/router/router.h b/lustre/portals/router/router.h index b8c3bec4e83d001aa2888109092a4101917bf892..c5cc1d3efdfb4a9f8595b9f68db6d4a0942f7b34 100644 --- a/lustre/portals/router/router.h +++ b/lustre/portals/router/router.h @@ -48,19 +48,42 @@ typedef struct int kpne_shutdown; } kpr_nal_entry_t; +typedef struct +{ + struct list_head kpge_list; + atomic_t kpge_weight; + time_t kpge_timestamp; + int kpge_alive; + int kpge_nalid; + int kpge_refcount; + ptl_nid_t kpge_nid; +} kpr_gateway_entry_t; + typedef struct { struct list_head kpre_list; - int kpre_gateway_nalid; - ptl_nid_t kpre_gateway_nid; + kpr_gateway_entry_t *kpre_gateway; ptl_nid_t kpre_lo_nid; ptl_nid_t kpre_hi_nid; } kpr_route_entry_t; +typedef struct +{ + struct tq_struct kpru_tq; + int kpru_nal_id; + ptl_nid_t kpru_nid; + int kpru_alive; + time_t kpru_when; +} kpr_upcall_t; + extern int kpr_register_nal (kpr_nal_interface_t *nalif, void **argp); -extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, ptl_nid_t *gateway_nidp); +extern int kpr_lookup_target (void *arg, ptl_nid_t target_nid, int nob, + ptl_nid_t *gateway_nidp); +extern kpr_nal_entry_t *kpr_find_nal_entry_locked (int nal_id); extern void kpr_forward_packet (void *arg, kpr_fwd_desc_t *fwd); extern void kpr_complete_packet (void *arg, kpr_fwd_desc_t *fwd, int error); +extern void kpr_nal_notify (void *arg, ptl_nid_t peer, + int alive, time_t when); extern void kpr_shutdown_nal (void *arg); extern void kpr_deregister_nal (void *arg); @@ -69,9 +92,12 @@ extern void kpr_proc_fini (void); extern int kpr_add_route (int gateway_nal, ptl_nid_t gateway_nid, ptl_nid_t lo_nid, ptl_nid_t hi_nid); -extern int kpr_del_route (ptl_nid_t nid); +extern int kpr_del_route (int gw_nal, ptl_nid_t gw_nid, + ptl_nid_t lo, ptl_nid_t hi); extern int kpr_get_route (int idx, int *gateway_nal, ptl_nid_t *gateway_nid, - ptl_nid_t *lo_nid, ptl_nid_t *hi_nid); + ptl_nid_t *lo_nid, ptl_nid_t *hi_nid, int *alive); +extern int kpr_sys_notify (int gw_nalid, ptl_nid_t gw_nid, + int alive, time_t when); extern unsigned long long kpr_fwd_bytes; extern unsigned long kpr_fwd_packets; diff --git a/lustre/portals/utils/parser.c b/lustre/portals/utils/parser.c index 4d93645b164d3931285e29a87b83e0b43b3658eb..eccf50704800952aaf7a9f14e3bdabf41d952997 100644 --- a/lustre/portals/utils/parser.c +++ b/lustre/portals/utils/parser.c @@ -676,6 +676,7 @@ int Parser_bool (int *b, char *str) { if (!strcasecmp (str, "no") || !strcasecmp (str, "n") || !strcasecmp (str, "off") || + !strcasecmp (str, "down") || !strcasecmp (str, "disable")) { *b = 0; @@ -685,6 +686,7 @@ int Parser_bool (int *b, char *str) { if (!strcasecmp (str, "yes") || !strcasecmp (str, "y") || !strcasecmp (str, "on") || + !strcasecmp (str, "up") || !strcasecmp (str, "enable")) { *b = 1; diff --git a/lustre/portals/utils/portals.c b/lustre/portals/utils/portals.c index 4a05234e9a1838b827ca22baf08763a6415c1eeb..5fe1777d64dba9647c188ee5f3e48b1a8f5223ab 100644 --- a/lustre/portals/utils/portals.c +++ b/lustre/portals/utils/portals.c @@ -60,6 +60,7 @@ typedef struct } name2num_t; static name2num_t nalnames[] = { + {"any", 0}, {"tcp", SOCKNAL}, {"toe", TOENAL}, {"elan", QSWNAL}, @@ -95,7 +96,7 @@ ptl_name2nal (char *str) { name2num_t *e = name2num_lookup_name (nalnames, str); - return ((e == NULL) ? 0 : e->num); + return ((e == NULL) ? -1 : e->num); } static char * @@ -127,6 +128,49 @@ ptl_gethostbyname(char * hname) { return he; } +int +ptl_parse_port (int *port, char *str) +{ + char *end; + + *port = strtol (str, &end, 0); + + if (*end == 0 && /* parsed whole string */ + *port > 0 && *port < 65536) /* minimal sanity check */ + return (0); + + return (-1); +} + +int +ptl_parse_time (time_t *t, char *str) +{ + char *end; + int n; + struct tm tm; + + *t = strtol (str, &end, 0); + if (*end == 0) /* parsed whole string */ + return (0); + + memset (&tm, 0, sizeof (tm)); + n = sscanf (str, "%d-%d-%d %d:%d:%d", + &tm.tm_year, &tm.tm_mon, &tm.tm_mday, + &tm.tm_hour, &tm.tm_min, &tm.tm_sec); + if (n != 6) + return (-1); + + tm.tm_mon--; /* convert to 0 == Jan */ + tm.tm_year -= 1900; /* y2k quirk */ + tm.tm_isdst = -1; /* dunno if it's daylight savings... */ + + *t = mktime (&tm); + if (*t == (time_t)-1) + return (-1); + + return (0); +} + int ptl_parse_ipaddr (__u32 *ipaddrp, char *str) { @@ -220,21 +264,29 @@ ptl_nid2str (char *buffer, ptl_nid_t nid) if (he != NULL) strcpy (buffer, he->h_name); else - sprintf (buffer, "0x"LPX64, nid); + sprintf (buffer, LPX64, nid); return (buffer); } -int g_nal_is_compatible (char *cmd, ...) +int g_nal_is_set () { - va_list ap; - int nal; - if (g_nal == 0) { fprintf (stderr, "Error: you must run the 'network' command first.\n"); return (0); } - + + return (1); +} + +int g_nal_is_compatible (char *cmd, ...) +{ + va_list ap; + int nal; + + if (!g_nal_is_set ()) + return (0); + va_start (ap, cmd); do { @@ -320,7 +372,7 @@ int jt_ptl_network(int argc, char **argv) int nal; if (argc == 2 && - (nal = ptl_name2nal (argv[1])) != 0) { + (nal = ptl_name2nal (argv[1])) >= 0) { g_nal = nal; return (0); } @@ -353,12 +405,14 @@ jt_ptl_print_autoconnects (int argc, char **argv) if (rc != 0) break; - printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s affinity %s share %d\n", + printf (LPX64"@%s:%d #%d buffer %d nonagle %s xchg %s " + "affinity %s eager %s share %d\n", data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer), data.ioc_misc, data.ioc_count, data.ioc_size, (data.ioc_flags & 1) ? "on" : "off", (data.ioc_flags & 2) ? "on" : "off", (data.ioc_flags & 4) ? "on" : "off", + (data.ioc_flags & 8) ? "on" : "off", data.ioc_wait); } @@ -377,10 +431,11 @@ jt_ptl_add_autoconnect (int argc, char **argv) int xchange_nids = 0; int irq_affinity = 0; int share = 0; + int eager = 0; int rc; if (argc < 4 || argc > 5) { - fprintf (stderr, "usage: %s nid ipaddr port [ixs]\n", argv[0]); + fprintf (stderr, "usage: %s nid ipaddr port [ixse]\n", argv[0]); return 0; } @@ -398,8 +453,11 @@ jt_ptl_add_autoconnect (int argc, char **argv) return -1; } - port = atol (argv[3]); - + if (ptl_parse_port (&port, argv[3]) != 0) { + fprintf (stderr, "Can't parse port: %s\n", argv[3]); + return -1; + } + if (argc > 4) { char *opts = argv[4]; @@ -414,6 +472,9 @@ jt_ptl_add_autoconnect (int argc, char **argv) case 's': share = 1; break; + case 'e': + eager = 1; + break; default: fprintf (stderr, "Can't parse options: %s\n", argv[4]); @@ -429,10 +490,11 @@ jt_ptl_add_autoconnect (int argc, char **argv) data.ioc_misc = port; /* only passing one buffer size! */ data.ioc_size = MAX (g_socket_rxmem, g_socket_txmem); - data.ioc_flags = (g_socket_nonagle ? 1 : 0) | - (xchange_nids ? 2 : 0) | - (irq_affinity ? 4 : 0) | - (share ? 8 : 0); + data.ioc_flags = (g_socket_nonagle ? 0x01 : 0) | + (xchange_nids ? 0x02 : 0) | + (irq_affinity ? 0x04 : 0) | + (share ? 0x08 : 0) | + (eager ? 0x10 : 0); rc = l_ioctl (PORTALS_DEV_ID, IOC_PORTAL_NAL_CMD, &data); if (rc != 0) { @@ -532,7 +594,7 @@ jt_ptl_print_connections (int argc, char **argv) if (rc != 0) break; - printf (LPD64"@%s:%d\n", + printf (LPX64"@%s:%d\n", data.ioc_nid, ptl_ipaddr_2_str (data.ioc_id, buffer), data.ioc_misc); @@ -646,7 +708,11 @@ int jt_ptl_connect(int argc, char **argv) return -1; } - port = atol(argv[2]); + if (ptl_parse_port (&port, argv[2]) != 0) { + fprintf (stderr, "Can't parse port: %s\n", argv[2]); + return -1; + } + if (argc > 3) for (flag = argv[3]; *flag != 0; flag++) switch (*flag) @@ -902,11 +968,8 @@ int jt_ptl_ping(int argc, char **argv) return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return -1; - } if (ptl_parse_nid (&nid, argv[1]) != 0) { @@ -957,11 +1020,9 @@ int jt_ptl_shownid(int argc, char **argv) return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command first\n"); + if (!g_nal_is_set()) return -1; - } - + PORTAL_IOC_INIT (data); data.ioc_nal = g_nal; rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_GET_NID, &data); @@ -987,11 +1048,8 @@ int jt_ptl_mynid(int argc, char **argv) return 0; } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return -1; - } if (argc >= 2) nidstr = argv[1]; @@ -1037,11 +1095,8 @@ jt_ptl_fail_nid (int argc, char **argv) return (0); } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return (-1); - } if (!strcmp (argv[1], "_all_")) nid = PTL_NID_ANY; @@ -1120,7 +1175,7 @@ jt_ptl_nagle (int argc, char **argv) if (Parser_bool (&enable, argv[1]) != 0) { fprintf (stderr, "Can't parse boolean %s\n", argv[1]); - return (0); + return (-1); } g_socket_nonagle = !enable; } @@ -1143,11 +1198,8 @@ jt_ptl_add_route (int argc, char **argv) return (0); } - if (g_nal == 0) { - fprintf(stderr, "Error: you must run the 'network' command " - "first.\n"); + if (!g_nal_is_set()) return (-1); - } if (ptl_parse_nid (&gateway_nid, argv[1]) != 0) { @@ -1190,6 +1242,8 @@ jt_ptl_del_route (int argc, char **argv) { struct portal_ioctl_data data; ptl_nid_t nid; + ptl_nid_t nid1 = PTL_NID_ANY; + ptl_nid_t nid2 = PTL_NID_ANY; int rc; if (argc < 2) @@ -1198,14 +1252,43 @@ jt_ptl_del_route (int argc, char **argv) return (0); } + if (!g_nal_is_set()) + return (-1); + if (ptl_parse_nid (&nid, argv[1]) != 0) { - fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); + fprintf (stderr, "Can't parse gateway NID \"%s\"\n", argv[1]); return (-1); } + if (argc >= 3 && + ptl_parse_nid (&nid1, argv[2]) != 0) + { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[2]); + return (-1); + } + + if (argc < 4) { + nid2 = nid1; + } else { + if (ptl_parse_nid (&nid2, argv[3]) != 0) { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[3]); + return (-1); + } + + if (nid1 > nid2) { + ptl_nid_t tmp = nid1; + + nid1 = nid2; + nid2 = tmp; + } + } + PORTAL_IOC_INIT(data); + data.ioc_nal = g_nal; data.ioc_nid = nid; + data.ioc_nid2 = nid1; + data.ioc_nid3 = nid2; rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_DEL_ROUTE, &data); if (rc != 0) @@ -1217,6 +1300,67 @@ jt_ptl_del_route (int argc, char **argv) return (0); } +int +jt_ptl_notify_router (int argc, char **argv) +{ + struct portal_ioctl_data data; + int enable; + ptl_nid_t nid; + int rc; + struct timeval now; + time_t when; + + if (argc < 3) + { + fprintf (stderr, "usage: %s targetNID <up/down> [<time>]\n", + argv[0]); + return (0); + } + + if (ptl_parse_nid (&nid, argv[1]) != 0) + { + fprintf (stderr, "Can't parse target NID \"%s\"\n", argv[1]); + return (-1); + } + + if (Parser_bool (&enable, argv[2]) != 0) { + fprintf (stderr, "Can't parse boolean %s\n", argv[2]); + return (-1); + } + + gettimeofday(&now, NULL); + + if (argc < 4) { + when = now.tv_sec; + } else if (ptl_parse_time (&when, argv[3]) != 0) { + fprintf(stderr, "Can't parse time %s\n" + "Please specify either 'YYYY-MM-DD HH:MM:SS'\n" + "or an absolute unix time in seconds\n", argv[3]); + return (-1); + } else if (when > now.tv_sec) { + fprintf (stderr, "%s specifies a time in the future\n", + argv[3]); + return (-1); + } + + PORTAL_IOC_INIT(data); + data.ioc_nal = g_nal; + data.ioc_nid = nid; + data.ioc_flags = enable; + /* Yeuch; 'cept I need a __u64 on 64 bit machines... */ + data.ioc_nid3 = (__u64)when; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_NOTIFY_ROUTER, &data); + if (rc != 0) + { + fprintf (stderr, "IOC_PORTAL_NOTIFY_ROUTER ("LPX64") failed: %s\n", + nid, strerror (errno)); + return (-1); + } + + return (0); +} + int jt_ptl_print_routes (int argc, char **argv) { @@ -1228,8 +1372,8 @@ jt_ptl_print_routes (int argc, char **argv) ptl_nid_t gateway_nid; ptl_nid_t nid1; ptl_nid_t nid2; - - + int alive; + for (index = 0;;index++) { PORTAL_IOC_INIT(data); @@ -1243,13 +1387,332 @@ jt_ptl_print_routes (int argc, char **argv) gateway_nid = data.ioc_nid; nid1 = data.ioc_nid2; nid2 = data.ioc_nid3; - - printf ("%8s %18s : %s - %s\n", + alive = data.ioc_flags; + + printf ("%8s %18s : %s - %s, %s\n", nal2name (gateway_nal), ptl_nid2str (buffer[0], gateway_nid), ptl_nid2str (buffer[1], nid1), - ptl_nid2str (buffer[2], nid2)); + ptl_nid2str (buffer[2], nid2), + alive ? "up" : "down"); } return (0); } +static int +lwt_control(int enable, int clear) +{ + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + data.ioc_flags = enable; + data.ioc_misc = clear; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_CONTROL, &data); + if (rc == 0) + return (0); + + fprintf(stderr, "IOC_PORTAL_LWT_CONTROL failed: %s\n", + strerror(errno)); + return (-1); +} + +static int +lwt_snapshot(int *ncpu, int *totalsize, lwt_event_t *events, int size) +{ + struct portal_ioctl_data data; + int rc; + + PORTAL_IOC_INIT(data); + data.ioc_pbuf1 = (char *)events; + data.ioc_plen1 = size; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_SNAPSHOT, &data); + if (rc != 0) { + fprintf(stderr, "IOC_PORTAL_LWT_SNAPSHOT failed: %s\n", + strerror(errno)); + return (-1); + } + + LASSERT (data.ioc_count != 0); + LASSERT (data.ioc_misc != 0); + + if (ncpu != NULL) + *ncpu = data.ioc_count; + + if (totalsize != NULL) + *totalsize = data.ioc_misc; + + return (0); +} + +static char * +lwt_get_string(char *kstr) +{ + char *ustr; + struct portal_ioctl_data data; + int size; + int rc; + + /* FIXME: this could maintain a symbol table since we expect to be + * looking up the same strings all the time... */ + + PORTAL_IOC_INIT(data); + data.ioc_pbuf1 = kstr; + data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */ + data.ioc_pbuf2 = NULL; + data.ioc_plen2 = 0; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data); + if (rc != 0) { + fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n", + strerror(errno)); + return (NULL); + } + + size = data.ioc_count; + ustr = (char *)malloc(size); + if (ustr == NULL) { + fprintf(stderr, "Can't allocate string storage of size %d\n", + size); + return (NULL); + } + + PORTAL_IOC_INIT(data); + data.ioc_pbuf1 = kstr; + data.ioc_plen1 = 1; /* non-zero just to fool portal_ioctl_is_invalid() */ + data.ioc_pbuf2 = ustr; + data.ioc_plen2 = size; + + rc = l_ioctl(PORTALS_DEV_ID, IOC_PORTAL_LWT_LOOKUP_STRING, &data); + if (rc != 0) { + fprintf(stderr, "IOC_PORTAL_LWT_LOOKUP_STRING failed: %s\n", + strerror(errno)); + return (NULL); + } + + LASSERT(strlen(ustr) == size - 1); + return (ustr); +} + +static void +lwt_put_string(char *ustr) +{ + free(ustr); +} + +static int +lwt_print(FILE *f, cycles_t t0, cycles_t tlast, double mhz, int cpu, lwt_event_t *e) +{ + char whenstr[32]; + char *where = lwt_get_string(e->lwte_where); + + if (where == NULL) + return (-1); + + sprintf(whenstr, LPD64, e->lwte_when - t0); + + fprintf(f, "%#010lx %#010lx %#010lx %#010lx: %#010lx %1d %10.6f %10.2f %s\n", + e->lwte_p1, e->lwte_p2, e->lwte_p3, e->lwte_p4, + (long)e->lwte_task, cpu, (e->lwte_when - t0) / (mhz * 1000000.0), + (t0 == e->lwte_when) ? 0.0 : (e->lwte_when - tlast) / mhz, + where); + + lwt_put_string(where); + + return (0); +} + +double +get_cycles_per_usec () +{ + FILE *f = fopen ("/proc/cpuinfo", "r"); + double mhz; + char line[64]; + + if (f != NULL) { + while (fgets (line, sizeof (line), f) != NULL) + if (sscanf (line, "cpu MHz : %lf", &mhz) == 1) { + fclose (f); + return (mhz); + } + fclose (f); + } + + fprintf (stderr, "Can't read/parse /proc/cpuinfo\n"); + return (1000.0); +} + +int +jt_ptl_lwt(int argc, char **argv) +{ +#define MAX_CPUS 8 + int ncpus; + int totalspace; + int nevents_per_cpu; + lwt_event_t *events; + lwt_event_t *cpu_event[MAX_CPUS + 1]; + lwt_event_t *next_event[MAX_CPUS]; + lwt_event_t *first_event[MAX_CPUS]; + int cpu; + lwt_event_t *e; + int rc; + int i; + double mhz; + cycles_t t0; + cycles_t tlast; + FILE *f = stdout; + + if (argc < 2 || + (strcmp(argv[1], "start") && + strcmp(argv[1], "stop"))) { + fprintf(stderr, + "usage: %s start\n" + " %s stop [fname]\n", argv[0], argv[0]); + return (-1); + } + + if (!strcmp(argv[1], "start")) { + /* disable */ + if (lwt_control(0, 0) != 0) + return (-1); + + /* clear */ + if (lwt_control(0, 1) != 0) + return (-1); + + /* enable */ + if (lwt_control(1, 0) != 0) + return (-1); + + return (0); + } + + if (lwt_snapshot(&ncpus, &totalspace, NULL, 0) != 0) + return (-1); + + if (ncpus > MAX_CPUS) { + fprintf(stderr, "Too many cpus: %d (%d)\n", ncpus, MAX_CPUS); + return (-1); + } + + events = (lwt_event_t *)malloc(totalspace); + if (events == NULL) { + fprintf(stderr, "Can't allocate %d\n", totalspace); + return (-1); + } + + if (lwt_control(0, 0) != 0) { /* disable */ + free(events); + return (-1); + } + + if (lwt_snapshot(NULL, NULL, events, totalspace)) { + free(events); + return (-1); + } + + if (argc > 2) { + f = fopen (argv[2], "w"); + if (f == NULL) { + fprintf(stderr, "Can't open %s for writing: %s\n", argv[2], strerror (errno)); + free(events); + return (-1); + } + } + + mhz = get_cycles_per_usec(); + + /* carve events into per-cpu slices */ + nevents_per_cpu = totalspace / (ncpus * sizeof(lwt_event_t)); + for (cpu = 0; cpu <= ncpus; cpu++) + cpu_event[cpu] = &events[cpu * nevents_per_cpu]; + + /* find the earliest event on each cpu */ + for (cpu = 0; cpu < ncpus; cpu++) { + first_event[cpu] = NULL; + + for (e = cpu_event[cpu]; e < cpu_event[cpu + 1]; e++) { + + if (e->lwte_where == NULL) /* not an event */ + continue; + + if (first_event[cpu] == NULL || + first_event[cpu]->lwte_when > e->lwte_when) + first_event[cpu] = e; + } + + next_event[cpu] = first_event[cpu]; + } + + t0 = tlast = 0; + for (cpu = 0; cpu < ncpus; cpu++) { + e = first_event[cpu]; + if (e == NULL) /* no events this cpu */ + continue; + + if (e == cpu_event[cpu]) + e = cpu_event[cpu + 1] - 1; + else + e = e - 1; + + /* If there's an event immediately before the first one, this + * cpu wrapped its event buffer */ + if (e->lwte_where == NULL) + continue; + + /* We should only start outputting events from the most recent + * first event in any wrapped cpu. Events before this time on + * other cpus won't have any events from this CPU to interleave + * with. */ + if (t0 < first_event[cpu]->lwte_when) + t0 = first_event[cpu]->lwte_when; + } + + for (;;) { + /* find which cpu has the next event */ + cpu = -1; + for (i = 0; i < ncpus; i++) { + + if (next_event[i] == NULL) /* this cpu exhausted */ + continue; + + if (cpu < 0 || + next_event[i]->lwte_when < next_event[cpu]->lwte_when) + cpu = i; + } + + if (cpu < 0) /* all cpus exhausted */ + break; + + if (t0 == 0) { + /* no wrapped cpus and this is he first ever event */ + t0 = next_event[cpu]->lwte_when; + } + + if (t0 <= next_event[cpu]->lwte_when) { + /* on or after the first event */ + rc = lwt_print(f, t0, tlast, mhz, cpu, next_event[cpu]); + if (rc != 0) + break; + } + + tlast = next_event[cpu]->lwte_when; + + next_event[cpu]++; + if (next_event[cpu] == cpu_event[cpu + 1]) + next_event[cpu] = cpu_event[cpu]; + + if (next_event[cpu]->lwte_where == NULL || + next_event[cpu] == first_event[cpu]) + next_event[cpu] = NULL; + } + + if (f != stdout) + fclose(f); + + free(events); + return (0); +#undef MAX_CPUS +} diff --git a/lustre/portals/utils/ptlctl.c b/lustre/portals/utils/ptlctl.c index c083e483366f1dddedd3e30f97d7a02c963604ab..1a8e637291ee307f49aa58533029d5da234f7f51 100644 --- a/lustre/portals/utils/ptlctl.c +++ b/lustre/portals/utils/ptlctl.c @@ -31,7 +31,7 @@ command_t list[] = { {"network", jt_ptl_network, 0,"setup the NAL (args: nal name)"}, {"print_autoconns", jt_ptl_print_autoconnects, 0, "print autoconnect entries (no args)"}, - {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixs])"}, + {"add_autoconn", jt_ptl_add_autoconnect, 0, "add autoconnect entry (args: nid host [ixse])"}, {"del_autoconn", jt_ptl_del_autoconnect, 0, "delete autoconnect entry (args: [nid] [host] [ks])"}, {"print_conns", jt_ptl_print_connections, 0, "print connections (no args)"}, {"connect", jt_ptl_connect, 0, "connect to a remote nid (args: host port [xi])"}, @@ -41,8 +41,12 @@ command_t list[] = { {"ping", jt_ptl_ping, 0, "do a ping test (args: nid [count] [size] [timeout])"}, {"shownid", jt_ptl_shownid, 0, "print the local NID"}, {"mynid", jt_ptl_mynid, 0, "inform the socknal of the local NID (args: [hostname])"}, - {"add_route", jt_ptl_add_route, 0, "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, - {"del_route", jt_ptl_del_route, 0, "delete an entry from the routing table (args: targetNID"}, + {"add_route", jt_ptl_add_route, 0, + "add an entry to the routing table (args: gatewayNID targetNID [targetNID])"}, + {"del_route", jt_ptl_del_route, 0, + "delete all routes via a gateway from the routing table (args: gatewayNID"}, + {"set_route", jt_ptl_notify_router, 0, + "enable/disable a route in the routing table (args: gatewayNID up/down [time]"}, {"print_routes", jt_ptl_print_routes, 0, "print the routing table (args: none)"}, {"recv_mem", jt_ptl_rxmem, 0, "Set socket receive buffer size (args: [size])"}, {"send_mem", jt_ptl_txmem, 0, "Set socket send buffer size (args: [size])"}, diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 830ab568f7aafe9b560454a31c0100c1745309ee..00ea58a1f9813ef1fe927666768f20e5bccda01e 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -145,7 +145,6 @@ def debug(*args): msg = string.join(map(str,args)) print msg - # ack, python's builtin int() does not support '0x123' syntax. # eval can do it, although what a hack! def my_int(s): @@ -436,8 +435,8 @@ class LCTLInterface: cmds = """ ignore_errors network %s - del_route %s - quit """ % (net, lo) + del_route %s %s %s + quit """ % (net, gw, lo, hi) self.run(cmds) # add a route to a host @@ -461,8 +460,8 @@ class LCTLInterface: ignore_errors network %s del_uuid %s - del_route %s - quit """ % (net, uuid, tgt) + del_route %s %s + quit """ % (net, uuid, gw, tgt) self.run(cmds) # disconnect one connection @@ -1002,16 +1001,7 @@ class Network(Module): gw = Network(net) if (gw.cluster_id == self.cluster_id and gw.net_type == self.net_type): - # hack: compare as numbers if possible, this should all - # go away once autoconnect is done. - # This also conveniently prevents us from connecting to ourself. - try: - gw_nid = my_int(gw.nid) - self_nid = my_int(self.nid) - except ValueError, e: - gw_nid = gw.nid - self_nid = self.nid - if gw_nid != self_nid: + if gw.nid != self.nid: lctl.connect(gw) def disconnect_peer_gateways(self): @@ -1022,16 +1012,7 @@ class Network(Module): gw = Network(net) if (gw.cluster_id == self.cluster_id and gw.net_type == self.net_type): - # hack: compare as numbers if possible, this should all - # go away once autoconnect is done. - # This also conveniently prevents us from connecting to ourself. - try: - gw_nid = my_int(gw.nid) - self_nid = my_int(self.nid) - except ValueError, e: - gw_nid = gw.nid - self_nid = self.nid - if gw_nid != self_nid: + if gw.nid != self.nid: try: lctl.disconnect(gw) except CommandError, e: @@ -1048,33 +1029,51 @@ class Network(Module): stop_acceptor(self.port) if node_is_router(): self.disconnect_peer_gateways() -# -# This commented out so connections not created by this -# config are not disturbed -# -# try: -# lctl.disconnectAll(self.net_type) -# except CommandError, e: -# print "disconnectAll failed: ", self.name -# e.dump() -# cleanup_error(e.rc) + + try: + lctl.disconnectAll(self.net_type) + except CommandError, e: + print "disconnectAll failed: ", self.name + e.dump() + cleanup_error(e.rc) class RouteTable(Module): def __init__(self,db): Module.__init__(self, 'ROUTES', db) + + def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi): + # only setup connections for tcp NALs + srvdb = None + if not net_type in ('tcp', 'toe'): + return None + + # connect to target if route is to single node and this node is the gw + if lo == hi and local_interface(net_type, gw_cluster_id, gw): + if not local_cluster(net_type, tgt_cluster_id): + panic("target", lo, " not on the local cluster") + srvdb = self.db.nid2server(lo, net_type) + # connect to gateway if this node is not the gw + elif (local_cluster(net_type, gw_cluster_id) + and not local_interface(net_type, gw_cluster_id, gw)): + srvdb = self.db.nid2server(gw, net_type) + else: + return None + + if not srvdb: + panic("no server for nid", lo) + return None + + return Network(srvdb) + def prepare(self): if is_network_prepared(): return self.info() for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl(): lctl.add_route(net_type, gw, lo, hi) - if net_type in ('tcp', 'toe') and local_net_type(net_type, tgt_cluster_id) and lo == hi: - srvdb = self.db.nid2server(lo, net_type) - if not srvdb: - panic("no server for nid", lo) - else: - srv = Network(srvdb) - lctl.connect(srv) + srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi) + if srv: + lctl.connect(srv) def safe_to_clean(self): return not is_network_prepared() @@ -1084,18 +1083,15 @@ class RouteTable(Module): # the network is still being used, don't clean it up return for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl(): - if net_type in ('tcp', 'toe') and local_net_type(net_type, tgt_cluster_id) and lo == hi: - srvdb = self.db.nid2server(lo, net_type) - if not srvdb: - panic("no server for nid", lo) - else: - srv = Network(srvdb) - try: - lctl.disconnect(srv) - except CommandError, e: - print "disconnect failed: ", self.name - e.dump() - cleanup_error(e.rc) + srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi) + if srv: + try: + lctl.disconnect(srv) + except CommandError, e: + print "disconnect failed: ", self.name + e.dump() + cleanup_error(e.rc) + try: lctl.del_route(net_type, gw, lo, hi) except CommandError, e: @@ -1611,11 +1607,13 @@ class ECHO_CLIENT(Module): self.add_lustre_module('obdecho', 'obdecho') self.obd_uuid = self.db.get_first_ref('obd') obd = self.db.lookup(self.obd_uuid) + self.uuid = generate_client_uuid(self.name) self.osc = VOSC(obd, self.uuid, self.name) def prepare(self): if is_prepared(self.name): return + run_acceptors() self.osc.prepare() # XXX This is so cheating. -p self.info(self.obd_uuid) @@ -1670,6 +1668,7 @@ class Mountpoint(Module): if fs_is_mounted(self.path): log(self.path, "already mounted.") return + run_acceptors() if self.mgmtcli: self.mgmtcli.prepare() self.vosc.prepare() @@ -1810,7 +1809,7 @@ def get_mdc(db, uuid, fs_name, mds_uuid): ############################################################ # routing ("rooting") -# list of (nettype, cluster_id) +# list of (nettype, cluster_id, nid) local_clusters = [] def find_local_clusters(node_db): @@ -1819,7 +1818,7 @@ def find_local_clusters(node_db): net = node_db.lookup(netuuid) srv = Network(net) debug("add_local", netuuid) - local_clusters.append((srv.net_type, srv.cluster_id)) + local_clusters.append((srv.net_type, srv.cluster_id, srv.nid)) if srv.port > 0: if acceptors.has_key(srv.port): panic("duplicate port:", srv.port) @@ -1855,7 +1854,7 @@ def find_local_routes(lustre): for router in list: if router.get_val_int('router', 0): needs_router = 1 - for (local_type, local_cluster_id) in local_clusters: + for (local_type, local_cluster_id, local_nid) in local_clusters: gw = None for netuuid in router.get_networks(): db = router.lookup(netuuid) @@ -1872,21 +1871,28 @@ def find_local_routes(lustre): def choose_local_server(srv_list): for srv in srv_list: - if local_net_type(srv.net_type, srv.cluster_id): + if local_cluster(srv.net_type, srv.cluster_id): return srv -def local_net_type(net_type, cluster_id): +def local_cluster(net_type, cluster_id): for cluster in local_clusters: if net_type == cluster[0] and cluster_id == cluster[1]: return 1 return 0 +def local_interface(net_type, cluster_id, nid): + for cluster in local_clusters: + if (net_type == cluster[0] and cluster_id == cluster[1] + and nid == cluster[2]): + return 1 + return 0 + def find_route(srv_list): frm_type = local_clusters[0][0] for srv in srv_list: - debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type) + debug("find_route: srv:", srv.nid, "type: ", srv.net_type) to_type = srv.net_type - to = srv.hostaddr # XXX should this be hostaddr, or nid? + to = srv.nid cluster_id = srv.cluster_id debug ('looking for route to', to_type, to) for r in local_routes: diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index a90eeee5256b7012289b422bb95ff91909f5187b..2c5852b905001b37cf71733c47b8fb1e3d57bfb1 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -63,12 +63,14 @@ command_t cmdlist[] = { /* Network configuration commands */ {"==== network config ====", jt_noop, 0, "network config"}, + {"--net", jt_opt_net, 0, "run <command> after setting network to <net>\n" + "usage: --net <tcp/elan/myrinet/scimac> <command>"}, {"network", jt_ptl_network, 0, "commands that follow apply to net\n" "usage: network <tcp/elan/myrinet/scimac>"}, {"autoconn_list", jt_ptl_print_autoconnects, 0, "print autoconnect entries\n" "usage: print_autoconns"}, {"add_autoconn", jt_ptl_add_autoconnect, 0, "add an autoconnect entry\n" - "usage: add_autoconn <nid> <host> <port> [ixs]"}, + "usage: add_autoconn <nid> <host> <port> [ixse]"}, {"del_autoconn", jt_ptl_del_autoconnect, 0, "remove an autoconnect entry\n" "usage: del_autoconn [<nid>] [<host>] [ks]"}, {"conn_list", jt_ptl_print_connections, 0, "connect to a remote nid\n" @@ -92,12 +94,15 @@ command_t cmdlist[] = { {"del_uuid", jt_obd_del_uuid, 0, "delete a UUID association\n" "usage: del_uuid <uuid>"}, {"add_route", jt_ptl_add_route, 0, - "add an entry to the routing table\n" - "usage: add_route <gateway> <target> [target]"}, + "add an entry to the portals routing table\n" + "usage: add_route <gateway> <target> [<target>]"}, {"del_route", jt_ptl_del_route, 0, - "delete an entry from the routing table\n" - "usage: del_route <target>"}, - {"route_list", jt_ptl_print_routes, 0, "print the routing table\n" + "delete the route via the given gateway to the given targets from the portals routing table\n" + "usage: del_route <gateway> [<target>] [<target>]"}, + {"set_route", jt_ptl_notify_router, 0, + "enable/disable routes via the given gateway in the portals routing table\n" + "usage: set_gw <gateway> <up/down> [<time>]"}, + {"route_list", jt_ptl_print_routes, 0, "print the portals routing table\n" "usage: route_list"}, {"recv_mem", jt_ptl_rxmem, 0, "set socket receive buffer size, " "if size is omited the current size is reported.\n" @@ -236,7 +241,11 @@ command_t cmdlist[] = { "usage: modules <path>"}, {"panic", jt_dbg_panic, 0, "force the kernel to panic\n" "usage: panic"}, - + {"lwt", jt_ptl_lwt, 0, + "light-weight tracing\n" + "usage: lwt start\n" + " lwt stop [file]"}, + /* User interface commands */ {"======= control ========", jt_noop, 0, "control commands"}, {"help", Parser_help, 0, "help"}, diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c index 70cd5bfd8ba0c33caa3d2a4decf756889c0dc9a2..754de0d634add0e13642f676aece7f2df2016708 100644 --- a/lustre/utils/obd.c +++ b/lustre/utils/obd.c @@ -677,6 +677,25 @@ int jt_opt_threads(int argc, char **argv) return rc; } +int jt_opt_net(int argc, char **argv) +{ + char *arg2[3]; + int rc; + + if (argc < 3) + return CMD_HELP; + + arg2[0] = argv[0]; + arg2[1] = argv[1]; + arg2[2] = NULL; + rc = jt_ptl_network (2, arg2); + + if (!rc) + rc = Parser_execarg(argc - 2, argv + 2, cmdlist); + + return rc; +} + int jt_obd_detach(int argc, char **argv) { struct obd_ioctl_data data; @@ -1960,7 +1979,7 @@ int jt_obd_add_uuid(int argc, char **argv) nal = ptl_name2nal(argv[3]); - if (nal == 0) { + if (nal <= 0) { fprintf (stderr, "Can't parse NAL %s\n", argv[3]); return -1; } @@ -1980,7 +1999,7 @@ int jt_obd_close_uuid(int argc, char **argv) nal = ptl_name2nal(argv[2]); - if (nal == 0) { + if (nal <= 0) { fprintf (stderr, "Can't parse NAL %s\n", argv[2]); return -1; } diff --git a/lustre/utils/obdctl.h b/lustre/utils/obdctl.h index 0203579c9a9b3ba37090c419a6b309976e301a88..40614309900aeea787cc3ec360bbd37308607b3a 100644 --- a/lustre/utils/obdctl.h +++ b/lustre/utils/obdctl.h @@ -32,6 +32,7 @@ void obd_cleanup(int argc, char **argv); int jt_opt_device(int argc, char **argv); int jt_opt_threads(int argc, char **argv); +int jt_opt_net(int argc, char **argv); int jt_obd_device(int argc, char **argv); int jt_obd_connect(int argc, char **argv); diff --git a/lustre/utils/parser.c b/lustre/utils/parser.c index fef987b81f04b8b0233308aa89cfe2df415bc8ae..38752f63c9bb1433944a531dc602be8252ceabea 100644 --- a/lustre/utils/parser.c +++ b/lustre/utils/parser.c @@ -698,6 +698,7 @@ int Parser_bool (int *b, char *str) { if (!strcasecmp (str, "no") || !strcasecmp (str, "n") || !strcasecmp (str, "off") || + !strcasecmp (str, "down") || !strcasecmp (str, "disable")) { *b = 0; @@ -707,6 +708,7 @@ int Parser_bool (int *b, char *str) { if (!strcasecmp (str, "yes") || !strcasecmp (str, "y") || !strcasecmp (str, "on") || + !strcasecmp (str, "up") || !strcasecmp (str, "enable")) { *b = 1;