From 2de13382b974b8130d49d81888854c17db67b8a4 Mon Sep 17 00:00:00 2001 From: adilger <adilger> Date: Fri, 12 Sep 2003 10:43:36 +0000 Subject: [PATCH] Make server bulk RPC timeouts shorter than the client timeouts, so we don't have cascading failures. Server bulk timeout is 1/4 of the client timeout. Also fix /proc variables to be int, as that is what the functions expect. b=1845 --- lustre/include/linux/obd_support.h | 6 +++--- lustre/mds/handler.c | 2 +- lustre/obdclass/class_obd.c | 7 +++---- lustre/ost/ost_handler.c | 21 ++++++++++++--------- lustre/ptlbd/rpc.c | 2 +- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 28a9a3d2f6..b00de376f2 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -36,11 +36,11 @@ /* global variables */ extern atomic_t obd_memory; extern int obd_memmax; -extern unsigned long obd_fail_loc; -extern unsigned long obd_timeout; +extern unsigned int obd_fail_loc; +extern unsigned int obd_timeout; extern unsigned long obd_max_dirty_pages; extern char obd_lustre_upcall[128]; -extern unsigned long obd_sync_filter; +extern unsigned int obd_sync_filter; #define OBD_FAIL_MDS 0x100 #define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 364cf841e3..3c5337e6d1 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -114,7 +114,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, GOTO(cleanup_buf, rc); } - lwi = LWI_TIMEOUT(obd_timeout * HZ, mds_bulk_timeout, desc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, mds_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete (desc), &lwi); if (rc) { LASSERT (rc == -ETIMEDOUT); diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 2efee5b7e3..7ee897e18f 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -77,11 +77,10 @@ struct lprocfs_vars lprocfs_version[] = {{"version", obd_proc_read_version, NULL int proc_version; /* The following are visible and mutable through /proc/sys/lustre/. */ -unsigned long obd_fail_loc; -unsigned long obd_timeout = 100; -unsigned long obd_bulk_timeout = 1; +unsigned int obd_fail_loc; +unsigned int obd_timeout = 100; char obd_lustre_upcall[128] = "/usr/lib/lustre/lustre_upcall"; -unsigned long obd_sync_filter; /* = 0, don't sync by default */ +unsigned int obd_sync_filter; /* = 0, don't sync by default */ #ifdef __KERNEL__ /* opening /dev/obd */ diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 20db4866f8..5f3575806b 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -412,6 +412,9 @@ static int ost_brw_read(struct ptlrpc_request *req) if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK)) GOTO(out, rc = -EIO); + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE, + (obd_timeout + 1) / 4); + body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body); if (body == NULL) { CERROR("Missing/short ost_body\n"); @@ -494,17 +497,17 @@ static int ost_brw_read(struct ptlrpc_request *req) if (rc == 0) { rc = ptlrpc_bulk_put(desc); if (rc == 0) { - lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, - desc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, + ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi); if (rc) { LASSERT(rc == -ETIMEDOUT); - CERROR ("timeout waiting for bulk PUT\n"); + DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT"); ptlrpc_abort_bulk(desc); } } else { - CERROR("ptlrpc_bulk_put failed RC: %d\n", rc); + DEBUG_REQ(D_ERROR, req, "bulk PUT failed: rc %d\n", rc); } comms_error = rc != 0; } @@ -574,7 +577,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) /* pause before transaction has been started */ OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE, - obd_timeout +1); + (obd_timeout + 1) / 4); swab = lustre_msg_swabbed(req->rq_reqmsg); body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body); @@ -654,17 +657,17 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) if (rc == 0) { rc = ptlrpc_bulk_get(desc); if (rc == 0) { - lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout, - desc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, + ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi); if (rc) { LASSERT(rc == -ETIMEDOUT); - CERROR("timeout waiting for bulk GET\n"); + DEBUG_REQ(D_ERROR, req, "timeout on bulk GET"); ptlrpc_abort_bulk(desc); } } else { - CERROR("ptlrpc_bulk_get failed RC: %d\n", rc); + DEBUG_REQ(D_ERROR, req, "bulk GET failed: rc %d\n", rc); } comms_error = rc != 0; } diff --git a/lustre/ptlbd/rpc.c b/lustre/ptlbd/rpc.c index 9829900dce..f817802e6d 100644 --- a/lustre/ptlbd/rpc.c +++ b/lustre/ptlbd/rpc.c @@ -275,7 +275,7 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index, GOTO(out_reply, rc); } - lwi = LWI_TIMEOUT(obd_timeout * HZ, NULL, desc); + lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, desc); rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi); if (rc != 0) { LASSERT(rc == -ETIMEDOUT); -- GitLab