Skip to content
Snippets Groups Projects
Commit 2de13382 authored by Andreas Dilger
Browse files

Make server bulk RPC timeouts shorter than the client timeouts, so we don't
have cascading failures.  Server bulk timeout is 1/4 of the client timeout.
Also fix /proc variables to be int, as that is what the functions expect.
b=1845
parent 935498e0
No related branches found
No related tags found
No related merge requests found
......@@ -36,11 +36,11 @@
/* global variables */
extern atomic_t obd_memory;
extern int obd_memmax;
extern unsigned long obd_fail_loc;
extern unsigned long obd_timeout;
extern unsigned int obd_fail_loc;
extern unsigned int obd_timeout;
extern unsigned long obd_max_dirty_pages;
extern char obd_lustre_upcall[128];
extern unsigned long obd_sync_filter;
extern unsigned int obd_sync_filter;
#define OBD_FAIL_MDS 0x100
#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101
......
......@@ -114,7 +114,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
GOTO(cleanup_buf, rc);
}
lwi = LWI_TIMEOUT(obd_timeout * HZ, mds_bulk_timeout, desc);
lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, mds_bulk_timeout, desc);
rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete (desc), &lwi);
if (rc) {
LASSERT (rc == -ETIMEDOUT);
......
......@@ -77,11 +77,10 @@ struct lprocfs_vars lprocfs_version[] = {{"version", obd_proc_read_version, NULL
int proc_version;
/* The following are visible and mutable through /proc/sys/lustre/. */
unsigned long obd_fail_loc;
unsigned long obd_timeout = 100;
unsigned long obd_bulk_timeout = 1;
unsigned int obd_fail_loc;
unsigned int obd_timeout = 100;
char obd_lustre_upcall[128] = "/usr/lib/lustre/lustre_upcall";
unsigned long obd_sync_filter; /* = 0, don't sync by default */
unsigned int obd_sync_filter; /* = 0, don't sync by default */
#ifdef __KERNEL__
/* opening /dev/obd */
......
......@@ -412,6 +412,9 @@ static int ost_brw_read(struct ptlrpc_request *req)
if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK))
GOTO(out, rc = -EIO);
OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE,
(obd_timeout + 1) / 4);
body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body);
if (body == NULL) {
CERROR("Missing/short ost_body\n");
......@@ -494,17 +497,17 @@ static int ost_brw_read(struct ptlrpc_request *req)
if (rc == 0) {
rc = ptlrpc_bulk_put(desc);
if (rc == 0) {
lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout,
desc);
lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
ost_bulk_timeout, desc);
rc = l_wait_event(desc->bd_waitq,
ptlrpc_bulk_complete(desc), &lwi);
if (rc) {
LASSERT(rc == -ETIMEDOUT);
CERROR ("timeout waiting for bulk PUT\n");
DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
ptlrpc_abort_bulk(desc);
}
} else {
CERROR("ptlrpc_bulk_put failed RC: %d\n", rc);
DEBUG_REQ(D_ERROR, req, "bulk PUT failed: rc %d\n", rc);
}
comms_error = rc != 0;
}
......@@ -574,7 +577,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
/* pause before transaction has been started */
OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE,
obd_timeout +1);
(obd_timeout + 1) / 4);
swab = lustre_msg_swabbed(req->rq_reqmsg);
body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body);
......@@ -654,17 +657,17 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
if (rc == 0) {
rc = ptlrpc_bulk_get(desc);
if (rc == 0) {
lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout,
desc);
lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
ost_bulk_timeout, desc);
rc = l_wait_event(desc->bd_waitq,
ptlrpc_bulk_complete(desc), &lwi);
if (rc) {
LASSERT(rc == -ETIMEDOUT);
CERROR("timeout waiting for bulk GET\n");
DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
ptlrpc_abort_bulk(desc);
}
} else {
CERROR("ptlrpc_bulk_get failed RC: %d\n", rc);
DEBUG_REQ(D_ERROR, req, "bulk GET failed: rc %d\n", rc);
}
comms_error = rc != 0;
}
......
......@@ -275,7 +275,7 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index,
GOTO(out_reply, rc);
}
lwi = LWI_TIMEOUT(obd_timeout * HZ, NULL, desc);
lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, desc);
rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi);
if (rc != 0) {
LASSERT(rc == -ETIMEDOUT);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment