From 2de13382b974b8130d49d81888854c17db67b8a4 Mon Sep 17 00:00:00 2001
From: adilger <adilger>
Date: Fri, 12 Sep 2003 10:43:36 +0000
Subject: [PATCH] Make server bulk RPC timeouts shorter than the client
 timeouts, so we don't have cascading failures.  Server bulk timeout is 1/4 of
 the client timeout. Also fix /proc variables to be int, as that is what the
 functions expect. b=1845

---
 lustre/include/linux/obd_support.h |  6 +++---
 lustre/mds/handler.c               |  2 +-
 lustre/obdclass/class_obd.c        |  7 +++----
 lustre/ost/ost_handler.c           | 21 ++++++++++++---------
 lustre/ptlbd/rpc.c                 |  2 +-
 5 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h
index 28a9a3d2f6..b00de376f2 100644
--- a/lustre/include/linux/obd_support.h
+++ b/lustre/include/linux/obd_support.h
@@ -36,11 +36,11 @@
 /* global variables */
 extern atomic_t obd_memory;
 extern int obd_memmax;
-extern unsigned long obd_fail_loc;
-extern unsigned long obd_timeout;
+extern unsigned int obd_fail_loc;
+extern unsigned int obd_timeout;
 extern unsigned long obd_max_dirty_pages;
 extern char obd_lustre_upcall[128];
-extern unsigned long obd_sync_filter;
+extern unsigned int obd_sync_filter;
 
 #define OBD_FAIL_MDS                     0x100
 #define OBD_FAIL_MDS_HANDLE_UNPACK       0x101
diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c
index 364cf841e3..3c5337e6d1 100644
--- a/lustre/mds/handler.c
+++ b/lustre/mds/handler.c
@@ -114,7 +114,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file,
                 GOTO(cleanup_buf, rc);
         }
 
-        lwi = LWI_TIMEOUT(obd_timeout * HZ, mds_bulk_timeout, desc);
+        lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, mds_bulk_timeout, desc);
         rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete (desc), &lwi);
         if (rc) {
                 LASSERT (rc == -ETIMEDOUT);
diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c
index 2efee5b7e3..7ee897e18f 100644
--- a/lustre/obdclass/class_obd.c
+++ b/lustre/obdclass/class_obd.c
@@ -77,11 +77,10 @@ struct lprocfs_vars lprocfs_version[] = {{"version", obd_proc_read_version, NULL
 int proc_version;
 
 /* The following are visible and mutable through /proc/sys/lustre/. */
-unsigned long obd_fail_loc;
-unsigned long obd_timeout = 100;
-unsigned long obd_bulk_timeout = 1;
+unsigned int obd_fail_loc;
+unsigned int obd_timeout = 100;
 char obd_lustre_upcall[128] = "/usr/lib/lustre/lustre_upcall";
-unsigned long obd_sync_filter; /* = 0, don't sync by default */
+unsigned int obd_sync_filter; /* = 0, don't sync by default */
 
 #ifdef __KERNEL__
 /*  opening /dev/obd */
diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c
index 20db4866f8..5f3575806b 100644
--- a/lustre/ost/ost_handler.c
+++ b/lustre/ost/ost_handler.c
@@ -412,6 +412,9 @@ static int ost_brw_read(struct ptlrpc_request *req)
         if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK))
                 GOTO(out, rc = -EIO);
 
+        OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE,
+                         (obd_timeout + 1) / 4);
+
         body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body);
         if (body == NULL) {
                 CERROR("Missing/short ost_body\n");
@@ -494,17 +497,17 @@ static int ost_brw_read(struct ptlrpc_request *req)
         if (rc == 0) {
                 rc = ptlrpc_bulk_put(desc);
                 if (rc == 0) {
-                        lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout,
-                                          desc);
+                        lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
+                                          ost_bulk_timeout, desc);
                         rc = l_wait_event(desc->bd_waitq,
                                           ptlrpc_bulk_complete(desc), &lwi);
                         if (rc) {
                                 LASSERT(rc == -ETIMEDOUT);
-                                CERROR ("timeout waiting for bulk PUT\n");
+                                DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
                                 ptlrpc_abort_bulk(desc);
                         }
                 } else {
-                        CERROR("ptlrpc_bulk_put failed RC: %d\n", rc);
+                        DEBUG_REQ(D_ERROR, req, "bulk PUT failed: rc %d\n", rc);
 		}
 		comms_error = rc != 0;
         }
@@ -574,7 +577,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
 
         /* pause before transaction has been started */
         OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE,
-                         obd_timeout +1);
+                         (obd_timeout + 1) / 4);
 
         swab = lustre_msg_swabbed(req->rq_reqmsg);
         body = lustre_swab_reqbuf(req, 0, sizeof(*body), lustre_swab_ost_body);
@@ -654,17 +657,17 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         if (rc == 0) {
                 rc = ptlrpc_bulk_get(desc);
                 if (rc == 0) {
-                        lwi = LWI_TIMEOUT(obd_timeout * HZ, ost_bulk_timeout,
-                                          desc);
+                        lwi = LWI_TIMEOUT(obd_timeout * HZ / 4,
+                                          ost_bulk_timeout, desc);
                         rc = l_wait_event(desc->bd_waitq,
                                           ptlrpc_bulk_complete(desc), &lwi);
                         if (rc) {
                                 LASSERT(rc == -ETIMEDOUT);
-                                CERROR("timeout waiting for bulk GET\n");
+                                DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
                                 ptlrpc_abort_bulk(desc);
                         }
                 } else {
-			CERROR("ptlrpc_bulk_get failed RC: %d\n", rc);
+			DEBUG_REQ(D_ERROR, req, "bulk GET failed: rc %d\n", rc);
 		}
 		comms_error = rc != 0;
         }
diff --git a/lustre/ptlbd/rpc.c b/lustre/ptlbd/rpc.c
index 9829900dce..f817802e6d 100644
--- a/lustre/ptlbd/rpc.c
+++ b/lustre/ptlbd/rpc.c
@@ -275,7 +275,7 @@ int ptlbd_srv_rw_req(ptlbd_cmd_t cmd, __u16 index,
                 GOTO(out_reply, rc);
         }
 
-        lwi = LWI_TIMEOUT(obd_timeout * HZ, NULL, desc);
+        lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, desc);
         rc = l_wait_event(desc->bd_waitq, ptlrpc_bulk_complete(desc), &lwi);
         if (rc != 0) {
                 LASSERT(rc == -ETIMEDOUT);
-- 
GitLab