From dab8a7621beb2240db9c1883ff741b1c5584e09b Mon Sep 17 00:00:00 2001
From: johann <johann>
Date: Thu, 3 Jul 2008 07:31:29 +0000
Subject: [PATCH] Branch b1_6 b=15950 i=wangdi i=shadow

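The direct IO path doesn't call osc_check_rpcs() to submit a new RPC once
one has completed. As a result, some RPCs stay stuck in the queue and are
never sent, leaving threads hung in invalidate_inode_pages2_range().
Merge brw_interpret() and brw_interpret_oap() into a single brw_interpret()
so that both the direct IO and the async page paths wake cache waiters and
call osc_check_rpcs() on completion.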
---
 lustre/ChangeLog             | 23 +++++++++-----
 lustre/include/obd_support.h |  1 +
 lustre/osc/osc_request.c     | 61 +++++++++++++-----------------------
 lustre/tests/sanity.sh       | 25 +++++++++++++++
 4 files changed, 62 insertions(+), 48 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 922e531136..8b6c22b72e 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -18,10 +18,10 @@ tbd Sun Microsystems, Inc.
 	  removed cwd "./" (refer to Bugzilla 14399).
 	* A new quota file format has been introduced in 1.6.5.
 	  The format conversion from prior releases is handled transparently,
-	  but releases older than 1.4.12/1.6.5 will not understand this new
-	  format.  The automatic format conversion can be avoided by running
+	  but releases older than 1.4.12/1.6.5 don't understand this new
+	  format. The automatic format conversion can be avoided by running
 	  the following command on the MDS:
-	  	'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'.
+		'tunefs.lustre --param="mdt.quota_type=ug1" $MDTDEV'.
 	  For more information, please refer to bugzilla 13904.
 
 Severity   : enhancement
@@ -46,7 +46,7 @@ Severity   : major
 Bugzilla   : 15924
 Description: do not process already freed flock
 Details    : flock can possibly be freed by another thread before it reaches
-             to ldlm_flock_completion_ast.
+	     to ldlm_flock_completion_ast.
 
 Severity   : normal
 Bugzilla   : 14480
@@ -71,7 +71,7 @@ Bugzilla   : 14742
 Frequency  : rare
 Description: ASSERTION(CheckWriteback(page,cmd)) failed
 Details    : badly clear PG_Writeback bit in ll_ap_completion can produce false
-             positive assertion.
+	     positive assertion.
 
 Severity   : normal
 Frequency  : only with broken builds/installations
@@ -149,8 +149,8 @@ Details    : VM protocol want old IO finished before start new, in this case
 Severity   : normal
 Frequency  : rare
 Bugzilla   : 12888
-Description: mds_mfd_close() ASSERTION(rc == 0) 
-Details    : In mds_mfd_close(), we need protect inode's writecount change 
+Description: mds_mfd_close() ASSERTION(rc == 0)
+Details    : In mds_mfd_close(), we need protect inode's writecount change
 	     within its orphan write semaphore to prevent possible races.
 
 Severity   : minor
@@ -216,7 +216,7 @@ Severity   : normal
 Bugzilla   : 15953
 Description: more ldlm soft lockups
 Details    : In ldlm_resource_add_lock(), call to ldlm_resource_dump()
-             starve other threads from the resource lock for a long time in
+	     starve other threads from the resource lock for a long time in
 	     case of long waiting queue, so change the debug level from
 	     D_OTHER to the less frequently used D_INFO.
 
@@ -240,6 +240,13 @@ Description: this bug _only_ happens when inode quota limitation is very low
 Details    : if remaining quota	equates 1, it is a sign to demonstate that quota
 	     is effective now. So least quota qunit should be 2.
 
+Severity   : normal
+Bugzilla   : 15950
+Description: Hung threads in invalidate_inode_pages2_range
+Details    : The direct IO path doesn't call osc_check_rpcs() to submit a new
+	     RPC once one has completed. As a result, some RPCs are stuck in
+	     the queue and are never sent.
+
 -------------------------------------------------------------------------------
 
 
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 03ba08f7b9..066a3f3f93 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -228,6 +228,7 @@ extern unsigned int obd_alloc_fail_rate;
 #define OBD_FAIL_OSC_BRW_PREP_REQ2       0x40a
 #define OBD_FAIL_OSC_CONNECT_CKSUM       0x40b
 #define OBD_FAIL_OSC_CKSUM_ADLER_ONLY    0x40c
+#define OBD_FAIL_OSC_DIO_PAUSE           0x40d
 
 #define OBD_FAIL_PTLRPC                  0x500
 #define OBD_FAIL_PTLRPC_ACK              0x501
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 00a3e9a2a3..9c5a2db342 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -63,6 +63,7 @@ static quota_interface_t *quota_interface = NULL;
 extern quota_interface_t osc_quota_interface;
 
 static void osc_release_ppga(struct brw_page **ppga, obd_count count);
+static int brw_interpret(struct ptlrpc_request *request, void *data, int rc);
 int osc_cleanup(struct obd_device *obd);
 
 static quota_interface_t *quota_interface;
@@ -814,7 +815,7 @@ static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
         CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant);
         if (body->oa.o_valid & OBD_MD_FLGRANT)
                 cli->cl_avail_grant += body->oa.o_grant;
-        /* waiters are woken in brw_interpret_oap */
+        /* waiters are woken in brw_interpret */
         client_obd_list_unlock(&cli->cl_loi_list_lock);
 }
 
@@ -1425,33 +1426,6 @@ int osc_brw_redo_request(struct ptlrpc_request *request,
         RETURN(0);
 }
 
-static int brw_interpret(struct ptlrpc_request *request, void *data, int rc)
-{
-        struct osc_brw_async_args *aa = data;
-        int                        i;
-        ENTRY;
-
-        rc = osc_brw_fini_request(request, rc);
-        CDEBUG(D_INODE, "request %p aa %p rc %d\n", request, aa, rc);	
-        if (osc_recoverable_error(rc)) {
-                rc = osc_brw_redo_request(request, aa);
-                if (rc == 0)
-                        RETURN(0);
-        }
-        client_obd_list_lock(&aa->aa_cli->cl_loi_list_lock);
-        if (lustre_msg_get_opc(request->rq_reqmsg) == OST_WRITE)
-                aa->aa_cli->cl_w_in_flight--;
-        else
-                aa->aa_cli->cl_r_in_flight--;
-
-        for (i = 0; i < aa->aa_page_count; i++)
-                osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1);
-        client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock);
-        osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
-
-        RETURN(rc);
-}
-
 static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
                           struct lov_stripe_md *lsm, obd_count page_count,
                           struct brw_page **pga, struct ptlrpc_request_set *set)
@@ -1487,6 +1461,7 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
                                  cli->cl_w_in_flight);
                 ptlrpc_lprocfs_brw(request, OST_WRITE, aa->aa_requested_nob);
         }
+        LASSERT(list_empty(&aa->aa_oaps));
 
         if (rc == 0) {
                 request->rq_interpret_reply = brw_interpret;
@@ -1497,10 +1472,12 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
                 else
                         cli->cl_w_in_flight++;
                 client_obd_list_unlock(&cli->cl_loi_list_lock);
+                OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DIO_PAUSE, 3);
         } else if (cmd == OBD_BRW_WRITE) {
                 client_obd_list_lock(&cli->cl_loi_list_lock);
                 for (i = 0; i < page_count; i++)
                         osc_release_write_grant(cli, pga[i], 0);
+                osc_wake_cache_waiters(cli);
                 client_obd_list_unlock(&cli->cl_loi_list_lock);
         }
 
@@ -1957,10 +1934,9 @@ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa,
         EXIT;
 }
 
-static int brw_interpret_oap(struct ptlrpc_request *request, void *data, int rc)
+static int brw_interpret(struct ptlrpc_request *request, void *data, int rc)
 {
         struct osc_brw_async_args *aa = data;
-        struct osc_async_page *oap, *tmp;
         struct client_obd *cli;
         ENTRY;
 
@@ -1983,19 +1959,24 @@ static int brw_interpret_oap(struct ptlrpc_request *request, void *data, int rc)
         else
                 cli->cl_r_in_flight--;
 
-        /* the caller may re-use the oap after the completion call so
-         * we need to clean it up a little */
-        list_for_each_entry_safe(oap, tmp, &aa->aa_oaps, oap_rpc_item) {
-                list_del_init(&oap->oap_rpc_item);
-                osc_ap_completion(cli, aa->aa_oa, oap, 1, rc);
+        if (!list_empty(&aa->aa_oaps)) { /* from osc_send_oap_rpc() */
+                struct osc_async_page *oap, *tmp;
+                /* the caller may re-use the oap after the completion call so
+                 * we need to clean it up a little */
+                list_for_each_entry_safe(oap, tmp, &aa->aa_oaps, oap_rpc_item) {
+                        list_del_init(&oap->oap_rpc_item);
+                        osc_ap_completion(cli, aa->aa_oa, oap, 1, rc);
+                }
+                OBDO_FREE(aa->aa_oa);
+        } else { /* from async_internal() */
+                int i;
+                for (i = 0; i < aa->aa_page_count; i++)
+                        osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1);
         }
-
         osc_wake_cache_waiters(cli);
         osc_check_rpcs(cli);
         client_obd_list_unlock(&cli->cl_loi_list_lock);
 
-        OBDO_FREE(aa->aa_oa);
-
         osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
         RETURN(rc);
 }
@@ -2295,7 +2276,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
         DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight",
                   page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight);
 
-        req->rq_interpret_reply = brw_interpret_oap;
+        req->rq_interpret_reply = brw_interpret;
         ptlrpcd_add_req(req);
         RETURN(1);
 }
@@ -3810,7 +3791,7 @@ int osc_setup(struct obd_device *obd, obd_count len, void *buf)
 
                 oscc_init(obd);
                 /* We need to allocate a few requests more, because
-                   brw_interpret_oap tries to create new requests before freeing
+                   brw_interpret tries to create new requests before freeing
                    previous ones. Ideally we want to have 2x max_rpcs_in_flight
                    reserved, but I afraid that might be too much wasted RAM
                    in fact, so 2 is just my guess and still should work. */
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 2d2f725095..9cea5c1731 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -4551,6 +4551,31 @@ test_119c() # bug 13099
 }
 run_test 119c "Testing for direct read hitting hole"
 
+test_119d() # bug 15950: queued rpcs must be sent once a DIO rpc completes
+{
+        MAX_RPCS_IN_FLIGHT=`$LCTL get_param -n osc.*OST0000-osc-[^mM]*.max_rpcs_in_flight`
+        $LCTL set_param -n osc.*OST0000-osc-[^mM]*.max_rpcs_in_flight 1
+        BSIZE=1048576
+        $SETSTRIPE $DIR/$tfile -i 0 -c 1 || error "setstripe failed"
+        $DIRECTIO write $DIR/$tfile 0 1 $BSIZE || error "first directio failed"
+        #define OBD_FAIL_OSC_DIO_PAUSE           0x40d
+        $LCTL set_param fail_loc=0x40d
+        $DIRECTIO write $DIR/$tfile 1 4 $BSIZE &
+        pid_dio=$!
+        sleep 1 # let the paused DIO writes get in flight before the reads start
+        cat $DIR/$tfile > /dev/null &
+        pid_reads=$! # record the reader pid right after it is spawned
+        $LCTL set_param fail_loc=0
+        wait $pid_dio
+        log "the DIO writes have completed, now wait for the reads (should not block very long)"
+        sleep 2
+        reads_left=`ps h -p $pid_reads -o comm` # non-empty while cat is still running
+        $LCTL set_param -n osc.*OST0000-osc-[^mM]*.max_rpcs_in_flight $MAX_RPCS_IN_FLIGHT # restore before error() can abort
+        [ -z "$reads_left" ] || error "the read rpcs have not completed in 2s"
+        rm -f $DIR/$tfile
+}
+run_test 119d "The DIO path should try to send a new rpc once one is completed"
+
 test_120a() {
         mkdir -p $DIR/$tdir
         [ -z "`lctl get_param -n mdc.*.connect_flags | grep early_lock_cancel`" ] && \
-- 
GitLab