From 6a179ab82cb14f9925decdf421079bb2a4d8087e Mon Sep 17 00:00:00 2001
From: johann <johann>
Date: Thu, 3 Jul 2008 07:16:34 +0000
Subject: [PATCH] Branch HEAD b=15950 i=wangdi i=shadow

The direct IO path doesn't call check_rpcs to submit a new RPC once
one is completed. As a result, some RPCs are stuck in the queue
and are never sent.
Merge brw_interpret() and brw_interpret_oap().
---
 lustre/ChangeLog             | 85 +++++++++++++++++++-----------------
 lustre/include/obd_support.h |  1 +
 lustre/osc/osc_request.c     | 62 +++++++++-----------------
 lustre/tests/sanity.sh       | 25 +++++++++++
 4 files changed, 93 insertions(+), 80 deletions(-)

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 3c54e94b11..bb8cb73f85 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -22,7 +22,7 @@ Bugzilla   : 14742
 Frequency  : rare
 Description: ASSERTION(CheckWriteback(page,cmd)) failed
 Details    : badly clear PG_Writeback bit in ll_ap_completion can produce false
-             positive assertion.
+	     positive assertion.
 
 Severity   : enhancement
 Bugzilla   : 15865
@@ -32,7 +32,7 @@ Severity   : major
 Bugzilla   : 15924
 Description: do not process already freed flock
 Details    : flock can possibly be freed by another thread before it reaches
-             to ldlm_flock_completion_ast.
+	     to ldlm_flock_completion_ast.
 
 Severity   : normal
 Bugzilla   : 14480
@@ -43,14 +43,14 @@ Severity   : minor
 Bugzilla   : 15837
 Description: oops in page fault handler
 Details    : kernel page fault handler can return two special 'pages' in error case, don't
-             try dereference NOPAGE_SIGBUS and NOPAGE_OMM.
+	     try dereference NOPAGE_SIGBUS and NOPAGE_OMM.
 
 Severity   : minor
 Bugzilla   : 15716
 Description: timeout with invalidate import.
 Details    : ptlrpcd_check call obd_zombie_impexp_cull and wait request which should be
-             handled by ptlrpcd. This produce long age waiting and -ETIMEOUT
-             ptlrpc_invalidate_import and as result LASSERT.
+	     handled by ptlrpcd. This produce long age waiting and -ETIMEOUT
+	     ptlrpc_invalidate_import and as result LASSERT.
 
 Severity   : enhancement
 Bugzilla   : 15741
@@ -60,35 +60,35 @@ Severity   : major
 Bugzilla   : 14134
 Description: enable MGS and MDT services start separately
 Details    : add a 'nomgs' option in mount.lustre to enable start a MDT with
-             a co-located MGS without starting the MGS, which is a complement
-             to 'nosvc' mount option.
+	     a co-located MGS without starting the MGS, which is a complement
+	     to 'nosvc' mount option.
 
 Severity   : normal
 Bugzilla   : 14835
 Frequency  : after recovery
 Description: precreate to many object's after del orphan.
 Details    : del orphan st in oscc last_id == next_id and this triger growing
-             count of precreated objects. Set flag LOW to skip increase count
-             of precreated objects.
+	     count of precreated objects. Set flag LOW to skip increase count
+	     of precreated objects.
 
 Severity   : normal
 Bugzilla   : 15139
 Frequency  : rare, on clear nid stats
 Description: ASSERTION(client_stat->nid_exp_ref_count == 0)
 Details    : when clean nid stats sometimes try destroy live entry,
-             and this produce panic in free.
+	     and this produce panic in free.
 
 Severity   : major
 Bugzilla   : 15575
 Description: Stack overflow during MDS log replay
- ease stack pressure by using a thread dealing llog_process.
+	     ease stack pressure by using a thread dealing llog_process.
 
 Severity   : normal
 Bugzilla   : 15443
 Description: wait until IO finished before start new when do lock cancel.
 Details    : VM protocol want old IO finished before start new, in this case
-             need wait until PG_writeback is cleared until check dirty flag and
-             call writepages in lock cancel callback.
+	     need wait until PG_writeback is cleared until check dirty flag and
+	     call writepages in lock cancel callback.
 
 Severity   : enhancement
 Bugzilla   : 14929
@@ -96,9 +96,9 @@ Description: using special macro for print time and cleanup in includes.
 
 Severity   : normal
 Bugzilla   : 12888
-Description: mds_mfd_close() ASSERTION(rc == 0) 
-Details    : In mds_mfd_close(), we need protect inode's writecount change 
-             within its orphan write semaphore to prevent possible races.
+Description: mds_mfd_close() ASSERTION(rc == 0)
+Details    : In mds_mfd_close(), we need protect inode's writecount change
+	     within its orphan write semaphore to prevent possible races.
 
 Severity   : minor
 Bugzilla   : 14929
@@ -115,7 +115,7 @@ Severity   : minor
 Bugzilla   : 14949
 Description: don't panic with use echo client
 Details    : echo client pass NULL as client nid pointer and this produce null
-             pointer dereference.
+	     pointer dereference.
 
 Severity   : normal
 Bugzilla   : 15278
@@ -128,20 +128,20 @@ Description: add message levels for liblustreapi
 
 Severity   : normal
 Bugzilla   : 13380
-Description: fix for occasional failure case of -ENOSPC in recovery-small tests 
-Details    : Move the 'good_osts' check before the 'total_bavail' check.  This 
-	     will result in an -EAGAIN and in the exit call path we call 
-	     alloc_rr() which will with increasing aggressiveness attempt to 
+Description: fix for occasional failure case of -ENOSPC in recovery-small tests
+Details    : Move the 'good_osts' check before the 'total_bavail' check.  This
+	     will result in an -EAGAIN and in the exit call path we call
+	     alloc_rr() which will with increasing aggressiveness attempt to
 	     aquire precreated objects on the minimum number of required OSCs.
 
 Severity   : major
 Bugzilla   : 14326
 Description: Use old size assignment to avoid deadlock
 Details    : This reverts the changes in bugs 2369 and bug 14138 that introduced
-	     the scheduling while holding a spinlock.  We do not need locking 
-	     for size in ll_update_inode() because size is only updated from 
-	     the MDS for directories or files without objects, so there is no 
-	     other place to do the update, and concurrent access to such inodes 
+	     the scheduling while holding a spinlock.  We do not need locking
+	     for size in ll_update_inode() because size is only updated from
+	     the MDS for directories or files without objects, so there is no
+	     other place to do the update, and concurrent access to such inodes
 	     are protected by the inode lock.
 
 Severity   : normal
@@ -171,7 +171,7 @@ Severity   : normal
 Bugzilla   : 14803
 Description: Don't update lov_desc members until making sure they are valid
 Details    : When updating lov_desc members via proc fs, need fix their
-             validities before doing the real update.
+	     validities before doing the real update.
 
 Severity   : normal
 Bugzilla   : 15069
@@ -222,17 +222,17 @@ Details    : When MGC is disconnected from MGS long enough, MGS will evict the
 Severity   : major
 Bugzilla   : 15027
 Frequency  : on network error
-Description: panic with double free request if network error 
+Description: panic with double free request if network error
 Details    : mdc_finish_enqueue is finish request if any network error ocuring,
-             but it's true only for synchronus enqueue, for async enqueue 
-             (via ptlrpcd) this incorrect and ptlrpcd want finish request
-             himself.
+	     but it's true only for synchronus enqueue, for async enqueue
+	     (via ptlrpcd) this incorrect and ptlrpcd want finish request
+	     himself.
 
 Severity   : enhancement
 Bugzilla   : 11401
 Description: client-side metadata stat-ahead during readdir(directory readahead)
 Details    : perform client-side metadata stat-ahead when the client detects
-             readdir and sequential stat of dir entries therein
+	     readdir and sequential stat of dir entries therein
 
 Severity   : major
 Frequency  : on start mds
@@ -1107,35 +1107,42 @@ Severity   : normal
 Bugzilla   : 15346
 Description: skiplist implementation simplification
 Details    : skiplists are used to group compatible locks on granted list
-             that was implemented as tracking first and last lock of each lock group
-             the patch changes that to using doubly linked lists
+	     that was implemented as tracking first and last lock of each lock group
+	     the patch changes that to using doubly linked lists
 
 Severity   : normal
 Bugzilla   : 15574
 Description: MDS LBUG: ASSERTION(!IS_ERR(dchild))
 Details    : Change LASSERTs to client eviction (i.e. abort client's recovery)
-             because LASSERT on both the data supplied by a client, and the data 
+	     because LASSERT on both the data supplied by a client, and the data 
 	     on disk is dangerous and incorrect.
 
 Severity   : enhancement
 Bugzilla   : 10718
-Description: Slow trucate/writes to huge files at high offsets.
+Description: Slow truncate/writes to huge files at high offsets.
 Details    : Directly associate cached pages to lock that protect those pages,
-             this allows us to quickly find what pages to write and remove
+	     this allows us to quickly find what pages to write and remove
 	     once lock callback is received.
 
 Severity   : normal
 Bugzilla   : 15953
 Description: more ldlm soft lockups
 Details    : In ldlm_resource_add_lock(), call to ldlm_resource_dump()
-             starve other threads from the resource lock for a long time in
-             case of long waiting queue, so change the debug level from
-             D_OTHER to the less frequently used D_INFO.
+	     starve other threads from the resource lock for a long time in
+	     case of long waiting queue, so change the debug level from
+	     D_OTHER to the less frequently used D_INFO.
 
 Severity   : enhancement
 Bugzilla   : 13128
 Description: add -gid, -group, -uid, -user options to lfs find
 
+Severity   : normal
+Bugzilla   : 15950
+Description: Hung threads in invalidate_inode_pages2_range
+Details    : The direct IO path doesn't call check_rpcs to submit a new RPC once
+	     one is completed. As a result, some RPCs are stuck in the queue
+	     and are never sent.
+
 --------------------------------------------------------------------------------
 
 2007-08-10         Cluster File Systems, Inc. <info@clusterfs.com>
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h
index 646e579010..1b800b6555 100644
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -221,6 +221,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_OSC_BRW_PREP_REQ2       0x40a
 #define OBD_FAIL_OSC_CONNECT_CKSUM       0x40b
 #define OBD_FAIL_OSC_CKSUM_ADLER_ONLY    0x40c
+#define OBD_FAIL_OSC_DIO_PAUSE           0x40d
 
 #define OBD_FAIL_PTLRPC                  0x500
 #define OBD_FAIL_PTLRPC_ACK              0x501
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 06fabd3276..43f8c7eaa6 100644
--- a/lustre/osc/osc_request.c
+++ b/lustre/osc/osc_request.c
@@ -64,6 +64,7 @@ static quota_interface_t *quota_interface = NULL;
 extern quota_interface_t osc_quota_interface;
 
 static void osc_release_ppga(struct brw_page **ppga, obd_count count);
+static int brw_interpret(struct ptlrpc_request *request, void *data, int rc);
 int osc_cleanup(struct obd_device *obd);
 
 /* Pack OSC object metadata for disk storage (LE byte order). */
@@ -880,7 +881,7 @@ static void osc_update_grant(struct client_obd *cli, struct ost_body *body)
         CDEBUG(D_CACHE, "got "LPU64" extra grant\n", body->oa.o_grant);
         if (body->oa.o_valid & OBD_MD_FLGRANT)
                 cli->cl_avail_grant += body->oa.o_grant;
-        /* waiters are woken in brw_interpret_oap */
+        /* waiters are woken in brw_interpret */
         client_obd_list_unlock(&cli->cl_loi_list_lock);
 }
 
@@ -1517,33 +1518,6 @@ int osc_brw_redo_request(struct ptlrpc_request *request,
         RETURN(0);
 }
 
-static int brw_interpret(struct ptlrpc_request *req, void *data, int rc)
-{
-        struct osc_brw_async_args *aa = data;
-        int                        i;
-        ENTRY;
-
-        rc = osc_brw_fini_request(req, rc);
-        if (osc_recoverable_error(rc)) {
-                rc = osc_brw_redo_request(req, aa);
-                if (rc == 0)
-                        RETURN(0);
-        }
-
-        client_obd_list_lock(&aa->aa_cli->cl_loi_list_lock);
-        if (lustre_msg_get_opc(req->rq_reqmsg) == OST_WRITE)
-                aa->aa_cli->cl_w_in_flight--;
-        else
-                aa->aa_cli->cl_r_in_flight--;
-        for (i = 0; i < aa->aa_page_count; i++)
-                osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1);
-        client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock);
-
-        osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
-
-        RETURN(rc);
-}
-
 static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
                           struct lov_stripe_md *lsm, obd_count page_count,
                           struct brw_page **pga, struct ptlrpc_request_set *set,
@@ -1581,6 +1555,7 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
                 ptlrpc_lprocfs_brw(req, OST_WRITE, aa->aa_requested_nob);
         }
 
+        LASSERT(list_empty(&aa->aa_oaps));
         if (rc == 0) {
                 req->rq_interpret_reply = brw_interpret;
                 ptlrpc_set_add_req(set, req);
@@ -1590,10 +1565,12 @@ static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa,
                 else
                         cli->cl_w_in_flight++;
                 client_obd_list_unlock(&cli->cl_loi_list_lock);
+                OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DIO_PAUSE, 3);
         } else if (cmd == OBD_BRW_WRITE) {
                 client_obd_list_lock(&cli->cl_loi_list_lock);
                 for (i = 0; i < page_count; i++)
                         osc_release_write_grant(cli, pga[i], 0);
+                osc_wake_cache_waiters(cli);
                 client_obd_list_unlock(&cli->cl_loi_list_lock);
         }
         RETURN (rc);
@@ -2048,9 +2025,8 @@ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa,
         EXIT;
 }
 
-static int brw_interpret_oap(struct ptlrpc_request *req, void *data, int rc)
+static int brw_interpret(struct ptlrpc_request *req, void *data, int rc)
 {
-        struct osc_async_page *oap, *tmp;
         struct osc_brw_async_args *aa = data;
         struct client_obd *cli;
         ENTRY;
@@ -2075,20 +2051,24 @@ static int brw_interpret_oap(struct ptlrpc_request *req, void *data, int rc)
         else
                 cli->cl_r_in_flight--;
 
-        /* the caller may re-use the oap after the completion call so
-         * we need to clean it up a little */
-        list_for_each_entry_safe(oap, tmp, &aa->aa_oaps, oap_rpc_item) {
-                list_del_init(&oap->oap_rpc_item);
-                osc_ap_completion(cli, aa->aa_oa, oap, 1, rc);
+        if (!list_empty(&aa->aa_oaps)) { /* from osc_send_oap_rpc() */
+                struct osc_async_page *oap, *tmp;
+                /* the caller may re-use the oap after the completion call so
+                 * we need to clean it up a little */
+                list_for_each_entry_safe(oap, tmp, &aa->aa_oaps, oap_rpc_item) {
+                        list_del_init(&oap->oap_rpc_item);
+                        osc_ap_completion(cli, aa->aa_oa, oap, 1, rc);
+                }
+                OBDO_FREE(aa->aa_oa);
+        } else { /* from async_internal() */
+                int i;
+                for (i = 0; i < aa->aa_page_count; i++)
+                        osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1);
         }
-
         osc_wake_cache_waiters(cli);
         osc_check_rpcs(cli);
-
         client_obd_list_unlock(&cli->cl_loi_list_lock);
 
-        OBDO_FREE(aa->aa_oa);
-        
         osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
         RETURN(rc);
 }
@@ -2388,7 +2368,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi,
         DEBUG_REQ(D_INODE, req, "%d pages, aa %p. now %dr/%dw in flight",
                   page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight);
 
-        req->rq_interpret_reply = brw_interpret_oap;
+        req->rq_interpret_reply = brw_interpret;
         ptlrpcd_add_req(req);
         RETURN(1);
 }
@@ -3938,7 +3918,7 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
 
                 oscc_init(obd);
                 /* We need to allocate a few requests more, because
-                   brw_interpret_oap tries to create new requests before freeing
+                   brw_interpret tries to create new requests before freeing
                    previous ones. Ideally we want to have 2x max_rpcs_in_flight
                    reserved, but I afraid that might be too much wasted RAM
                    in fact, so 2 is just my guess and still should work. */
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh
index 03831df538..7570e9d30a 100644
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -4664,6 +4664,31 @@ test_119c() # bug 13099
 }
 run_test 119c "Testing for direct read hitting hole"
 
+test_119d() # bug 15950
+{
+        MAX_RPCS_IN_FLIGHT=`$LCTL get_param -n osc.*OST0000-osc-*.max_rpcs_in_flight`
+        $LCTL set_param -n osc.*OST0000-osc-*.max_rpcs_in_flight 1
+        BSIZE=1048576
+        $SETSTRIPE $DIR/$tfile -i 0 -c 1 || error "setstripe failed"
+        $DIRECTIO write $DIR/$tfile 0 1 $BSIZE || error "first directio failed"
+        #define OBD_FAIL_OSC_DIO_PAUSE           0x40d
+        lctl set_param fail_loc=0x40d
+        $DIRECTIO write $DIR/$tfile 1 4 $BSIZE &
+        pid_dio=$!
+        sleep 1
+        cat $DIR/$tfile > /dev/null &
+        lctl set_param fail_loc=0
+        pid_reads=$!
+        wait $pid_dio
+        log "the DIO writes have completed, now wait for the reads (should not block very long)"
+        sleep 2
+        [ -n "`ps h -p $pid_reads -o comm`" ] && \
+        error "the read rpcs have not completed in 2s"
+        rm -f $DIR/$tfile
+        $LCTL set_param -n osc.*OST0000-osc-*.max_rpcs_in_flight $MAX_RPCS_IN_FLIGHT
+}
+run_test 119d "The DIO path should try to send a new rpc once one is completed"
+
 test_120a() {
         mkdir -p $DIR/$tdir
         [ -z "`lctl get_param -n mdc.*.connect_flags | grep early_lock_cancel`" ] && \
-- 
GitLab