From 4cef4816149177ec92628b93c13a32067647b42d Mon Sep 17 00:00:00 2001
From: eeb <eeb>
Date: Tue, 6 Apr 2004 22:45:41 +0000
Subject: [PATCH] * changed ptlrpc_unregister_reply(),
 ptlrpc_unregister_bulk() and ptlrpc_abort_bulk() to always
 l_wait_event() after doing PtlMDUnlink() on the relevant MD. The previous
 code assumed that when PtlMDUnlink() returned PTL_MD_INVALID, the
 relevant callback handler must already have run, which isn't true for
 liblustre. The l_wait_event() will be a no-op for kernel lustre
 (condition already true) but gives the liblustre event handler a chance
 to call all the callbacks.

---
 lustre/ptlrpc/client.c | 11 ++++-------
 lustre/ptlrpc/niobuf.c | 26 ++++++++------------------
 2 files changed, 12 insertions(+), 25 deletions(-)

diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c
index 6635659e64..b328b5468a 100644
--- a/lustre/ptlrpc/client.c
+++ b/lustre/ptlrpc/client.c
@@ -1099,13 +1099,10 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request)
         if (!ptlrpc_client_receiving_reply(request))
                 return;
 
-        rc = PtlMDUnlink (request->rq_reply_md_h);
-        if (rc == PTL_MD_INVALID) {
-                LASSERT (!ptlrpc_client_receiving_reply(request));
-                return;
-        }
-        
-        LASSERT (rc == PTL_OK);
+        PtlMDUnlink (request->rq_reply_md_h);
+
+        /* We have to l_wait_event() whatever the result, to give liblustre
+         * a chance to run reply_in_callback() */
 
         if (request->rq_set != NULL)
                 wq = &request->rq_set->set_waitq;
diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c
index c22e66890b..2b10ac6e93 100644
--- a/lustre/ptlrpc/niobuf.c
+++ b/lustre/ptlrpc/niobuf.c
@@ -194,16 +194,11 @@ void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc)
                 return;                         /* never started */
         
         /* The unlink ensures the callback happens ASAP and is the last
-         * one.  If it fails, it must be because completion just
-         * happened. */
+         * one.  If it fails, it must be because completion just happened,
+         * but we must still l_wait_event() in this case, to give liblustre
+         * a chance to run server_bulk_callback() */
 
-        rc = PtlMDUnlink (desc->bd_md_h);
-        if (rc == PTL_MD_INVALID) {
-                LASSERT(!ptlrpc_bulk_active(desc));
-                return;
-        }
-        
-        LASSERT (rc == PTL_OK);
+        PtlMDUnlink (desc->bd_md_h);
 
         for (;;) {
                 /* Network access will complete in finite time but the HUGE
@@ -312,16 +307,11 @@ void ptlrpc_unregister_bulk (struct ptlrpc_request *req)
         LASSERT (desc->bd_req == req);          /* bd_req NULL until registered */
 
         /* the unlink ensures the callback happens ASAP and is the last
-         * one.  If it fails, it must be because completion just
-         * happened. */
+         * one.  If it fails, it must be because completion just happened,
+         * but we must still l_wait_event() in this case to give liblustre
+         * a chance to run client_bulk_callback() */
 
-        rc = PtlMDUnlink (desc->bd_md_h);
-        if (rc == PTL_MD_INVALID) {
-                LASSERT(!ptlrpc_bulk_active(desc));
-                return;
-        }
-        
-        LASSERT (rc == PTL_OK);
+        PtlMDUnlink (desc->bd_md_h);
         
         if (desc->bd_req->rq_set != NULL)
                 wq = &req->rq_set->set_waitq;
-- 
GitLab