diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 3fe3bfc0101fe5490d18981577c4b519e0860bb7..f93cad41f23c041f3ac3e8cea78500165c3b9602 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -102,6 +102,13 @@ Details : Modify targets/2.6-vanilla.target.in. Add config file kernel-2.6.18-2.6-vanilla-x86_64.config. Add config file kernel-2.6.18-2.6-vanilla-x86_64-smp.config. +Severity : major +Bugzilla : 11710 +Description: improve handling of recoverable errors +Details : if request processing fails with an error that is recoverable on + the server, the request should be resent; otherwise the page is + released from the cache and marked as being in error. + -------------------------------------------------------------------------------- 2007-09-27 Cluster File Systems, Inc. <info@clusterfs.com> diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 94ee5d22f6412163da1e89e08494588eeafac2d5..e3b3e0d19860a6363f56bf3c7f6383bdc31fbf6d 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -333,9 +333,10 @@ struct ptlrpc_request { void *rq_cb_data; struct ptlrpc_bulk_desc *rq_bulk; /* client side bulk */ - /* client outgoing req */ - time_t rq_sent; /* when request/reply sent (secs) */ + time_t rq_sent; /* when request sent, seconds, + * or time when request should + * be sent */ volatile time_t rq_deadline; /* when request must finish. 
volatile so that servers' early reply updates to the deadline aren't kept in per-cpu cache */ diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 9c05f17836094148c887ff42803da4d9006a69f8..c4ff02a0e203d91ca4074b22d2206a74f62809ab 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -317,6 +317,7 @@ struct filter_obd { #define OSC_MAX_RIF_MAX 256 #define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4) #define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 #define MDC_MAX_RIF_DEFAULT 8 #define MDC_MAX_RIF_MAX 512 @@ -400,6 +401,8 @@ struct client_obd { /* used by quotacheck */ int cl_qchk_stat; /* quotacheck stat of the peer */ + + atomic_t cl_resends; /* resend count */ }; #define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) diff --git a/lustre/include/obd_ost.h b/lustre/include/obd_ost.h index d9accb553f4c0c1c1cd3887d0d2171348df4fdd8..8dcc9c283fbaa004b48ec1f31aed9ad1c156efd8 100644 --- a/lustre/include/obd_ost.h +++ b/lustre/include/obd_ost.h @@ -18,7 +18,7 @@ struct osc_brw_async_args { int aa_requested_nob; int aa_nio_count; obd_count aa_page_count; - int aa_retries; + int aa_resends; struct brw_page **aa_ppga; struct client_obd *aa_cli; struct list_head aa_oaps; diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 8e8940373d9afbfde16d9fe349c24a35ee4ad246..f0a0578fb69a064fd1d8d9dca4c56efc9151353e 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -171,6 +171,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_OST_DROP_REQ 0x21d #define OBD_FAIL_OST_SETATTR_CREDITS 0x21e #define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f +#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 #define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 #define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 #define OBD_FAIL_OST_PAUSE_CREATE 0x223 @@ -205,6 +206,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_OSC_SHUTDOWN 0x407 #define 
OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 #define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 +#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a #define OBD_FAIL_PTLRPC 0x500 #define OBD_FAIL_PTLRPC_ACK 0x501 diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index c584d40c50d8b858c27ee715f294e1af9c3ce3fc..8b60e4626f1ef50c7958c9cfb6d1fcdc23345c8b 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -268,6 +268,7 @@ int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf) #ifdef ENABLE_CHECKSUM cli->cl_checksum = 1; #endif + atomic_set(&cli->cl_resends, OSC_DEFAULT_RESENDS); /* This value may be changed at connect time in ptlrpc_connect_interpret. */ diff --git a/lustre/llite/file.c b/lustre/llite/file.c index bd173e14bae3fcac90c6815efc92638a72203df1..aa6d6cbd3968247ca2f4da3ff0ad170bf5379598 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -647,9 +647,10 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, struct page *page; int rc, rc2, discard = lock->l_flags & LDLM_FL_DISCARD_DATA; struct lustre_handle lockh; - ENTRY; + struct address_space *mapping = inode->i_mapping; - memcpy(&tmpex, &lock->l_policy_data, sizeof(tmpex)); + ENTRY; + tmpex = lock->l_policy_data; CDEBUG(D_INODE|D_PAGE, "inode %lu(%p) ["LPU64"->"LPU64"] size: %llu\n", inode->i_ino, inode, tmpex.l_extent.start, tmpex.l_extent.end, i_size_read(inode)); @@ -692,8 +693,8 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, for (i = start; i 
<= end; i += (j + skip)) { j = min(count - (i % count), end - i + 1); LASSERT(j > 0); - LASSERT(inode->i_mapping); - if (ll_teardown_mmaps(inode->i_mapping, + LASSERT(mapping); + if (ll_teardown_mmaps(mapping, (__u64)i << CFS_PAGE_SHIFT, ((__u64)(i+j) << CFS_PAGE_SHIFT) - 1) ) break; @@ -718,14 +719,14 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, tmpex.l_extent.start, lock->l_policy_data.l_extent.end, start, i, end); - if (!mapping_has_pages(inode->i_mapping)) { + if (!mapping_has_pages(mapping)) { CDEBUG(D_INODE|D_PAGE, "nothing left\n"); break; } cond_resched(); - page = find_get_page(inode->i_mapping, i); + page = find_get_page(mapping, i); if (page == NULL) continue; LL_CDEBUG_PAGE(D_PAGE, page, "lock page idx %lu ext "LPU64"\n", @@ -735,12 +736,23 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, /* page->mapping to check with racing against teardown */ if (!discard && clear_page_dirty_for_io(page)) { rc = ll_call_writepage(inode, page); - if (rc != 0) - CERROR("writepage of page %p failed: %d\n", - page, rc); /* either waiting for io to complete or reacquiring * the lock that the failed writepage released */ lock_page(page); + wait_on_page_writeback(page); + if (rc != 0) { + CERROR("writepage inode %lu(%p) of page %p " + "failed: %d\n", inode->i_ino, inode, + page, rc); +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + if (rc == -ENOSPC) + set_bit(AS_ENOSPC, &mapping->flags); + else + set_bit(AS_EIO, &mapping->flags); +#else + mapping->gfp_mask |= AS_EIO_MASK; +#endif + } } tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1; @@ -755,7 +767,7 @@ void ll_pgcache_remove_extent(struct inode *inode, struct lov_stripe_md *lsm, // checking again to account for writeback's lock_page() LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n"); if (llap) - ll_ra_accounting(llap, inode->i_mapping); + ll_ra_accounting(llap, mapping); ll_truncate_complete_page(page); } unlock_page(page); diff --git 
a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 47af372108342968559273f725bce7ded933ca30..81dade0b126f722b6a150a71a11cd49479f332b5 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -795,7 +795,7 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) /* 2.4 doesn't seem to have SEQ_START_TOKEN, so we implement * it in our own state */ if (dummy_llap->llap_magic == 0) { - seq_printf(seq, "gener | llap cookie origin wq du | page " + seq_printf(seq, "gener | llap cookie origin wq du wb | page " "inode index count [ page flags ]\n"); return 0; } @@ -810,13 +810,14 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) LASSERTF(llap->llap_origin < LLAP__ORIGIN_MAX, "%u\n", llap->llap_origin); - seq_printf(seq," %5lu | %p %p %s %s %s | %p %lu/%u(%p) " + seq_printf(seq," %5lu | %p %p %s %s %s %s | %p %lu/%u(%p) " "%lu %u [", sbi->ll_pglist_gen, llap, llap->llap_cookie, llap_origins[llap->llap_origin], llap->llap_write_queued ? "wq" : "- ", llap->llap_defer_uptodate ? "du" : "- ", + PageWriteback(page) ? 
"wb" : "-", page, page->mapping->host->i_ino, page->mapping->host->i_generation, page->mapping->host, page->index, @@ -829,9 +830,10 @@ static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) #if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,12)) seq_page_flag(seq, page, highmem, has_flags); #endif + seq_page_flag(seq, page, writeback, has_flags); if (!has_flags) seq_puts(seq, "-]\n"); - else + else seq_puts(seq, "]\n"); } diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 691ccd40bf5e291d21d1797c29b1e4cb2cc0c810..f0853a08aaac8571b8db87b4e6d176ca635f4091 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -510,18 +510,16 @@ int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) continue; } - if (llap->llap_write_queued || PageDirty(page) || - (!PageUptodate(page) && - llap->llap_origin != LLAP_ORIGIN_READAHEAD)) - keep = 1; - else - keep = 0; + keep = (llap->llap_write_queued || PageDirty(page) || + PageWriteback(page) || (!PageUptodate(page) && + llap->llap_origin != LLAP_ORIGIN_READAHEAD)); - LL_CDEBUG_PAGE(D_PAGE, page,"%s LRU page: %s%s%s%s origin %s\n", + LL_CDEBUG_PAGE(D_PAGE, page,"%s LRU page: %s%s%s%s%s origin %s\n", keep ? "keep" : "drop", llap->llap_write_queued ? "wq " : "", PageDirty(page) ? "pd " : "", PageUptodate(page) ? "" : "!pu ", + PageWriteback(page) ? "wb" : "", llap->llap_defer_uptodate ? 
"" : "!du", llap_origins[llap->llap_origin]); @@ -878,11 +876,16 @@ int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) } else { if (cmd & OBD_BRW_READ) { llap->llap_defer_uptodate = 0; - } else { - ll_redirty_page(page); - ret = 1; } SetPageError(page); +#if (LINUX_VERSION_CODE > KERNEL_VERSION(2,5,0)) + if (rc == -ENOSPC) + set_bit(AS_ENOSPC, &page->mapping->flags); + else + set_bit(AS_EIO, &page->mapping->flags); +#else + page->mapping->gfp_mask |= AS_EIO_MASK; +#endif } unlock_page(page); @@ -1418,7 +1421,9 @@ out: if (PageWriteback(page)) { end_page_writeback(page); } - ll_redirty_page(page); + /* resend page only for not started IO*/ + if (!PageError(page)) + ll_redirty_page(page); unlock_page(page); } RETURN(rc); diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 878fa8e5e5e423c001e2b66dd0b8e430415d8b63..aed4bef5af6c1fd41901fbd792ba2267d5af5694 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -303,6 +303,32 @@ static int osc_wr_checksum(struct file *file, const char *buffer, return count; } +static int osc_rd_resend_count(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = data; + + return snprintf(page, count, "%u\n", atomic_read(&obd->u.cli.cl_resends)); +} + +static int osc_wr_resend_count(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 0) + return -EINVAL; + + atomic_set(&obd->u.cli.cl_resends, val); + + return count; +} + static struct lprocfs_vars lprocfs_obd_vars[] = { { "uuid", lprocfs_rd_uuid, 0, 0 }, { "ping", 0, lprocfs_wr_ping, 0 }, @@ -329,6 +355,7 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "prealloc_next_id", osc_rd_prealloc_next_id, 0, 0 }, { "prealloc_last_id", osc_rd_prealloc_last_id, 0, 0 }, { "checksums", osc_rd_checksum, osc_wr_checksum, 0 }, + { "resend_count", 
osc_rd_resend_count, osc_wr_resend_count, 0}, { "timeouts", lprocfs_rd_timeouts, 0, 0 }, { 0 } }; @@ -464,3 +491,4 @@ int lproc_osc_attach_seqstat(struct obd_device *dev) LPROCFS_INIT_VARS(osc, lprocfs_module_vars, lprocfs_obd_vars) #endif /* LPROCFS */ + diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index e3e6013f13ee8359dc266d1ee9d535bc91615fe3..72ce3ec4a3f6d178d372d3a1b23228fe54dc6286 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -73,4 +73,17 @@ static inline int lproc_osc_attach_seqstat(struct obd_device *dev) {return 0;} ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) #endif +static inline int osc_recoverable_error(int rc) +{ + return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || rc == -EAGAIN); +} + +/* return 1 if osc should be resend request */ +static inline int osc_should_resend(int resend, struct client_obd *cli) +{ + return atomic_read(&cli->cl_resends) ? + atomic_read(&cli->cl_resends) > resend : 1; +} + + #endif /* OSC_INTERNAL_H */ diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index f29db5689395c28e21c7a2a277e6f2998b9f3eb3..d28ac422ce3713c4de3398b435d4f84b12734dcb 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -66,6 +66,9 @@ static void osc_release_ppga(struct brw_page **ppga, obd_count count); static quota_interface_t *quota_interface; extern quota_interface_t osc_quota_interface; +/* by default 10s */ +atomic_t osc_resend_time; + /* Pack OSC object metadata for disk storage (LE byte order). */ static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, struct lov_stripe_md *lsm) @@ -917,6 +920,9 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, struct brw_page *pg_prev; ENTRY; + OBD_FAIL_RETURN(OBD_FAIL_OSC_BRW_PREP_REQ, -ENOMEM); /* Recoverable */ + OBD_FAIL_RETURN(OBD_FAIL_OSC_BRW_PREP_REQ2, -EINVAL); /* Fatal */ + opc = ((cmd & OBD_BRW_WRITE) != 0) ? 
OST_WRITE : OST_READ; pool = ((cmd & OBD_BRW_WRITE) != 0) ? cli->cl_import->imp_rq_pool :NULL; @@ -928,7 +934,6 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, size[REQ_REC_OFF + 1] = sizeof(*ioobj); size[REQ_REC_OFF + 2] = niocount * sizeof(*niobuf); - OBD_FAIL_RETURN(OBD_FAIL_OSC_BRW_PREP_REQ, -ENOMEM); req = ptlrpc_prep_req_pool(cli->cl_import, LUSTRE_OST_VERSION, opc, 4, size, NULL, pool); if (req == NULL) @@ -1036,7 +1041,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, aa->aa_requested_nob = requested_nob; aa->aa_nio_count = niocount; aa->aa_page_count = page_count; - aa->aa_retries = 5; /*retry for checksum errors; lprocfs? */ + aa->aa_resends = 0; aa->aa_ppga = pga; aa->aa_cli = cli; INIT_LIST_HEAD(&aa->aa_oaps); @@ -1235,8 +1240,13 @@ static int osc_brw_internal(int cmd, struct obd_export *exp,struct obdo *oa, obd_count page_count, struct brw_page **pga) { struct ptlrpc_request *request; - int rc, retries = 5; /* lprocfs? 
*/ + int rc; + cfs_waitq_t waitq; + int resends = 0; + struct l_wait_info lwi; + ENTRY; + init_waitqueue_head(&waitq); restart_bulk: rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm, @@ -1255,10 +1265,17 @@ restart_bulk: rc = osc_brw_fini_request(request, rc); ptlrpc_req_finished(request); - if (rc == -EAGAIN) { - if (retries-- > 0) - goto restart_bulk; - rc = -EIO; + if (osc_recoverable_error(rc)) { + resends++; + if (!osc_should_resend(resends, &exp->exp_obd->u.cli)) { + CERROR("too many resend retries, returning error\n"); + RETURN(-EIO); + } + + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(resends), NULL, NULL, NULL); + l_wait_event(waitq, 0, &lwi); + + goto restart_bulk; } RETURN(rc); } @@ -1273,40 +1290,44 @@ int osc_brw_redo_request(struct ptlrpc_request *request, int rc = 0; ENTRY; - if (aa->aa_retries-- <= 0) { - CERROR("too many checksum retries, returning error\n"); + if (!osc_should_resend(aa->aa_resends, aa->aa_cli)) { + CERROR("too many resend retries, returning error\n"); RETURN(-EIO); } + + DEBUG_REQ(D_ERROR, request, "redo for recoverable error"); - DEBUG_REQ(D_ERROR, request, "redo for checksum error"); + rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == + OST_WRITE ? OBD_BRW_WRITE :OBD_BRW_READ, + aa->aa_cli, aa->aa_oa, + NULL /* lsm unused by osc currently */, + aa->aa_page_count, aa->aa_ppga, &new_req); + if (rc) + RETURN(rc); + + client_obd_list_lock(&aa->aa_cli->cl_loi_list_lock); + list_for_each_entry(oap, &aa->aa_oaps, oap_rpc_item) { if (oap->oap_request != NULL) { LASSERTF(request == oap->oap_request, "request %p != oap_request %p\n", request, oap->oap_request); if (oap->oap_interrupted) { - ptlrpc_mark_interrupted(oap->oap_request); - rc = -EINTR; - break; + client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock); + ptlrpc_req_finished(new_req); + RETURN(-EINTR); } } } - if (rc) - RETURN(rc); - - rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == - OST_WRITE ? 
OBD_BRW_WRITE :OBD_BRW_READ, - aa->aa_cli, aa->aa_oa, - NULL /* lsm unused by osc currently */, - aa->aa_page_count, aa->aa_ppga, &new_req); - if (rc) - RETURN(rc); - /* New request takes over pga and oaps from old request. * Note that copying a list_head doesn't work, need to move it... */ + aa->aa_resends++; new_req->rq_interpret_reply = request->rq_interpret_reply; new_req->rq_async_args = request->rq_async_args; + new_req->rq_sent = CURRENT_SECONDS + aa->aa_resends; + new_aa = (struct osc_brw_async_args *)&new_req->rq_async_args; + INIT_LIST_HEAD(&new_aa->aa_oaps); list_splice(&aa->aa_oaps, &new_aa->aa_oaps); INIT_LIST_HEAD(&aa->aa_oaps); @@ -1317,6 +1338,9 @@ int osc_brw_redo_request(struct ptlrpc_request *request, oap->oap_request = ptlrpc_request_addref(new_req); } } + client_obd_list_unlock(&aa->aa_cli->cl_loi_list_lock); + + DEBUG_REQ(D_INFO, new_req, "new request"); ptlrpc_set_add_req(set, new_req); @@ -1331,7 +1355,8 @@ static int brw_interpret(struct ptlrpc_request *request, void *data, int rc) ENTRY; rc = osc_brw_fini_request(request, rc); - if (rc == -EAGAIN) { + CDEBUG(D_INODE, "request %p aa %p rc %d\n", request, aa, rc); + if (osc_recoverable_error(rc)) { rc = osc_brw_redo_request(request, aa); if (rc == 0) RETURN(0); @@ -1770,7 +1795,7 @@ unlock: * the app does an fsync. As long as errors persist we force future rpcs to be * sync so that the app can get a sync error and break the cycle of queueing * pages for which writeback will fail. 
*/ -static void osc_process_ar(struct osc_async_rc *ar, struct ptlrpc_request *req, +static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, int rc) { if (rc) { @@ -1783,7 +1808,7 @@ static void osc_process_ar(struct osc_async_rc *ar, struct ptlrpc_request *req, } - if (ar->ar_force_sync && req && (ptlrpc_req_xid(req) >= ar->ar_min_xid)) + if (ar->ar_force_sync && (xid >= ar->ar_min_xid)) ar->ar_force_sync = 0; } @@ -1807,18 +1832,21 @@ static void osc_oap_to_pending(struct osc_async_page *oap) static void osc_ap_completion(struct client_obd *cli, struct obdo *oa, struct osc_async_page *oap, int sent, int rc) { + __u64 xid = 0; + ENTRY; + if (oap->oap_request != NULL) { + xid = ptlrpc_req_xid(oap->oap_request); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; + } + oap->oap_async_flags = 0; oap->oap_interrupted = 0; if (oap->oap_cmd & OBD_BRW_WRITE) { - osc_process_ar(&cli->cl_ar, oap->oap_request, rc); - osc_process_ar(&oap->oap_loi->loi_ar, oap->oap_request, rc); - } - - if (oap->oap_request != NULL) { - ptlrpc_req_finished(oap->oap_request); - oap->oap_request = NULL; + osc_process_ar(&cli->cl_ar, xid, rc); + osc_process_ar(&oap->oap_loi->loi_ar, xid, rc); } if (rc == 0 && oa != NULL) { @@ -1863,11 +1891,11 @@ static int brw_interpret_oap(struct ptlrpc_request *request, void *data, int rc) rc = osc_brw_fini_request(request, rc); CDEBUG(D_INODE, "request %p aa %p rc %d\n", request, aa, rc); - if (rc == -EAGAIN) { + + if (osc_recoverable_error(rc)) { rc = osc_brw_redo_request(request, aa); if (rc == 0) RETURN(0); - GOTO(out, rc); } cli = aa->aa_cli; @@ -1893,8 +1921,6 @@ static int brw_interpret_oap(struct ptlrpc_request *request, void *data, int rc) OBDO_FREE(aa->aa_oa); - rc = 0; -out: osc_release_ppga(aa->aa_ppga, aa->aa_page_count); RETURN(rc); } @@ -3732,7 +3758,6 @@ struct obd_ops osc_obd_ops = { .o_llog_finish = osc_llog_finish, .o_process_config = osc_process_config, }; - int __init osc_init(void) { struct lprocfs_static_vars 
lvars; diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index a268d7e48d2b3611cbd9297eeb2e5795e4f42a4a..14dce9a82326e6b7dbc5d12505e54fb38ac93410 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -659,8 +659,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_READ_BULK)) GOTO(out, rc = -EIO); - OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE, - (obd_timeout + 1) / 4); + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, (obd_timeout + 1) / 4); /* Check if there is eviction in progress, and if so, wait for it to * finish */ @@ -915,10 +914,11 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) GOTO(out, rc = -EIO); + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK2)) + GOTO(out, rc = -EFAULT); /* pause before transaction has been started */ - OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK | OBD_FAIL_ONCE, - (obd_timeout + 1) / 4); + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_BRW_PAUSE_BULK, (obd_timeout + 1) / 4); /* Check if there is eviction in progress, and if so, wait for it to * finish */ diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 958375ec3c1d64134ef0bdafd30924b33bcb0b6d..c85f9e69496523fcbd99b871abcc4d561a543035 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -804,10 +804,10 @@ static int after_reply(struct ptlrpc_request *req) /* Either we've been evicted, or the server has failed for * some reason. Try to reconnect, and if that fails, punt to * the upcall. 
*/ - if (rc == -ENOTCONN || rc == -ENODEV) { + if (ll_rpc_recoverable_error(rc)) { if (req->rq_send_state != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov || imp->imp_dlm_fake) { - RETURN(-ENOTCONN); + RETURN(rc); } ptlrpc_request_handle_notconn(req); RETURN(rc); @@ -860,6 +860,9 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) ENTRY; LASSERT(req->rq_phase == RQ_PHASE_NEW); + if (req->rq_sent && (req->rq_sent > CURRENT_SECONDS)) + RETURN (0); + req->rq_phase = RQ_PHASE_RPC; imp = req->rq_import; @@ -933,6 +936,9 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) ptlrpc_send_new_req(req)) { force_timer_recalc = 1; } + /* delayed send - skip */ + if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) + continue; if (!(req->rq_phase == RQ_PHASE_RPC || req->rq_phase == RQ_PHASE_BULK || @@ -1282,6 +1288,7 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) time_t now = cfs_time_current_sec(); int timeout = 0; struct ptlrpc_request *req; + time_t deadline; ENTRY; SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */ @@ -1291,19 +1298,24 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) /* request in-flight? 
*/ if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || - (req->rq_phase == RQ_PHASE_BULK))) + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) continue; if (req->rq_timedout) /* already timed out */ continue; - if (req->rq_deadline <= now) { /* actually expired already */ + if (req->rq_phase == RQ_PHASE_NEW) + deadline = req->rq_sent; /* delayed send */ + else + deadline = req->rq_deadline; + + if (deadline <= now) { /* actually expired already */ timeout = 1; /* ASAP */ break; - } - - if ((timeout == 0) || (timeout > (req->rq_deadline - now))) { - timeout = req->rq_deadline - now; + } + if ((timeout == 0) || (timeout > (deadline - now))) { + timeout = deadline - now; } } RETURN(timeout); diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 25ec036c456230766a1f62997f81b28446793d32..f427fbe0293f42fa93980e29be9e49f8e32d9118 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -124,7 +124,7 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req); /* pers.c */ void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc); -void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, cfs_page_t *page, +void ptlrpc_add_bulk_page(struct ptlrpc_bulk_desc *desc, cfs_page_t *page, int pageoffset, int len); void ptl_rpc_wipe_bulk_pages(struct ptlrpc_bulk_desc *desc); @@ -140,4 +140,8 @@ int ping_evictor_wake(struct obd_export *exp); #define ping_evictor_wake(exp) 1 #endif +static inline int ll_rpc_recoverable_error(int rc) +{ + return (rc == -ENOTCONN || rc == -ENODEV); +} #endif /* PTLRPC_INTERNAL_H */ diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 120ec88489c31c3afbec978a0b27fc6d8412ec2f..b255622fbe5aff95b947439347e869865645efe0 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -3895,15 +3895,306 @@ test_117() # bug 10891 } run_test 117 "verify fsfilt_extend =============================" -test_118() #bug 11710 +# Reset async 
IO behavior after error case +reset_async() { + FILE=$DIR/reset_async + + # Ensure all OSCs are cleared + $LSTRIPE $FILE 0 -1 -1 + dd if=/dev/zero of=$FILE bs=64k count=$OSTCOUNT + sync + rm $FILE +} + +test_118a() #bug 11710 { - sync; sleep 1; sync - multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c; - dirty=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + reset_async + + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + return 1; + fi +} +run_test 118a "verify O_SYNC works ==========" + +test_118b() +{ + reset_async + + #define OBD_FAIL_OST_ENOENT 0x217 + sysctl -w lustre.fail_loc=0x217 + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? + sysctl -w lustre.fail_loc=0 + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + return 1; + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + return 1; + fi + + echo "Dirty pages not leaked on ENOENT" + + # Due to the above error the OSC will issue all RPCs syncronously + # until a subsequent RPC completes successfully without error. + multiop $DIR/$tfile Ow4096yc + rm -f $DIR/$tfile + + return 0 +} +run_test 118b "Reclaim dirty pages on fatal error ==========" + +test_118c() +{ + reset_async + + #define OBD_FAIL_OST_EROFS 0x216 + sysctl -w lustre.fail_loc=0x216 + + # multiop should block due to fsync until pages are written + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & + MULTIPID=$! 
+ sleep 1 + + if [[ `ps h -o comm -p $MULTIPID` != "multiop" ]]; then + error "Multiop failed to block on fsync, pid=$MULTIPID" + fi + + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $WRITEBACK -eq 0 ]]; then + error "No page in writeback, writeback=$WRITEBACK" + fi + + sysctl -w lustre.fail_loc=0 + wait $MULTIPID + RC=$? + if [[ $RC -ne 0 ]]; then + error "Multiop fsync failed, rc=$RC" + fi + + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "Dirty pages flushed via fsync on EROFS" + return 0 +} +run_test 118c "Fsync blocks on EROFS until dirty pages are flushed ==========" + +test_118d() +{ + reset_async + + #define OBD_FAIL_OST_BRW_PAUSE_BULK + sysctl -w lustre.fail_loc=0x214 + # multiop should block due to fsync until pages are written + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & + MULTIPID=$! + sleep 1 + + if [[ `ps h -o comm -p $MULTIPID` != "multiop" ]]; then + error "Multiop failed to block on fsync, pid=$MULTIPID" + fi + + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $WRITEBACK -eq 0 ]]; then + error "No page in writeback, writeback=$WRITEBACK" + fi + + wait $MULTIPID || error "Multiop fsync failed, rc=$?" 
+ + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "Dirty pages gaurenteed flushed via fsync" + return 0 +} +run_test 118d "Fsync validation inject a delay of the bulk ==========" + +test_118f() { + reset_async + + #define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a + sysctl -w lustre.fail_loc=0x8000040a + + # Should simulate EINVAL error which is fatal + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? + + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + fi + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" + + reset_async + return 0 +} +run_test 118f "Simulate unrecoverable OSC side error ==========" + +test_118g() { + reset_async + + #define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 + sysctl -w lustre.fail_loc=0x406 + + # simulate local -ENOMEM + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? 
+ + sysctl -w lustre.fail_loc=0 + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + fi + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" + + reset_async + return 0 +} +run_test 118g "Don't stay in wait if we got local -ENOMEM ==========" + +test_118h() { + reset_async + + #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e + sysctl -w lustre.fail_loc=0x20e + # Should simulate an EIO error from the OST, which is recoverable and should be handled by timeout + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? + + sysctl -w lustre.fail_loc=0 + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + fi + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" + + return 0 +} +run_test 118h "Verify timeout in handling recoverables errors ==========" + +test_118i() { + reset_async + + #define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e + sysctl -w lustre.fail_loc=0x20e + + # Should simulate an EIO error from the OST, which is recoverable and should be handled by timeout + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c & + PID=$! + sleep 5 + sysctl -w lustre.fail_loc=0 + + wait $PID + RC=$? 
+ if [[ $RC -ne 0 ]]; then + error "got error, but should be not, rc=$RC" + fi + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi + + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" + + return 0 +} +run_test 118i "Fix error before timeout in recoverable error ==========" + +test_118j() { + reset_async + + #define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 + sysctl -w lustre.fail_loc=0x220 + + # return -EFAULT from OST, which the OSC does not treat as recoverable + multiop $DIR/$tfile oO_CREAT:O_RDWR:O_SYNC:w4096c + RC=$? + sysctl -w lustre.fail_loc=0x0 + if [[ $RC -eq 0 ]]; then + error "Must return error due to dropped pages, rc=$RC" + fi + + LOCKED=$(grep -c locked $LPROC/llite/*/dump_page_cache) + DIRTY=$(grep -c dirty $LPROC/llite/*/dump_page_cache) + WRITEBACK=$(grep -c writeback $LPROC/llite/*/dump_page_cache) + if [[ $LOCKED -ne 0 ]]; then + error "Locked pages remain in cache, locked=$LOCKED" + fi + + # after an unrecoverable OST error no page may be left dirty or in writeback + if [[ $DIRTY -ne 0 || $WRITEBACK -ne 0 ]]; then + error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" + fi + + rm -f $DIR/$tfile + echo "No pages locked after fsync" - return $dirty + return 0 } -run_test 118 "verify O_SYNC works" +run_test 118j "Simulate unrecoverable OST side error ==========" test_119a() # bug 11737 {