From 1455a668284d2f33706d456f25532a22cc0d86da Mon Sep 17 00:00:00 2001 From: nathan <nathan> Date: Mon, 3 Sep 2007 06:05:55 +0000 Subject: [PATCH] b=3055 i=adilger i=eeb (pending) adaptive timeouts --- lustre/ChangeLog | 7 + lustre/include/linux/lustre_fsfilt.h | 18 +- lustre/include/lprocfs_status.h | 34 ++ lustre/include/lustre/lustre_idl.h | 13 +- lustre/include/lustre_import.h | 68 ++- lustre/include/lustre_lib.h | 5 +- lustre/include/lustre_net.h | 88 +++- lustre/include/obd.h | 5 +- lustre/include/obd_support.h | 48 +- lustre/ldlm/ldlm_lib.c | 73 +-- lustre/ldlm/ldlm_lockd.c | 54 ++- lustre/ldlm/ldlm_request.c | 10 +- lustre/liblustre/llite_lib.c | 4 +- lustre/liblustre/super.c | 5 +- lustre/llite/llite_lib.c | 13 +- lustre/mdc/lproc_mdc.c | 3 +- lustre/mds/handler.c | 26 +- lustre/mds/mds_fs.c | 4 +- lustre/mds/mds_internal.h | 2 +- lustre/mds/mds_lov.c | 6 +- lustre/mgc/mgc_request.c | 2 +- lustre/mgs/mgs_handler.c | 8 +- lustre/mgs/mgs_internal.h | 3 - lustre/obdclass/class_obd.c | 8 +- lustre/obdclass/genops.c | 25 +- lustre/obdclass/linux/linux-module.c | 23 - lustre/obdclass/linux/linux-sysctl.c | 20 +- lustre/obdclass/lprocfs_status.c | 114 ++++- lustre/obdclass/obd_config.c | 1 - lustre/obdclass/obd_mount.c | 20 +- lustre/obdfilter/filter.c | 20 +- lustre/obdfilter/filter_internal.h | 3 +- lustre/obdfilter/filter_io.c | 8 +- lustre/obdfilter/filter_io_24.c | 6 +- lustre/obdfilter/filter_io_26.c | 8 +- lustre/osc/lproc_osc.c | 1 + lustre/osc/osc_create.c | 4 +- lustre/ost/ost_handler.c | 87 ++-- lustre/ost/ost_internal.h | 4 +- lustre/ptlrpc/client.c | 369 ++++++++++++--- lustre/ptlrpc/events.c | 69 ++- lustre/ptlrpc/import.c | 308 +++++++++++-- lustre/ptlrpc/lproc_ptlrpc.c | 56 +++ lustre/ptlrpc/niobuf.c | 107 ++++- lustre/ptlrpc/pack_generic.c | 217 ++++++++- lustre/ptlrpc/pinger.c | 13 +- lustre/ptlrpc/ptlrpc_internal.h | 1 + lustre/ptlrpc/ptlrpc_module.c | 2 + lustre/ptlrpc/service.c | 644 ++++++++++++++++++++------- lustre/ptlrpc/wiretest.c | 38 +- lustre/tests/conf-sanity.sh | 27 +- lustre/tests/recovery-small.sh | 45 +- lustre/tests/replay-dual.sh | 2 +- lustre/tests/replay-single.sh | 144 +++++- lustre/tests/sanity.sh | 16 +- lustre/tests/sanityN.sh | 2 +- lustre/tests/test-framework.sh | 6 +- lustre/utils/wirecheck.c | 7 +- lustre/utils/wiretest.c | 38 +- 59 files changed, 2312 insertions(+), 650 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 8111ab87bf..8d68675bb9 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -14,6 +14,12 @@ * Recommended e2fsprogs version: 1.40.2-cfs1 * Note that reiserfs quotas are disabled on SLES 10 in this kernel. +Severity : enhancement +Bugzilla : 3055 +Description: Adaptive timeouts +Details : RPC timeouts adapt to changing server load and network + conditions to reduce resend attempts and improve recovery time. + Severity : major Bugzilla : 12932 Description: obd_health_check_timeout too short @@ -923,6 +929,7 @@ Bugzilla : 12860 Description: mds_lov_synchronize race leads to various problems Details : simultaneous MDT->OST connections at startup can cause the sync to abort, leaving the OSC in a bad state. + Severity : normal Bugzilla : 12975 Frequency : rare diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index a95cc61f47..cee0c21411 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -164,14 +164,14 @@ static inline lvfs_sbdev_type fsfilt_journal_sbdev(struct obd_device *obd, #define FSFILT_OP_JOIN 11 #define FSFILT_OP_NOOP 15 -#define fsfilt_check_slow(obd, start, timeout, msg) \ +#define fsfilt_check_slow(obd, start, msg) \ do { \ if (time_before(jiffies, start + 15 * HZ)) \ break; \ else if (time_before(jiffies, start + 30 * HZ)) \ CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \ msg, (jiffies-start) / HZ); \ - else if (time_before(jiffies, start + timeout / 2 * HZ)) \ + else if (time_before(jiffies, start + DISK_TIMEOUT * HZ)) \ CWARN("%s: slow %s %lus\n", obd->obd_name, msg, \ (jiffies - start) / HZ); \ else \ @@ -202,7 +202,7 @@ static inline void *fsfilt_start_log(struct obd_device *obd, LBUG(); } } - fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return handle; } @@ -237,7 +237,7 @@ static inline void *fsfilt_brw_start_log(struct obd_device *obd, int objcount, LBUG(); } } - fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return handle; } @@ -257,7 +257,7 @@ static inline int fsfilt_extend(struct obd_device *obd, struct inode *inode, int rc = obd->obd_fsops->fs_extend(inode, nblocks, handle); CDEBUG(D_INFO, "extending handle %p with %u blocks\n", handle, nblocks); - fsfilt_check_slow(obd, now, obd_timeout, "journal extend"); + fsfilt_check_slow(obd, now, "journal extend"); return rc; } @@ -269,7 +269,7 @@ static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode, int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync); CDEBUG(D_INFO, "committing handle %p\n", handle); - fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return rc; } @@ -282,7 +282,7 @@ static inline int fsfilt_commit_async(struct obd_device *obd, int rc = obd->obd_fsops->fs_commit_async(inode, handle, wait_handle); CDEBUG(D_INFO, "committing handle %p (async)\n", *wait_handle); - fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return rc; } @@ -293,7 +293,7 @@ static inline int fsfilt_commit_wait(struct obd_device *obd, unsigned long now = jiffies; int rc = obd->obd_fsops->fs_commit_wait(inode, handle); CDEBUG(D_INFO, "waiting for completion %p\n", handle); - fsfilt_check_slow(obd, now, obd_timeout, "journal start"); + fsfilt_check_slow(obd, now, "journal start"); return rc; } @@ -303,7 +303,7 @@ static inline int fsfilt_setattr(struct obd_device *obd, struct dentry *dentry, unsigned long now = jiffies; int rc; rc = obd->obd_fsops->fs_setattr(dentry, handle, iattr, do_trunc); - fsfilt_check_slow(obd, now, obd_timeout, "setattr"); + fsfilt_check_slow(obd, now, "setattr"); return rc; } diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index da1dc16eac..beba48e4b8 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -150,6 +150,23 @@ struct obd_device; struct file; struct obd_histogram; +/* Days / hours / mins / seconds format */ +struct dhms { + int d,h,m,s; +}; +static inline void s2dhms(struct dhms *ts, time_t secs) +{ + ts->d = secs / 86400; + secs = secs % 86400; + ts->h = secs / 3600; + secs = secs % 3600; + ts->m = secs / 60; + ts->s = secs % 60; +} +#define DHMS_FMT "%dd%dh%02dm%02ds" +#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s + + #ifdef LPROCFS /* Two optimized LPROCFS counter increment functions are provided: @@ -271,6 +288,13 @@ extern int lprocfs_rd_num_exports(char *page, char **start, off_t off, int count, int *eof, void *data); extern int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count, int *eof, void *data); +struct adaptive_timeout; +extern int lprocfs_at_hist_helper(char *page, int count, int rc, + struct adaptive_timeout *at); +extern int lprocfs_rd_timeouts(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_wr_timeouts(struct file *file, const char *buffer, + unsigned long count, void *data); extern int lprocfs_wr_evict_client(struct file *file, const char *buffer, unsigned long count, void *data); extern int lprocfs_wr_ping(struct file *file, const char *buffer, @@ -449,6 +473,16 @@ static inline int lprocfs_rd_num_exports(char *page, char **start, off_t off, static inline int lprocfs_rd_numrefs(char *page, char **start, off_t off, int count, int *eof, void *data) { return 0; } +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(char *page, int count, int rc, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_rd_timeouts(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ return 0; } +static inline int lprocfs_wr_timeouts(struct file *file, const char *buffer, + unsigned long count, void *data) +{ return 0; } static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer, unsigned long count, void *data) { return 0; } diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index e425c826ac..094998e470 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -188,7 +188,7 @@ struct lustre_msg_v2 { __u32 lm_secflvr; __u32 lm_magic; __u32 lm_repsize; - __u32 lm_timeout; + __u32 lm_cksum; __u32 lm_padding_1; __u32 lm_padding_2; __u32 lm_padding_3; @@ -209,11 +209,11 @@ struct ptlrpc_body { __u32 pb_flags; __u32 pb_op_flags; __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time */ __u32 pb_padding_1; __u32 pb_padding_2; __u32 pb_padding_3; - __u32 pb_padding_4; - __u32 pb_padding_5; }; extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); @@ -247,6 +247,7 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); #define MSG_LAST_REPLAY 1 #define MSG_RESENT 2 #define MSG_REPLAY 4 +#define MSG_AT_SUPPORT 8 /* * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) @@ -293,14 +294,14 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \ OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \ - OBD_CONNECT_CANCELSET) + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT) #define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ OBD_CONNECT_BRW_SIZE | OBD_CONNECT_QUOTA64 | \ - OBD_CONNECT_CANCELSET) + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT) #define ECHO_CONNECT_SUPPORTED (0) -#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION) +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT) #define MAX_QUOTA_COUNT32 ((0xffffffffULL >> QUOTABLOCK_BITS) << QUOTABLOCK_BITS) diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 672d045d53..42dd2a8cd4 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -8,6 +8,26 @@ #include <lustre_handles.h> #include <lustre/lustre_idl.h> + +/* Adaptive Timeout stuff */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_TIMEBASE_DEFAULT 600 /* remembered history (sec) (should be + evenly divisible by AT_BINS) */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ +#define AT_FLG_MIN 0x2 /* use a minimum limit */ + +struct adaptive_timeout { + time_t at_binstart; /* bin start time */ + time_t at_binlimit; /* bin time limit */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + unsigned int at_current; /* current timeout value */ + unsigned int at_worst_ever; /* worst-ever timeout value */ + time_t at_worst_time; /* worst-ever timeout timestamp */ + spinlock_t at_lock; +}; + enum lustre_imp_state { LUSTRE_IMP_CLOSED = 1, LUSTRE_IMP_NEW = 2, @@ -48,6 +68,14 @@ struct obd_import_conn { __u64 oic_last_attempt; /* jiffies, 64-bit */ }; +#define IMP_AT_MAX_PORTALS 4 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; + time_t iat_drain; /* hack to slow reconnect reqs */ +}; + struct obd_import { struct portals_handle imp_handle; atomic_t imp_refcount; @@ -106,25 +134,35 @@ struct obd_import { __u64 imp_connect_flags_orig; __u32 imp_msg_magic; + __u32 imp_msg_flags; /* adjusted based on server capability */ - struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ -}; - -typedef void (*obd_import_callback)(struct obd_import *imp, void *closure, - int event, void *event_arg, void *cb_data); + struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ -struct obd_import_observer { - struct list_head oio_chain; - obd_import_callback oio_cb; - void *oio_cb_data; + struct imp_at imp_at; /* adaptive timeout data */ + time_t imp_last_reply_time; /* for health check */ }; -void class_observe_import(struct obd_import *imp, obd_import_callback cb, - void *cb_data); -void class_unobserve_import(struct obd_import *imp, obd_import_callback cb, - void *cb_data); -void class_notify_import_observers(struct obd_import *imp, int event, - void *event_arg); +/* import.c */ +static inline void at_init(struct adaptive_timeout *at, int val, int timebase, + int flags) { + memset(at, 0, sizeof(*at)); + at->at_binlimit = timebase / AT_BINS; + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = cfs_time_current_sec(); + at->at_flags = flags; + spin_lock_init(&at->at_lock); +} +static inline int at_get(struct adaptive_timeout *at) { + extern unsigned int adaptive_timeout_max; + if (adaptive_timeout_max) + return min(at->at_current, adaptive_timeout_max); + return at->at_current; +} +void at_add(struct adaptive_timeout *at, unsigned int val); +int import_at_get_index(struct obd_import *imp, int portal); +int import_at_get_ldlm(struct obd_import *imp); +#define AT_OFF (adaptive_timeout_max == 0) /* genops.c */ struct obd_export; diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index 092b3a601e..1e1a116575 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -73,9 +73,8 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req); #endif void target_cancel_recovery_timer(struct obd_device *obd); - -#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) /* *waves hands* */ -void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler); +void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler, + struct ptlrpc_request *req); void target_abort_recovery(void *data); void target_cleanup_recovery(struct obd_device *obd); int target_queue_recovery_request(struct ptlrpc_request *req, diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 76a5eafda7..9153574fc3 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -255,11 +255,12 @@ struct ptlrpc_request_pool { struct ptlrpc_request { int rq_type; /* one of PTL_RPC_MSG_* */ struct list_head rq_list; + struct list_head rq_timed_list; /* server-side early replies */ struct list_head rq_history_list; /* server-side history */ __u64 rq_history_seq; /* history sequence # */ int rq_status; spinlock_t rq_lock; - /* client-side flags */ + /* client-side flags. */ unsigned int rq_intr:1, rq_replied:1, rq_err:1, rq_timedout:1, rq_resend:1, rq_restart:1, /* @@ -273,9 +274,12 @@ struct ptlrpc_request { */ rq_replay:1, rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, - rq_no_delay:1, rq_net_err:1; + rq_no_delay:1, rq_net_err:1, rq_early:1, rq_must_unlink:1, + /* server-side: */ + rq_final:1; /* packed final reply */ enum rq_phase rq_phase; /* one of RQ_PHASE_* */ - atomic_t rq_refcount; /* client-side refcount for SENT race */ + atomic_t rq_refcount; /* client-side refcount for SENT race, + server-side refcounf for multiple replies */ struct ptlrpc_thread *rq_svc_thread; /* initial thread servicing req */ @@ -287,8 +291,8 @@ struct ptlrpc_request { int rq_reqlen; struct lustre_msg *rq_reqmsg; - int rq_timeout; /* time to wait for reply (seconds) */ int rq_replen; + struct lustre_msg *rq_repbuf; /* client only, buf may be bigger than msg */ struct lustre_msg *rq_repmsg; __u64 rq_transno; __u64 rq_xid; @@ -300,6 +304,8 @@ struct ptlrpc_request { int rq_import_generation; enum lustre_imp_state rq_send_state; + int rq_early_count; /* how many early replies (for stats) */ + /* client+server request */ lnet_handle_md_t rq_req_md_h; struct ptlrpc_cb_id rq_req_cbid; @@ -307,6 +313,7 @@ struct ptlrpc_request { /* server-side... */ struct timeval rq_arrival_time; /* request arrival time */ struct ptlrpc_reply_state *rq_reply_state; /* separated reply state */ + struct semaphore rq_rs_sem; /* one reply at a time */ struct ptlrpc_request_buffer_desc *rq_rqbd; /* incoming request buffer*/ #ifdef CRAY_XT3 __u32 rq_uid; /* peer uid, used in MDS only */ @@ -327,7 +334,11 @@ struct ptlrpc_request { void *rq_cb_data; struct ptlrpc_bulk_desc *rq_bulk; /* client side bulk */ - time_t rq_sent; /* when request sent, seconds */ + + /* client outgoing req */ + time_t rq_sent; /* when request/reply sent (secs) */ + time_t rq_deadline; /* when request must finish */ + int rq_timeout; /* service time estimate (secs) */ /* Multi-rpc bits */ struct list_head rq_set_chain; @@ -526,13 +537,21 @@ struct ptlrpc_service { int srv_n_difficult_replies; /* # 'difficult' replies */ int srv_n_active_reqs; /* # reqs being served */ cfs_duration_t srv_rqbd_timeout; /* timeout before re-posting reqs, in tick */ - int srv_watchdog_timeout; /* soft watchdog timeout, in ms */ + int srv_watchdog_factor; /* soft watchdog timeout mutiplier */ unsigned srv_cpu_affinity:1; /* bind threads to CPUs */ + unsigned srv_at_check:1; /* check early replies */ __u32 srv_req_portal; __u32 srv_rep_portal; - - int srv_n_queued_reqs; /* # reqs waiting to be served */ + + /* AT stuff */ + struct adaptive_timeout srv_at_estimate;/* estimated service time */ + spinlock_t srv_at_lock; + struct list_head srv_at_list; /* reqs waiting for replies */ + cfs_timer_t srv_at_timer; /* early reply timer */ + + int srv_n_queued_reqs; /* # reqs in either of the queues below */ + struct list_head srv_req_in_queue; /* incoming reqs */ struct list_head srv_request_queue; /* reqs waiting for service */ struct list_head srv_request_history; /* request history */ @@ -624,10 +643,13 @@ static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc) return (rc); } -int ptlrpc_send_reply(struct ptlrpc_request *req, int); +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); int ptlrpc_reply(struct ptlrpc_request *req); int ptlrpc_error(struct ptlrpc_request *req); void ptlrpc_resend_req(struct ptlrpc_request *request); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); int ptl_send_rpc(struct ptlrpc_request *request, int noreply); int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd); @@ -648,6 +670,17 @@ ptlrpc_client_receiving_reply (struct ptlrpc_request *req) return (rc); } +static inline int +ptlrpc_client_must_unlink (struct ptlrpc_request *req) +{ + int rc; + + spin_lock(&req->rq_lock); + rc = req->rq_must_unlink; + spin_unlock(&req->rq_lock); + return (rc); +} + static inline int ptlrpc_client_replied (struct ptlrpc_request *req) { @@ -722,7 +755,7 @@ void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs); struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, int req_portal, int rep_portal, - int watchdog_timeout, /* in ms */ + int watchdog_factor, svc_handler_t, char *name, cfs_proc_dir_entry_t *proc_entry, svcreq_printfn_t, @@ -759,10 +792,14 @@ int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, int *lens, char **bufs); int lustre_pack_reply(struct ptlrpc_request *, int count, int *lens, char **bufs); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, int *lens, + char **bufs, int flags); void lustre_shrink_reply(struct ptlrpc_request *req, int segment, unsigned int newlen, int move_data); void lustre_free_reply_state(struct ptlrpc_reply_state *rs); int lustre_msg_size(__u32 magic, int count, int *lengths); +int lustre_msg_early_size(void); int lustre_unpack_msg(struct lustre_msg *m, int len); void *lustre_msg_buf(struct lustre_msg *m, int n, int minlen); int lustre_msg_buflen(struct lustre_msg *m, int n); @@ -791,7 +828,12 @@ __u64 lustre_msg_get_last_committed(struct lustre_msg *msg); __u64 lustre_msg_get_transno(struct lustre_msg *msg); int lustre_msg_get_status(struct lustre_msg *msg); __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); +int lustre_msg_is_v1(struct lustre_msg *msg); __u32 lustre_msg_get_magic(struct lustre_msg *msg); +__u32 lustre_msg_get_timeout(struct lustre_msg *msg); +__u32 lustre_msg_get_service_time(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct lustre_msg *msg); +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg); void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle); void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); @@ -800,6 +842,9 @@ void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed); void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); static inline void ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) @@ -816,6 +861,24 @@ ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) lustre_free_reply_state(rs); } +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; + req->rq_repmsg = NULL; + up(&req->rq_rs_sem); /* held since lustre_pack_reply */ +} + +/* Check if we already packed a normal (non-early) reply. + Single thread only! */ +static inline int lustre_packed_reply(struct ptlrpc_request *req) +{ + return req->rq_final; +} + static inline __u32 lustre_request_magic(struct ptlrpc_request *req) { return lustre_msg_get_magic(req->rq_reqmsg); @@ -839,9 +902,10 @@ static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) static inline void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, int *lens) { - req->rq_replen = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + int size = lustre_msg_size(req->rq_reqmsg->lm_magic, count, lens); + req->rq_replen = size + lustre_msg_early_size(); if (req->rq_reqmsg->lm_magic == LUSTRE_MSG_MAGIC_V2) - req->rq_reqmsg->lm_repsize = req->rq_replen; + req->rq_reqmsg->lm_repsize = size; } /* ldlm/ldlm_lib.c */ diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 817058d15d..6551ac290c 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -613,7 +613,7 @@ static inline void oti_init(struct obd_trans_info *oti, oti->oti_xid = req->rq_xid; - if (req->rq_repmsg && req->rq_reqmsg != 0) + if (req->rq_reqmsg && req->rq_repmsg && req->rq_reply_state) oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); oti->oti_thread_id = req->rq_svc_thread ? req->rq_svc_thread->t_id : -1; oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); @@ -767,7 +767,8 @@ struct obd_device { struct list_head obd_recovery_queue; struct list_head obd_delayed_reply_queue; time_t obd_recovery_start; - time_t obd_recovery_end; + time_t obd_recovery_end; /* for lprocfs_status */ + int obd_recovery_timeout; union { struct obd_device_target obt; diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 5db0110698..f1c189624a 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -34,11 +34,12 @@ extern unsigned int obd_fail_val; extern unsigned int obd_debug_peer_on_timeout; extern unsigned int obd_dump_on_timeout; extern unsigned int obd_dump_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ extern unsigned int obd_timeout; /* seconds */ -#define PING_INTERVAL max(obd_timeout / 4, 1U) -#define RECONNECT_INTERVAL max(obd_timeout / 10, 10U) -extern unsigned int ldlm_timeout; -extern unsigned int obd_health_check_timeout; +extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int adaptive_timeout_min; /* seconds */ +extern unsigned int adaptive_timeout_max; /* seconds */ extern unsigned int obd_sync_filter; extern unsigned int obd_max_dirty_pages; extern atomic_t obd_dirty_pages; @@ -47,11 +48,31 @@ extern int obd_race_state; extern unsigned int obd_alloc_fail_rate; /* Timeout definitions */ -#define LDLM_TIMEOUT_DEFAULT 20 #define OBD_TIMEOUT_DEFAULT 100 -#define HEALTH_CHECK_COEF 3 / 2 -#define HEALTH_CHECK_TIMEOUT_DEFAULT (OBD_TIMEOUT_DEFAULT * HEALTH_CHECK_COEF) -#define HEALTH_CHECK_TIMEOUT (obd_timeout * HEALTH_CHECK_COEF) +#define LDLM_TIMEOUT_DEFAULT 20 +/* Time to wait for all clients to reconnect during recovery */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_TIMEOUT (obd_timeout * 5 / 2) +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* Client may skip 1 ping; wait for 2.5 */ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 5 / 2) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5 /* Connection switching rate limiter */ +#ifndef CRAY_XT3 +/* In general this should be low to have quick detection of a system + running on a backup server. */ +#define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/20) +#else +/* ...but for very large systems (e.g. CRAY) we need to keep the initial + connect t.o. high (bz 10803), because they will nearly ALWAYS be doing the + connects for the first time (clients "reboot" after every process, so no + chance to generate adaptive timeout data. */ +#define INITIAL_CONNECT_TIMEOUT max_t(int,CONNECTION_SWITCH_MIN,obd_timeout/2) +#endif +#define LND_TIMEOUT 50 /* LNET LND-level RPC timeout */ +#define FOREVER 300 /* Something taking this long is broken */ + #define OBD_FAIL_MDS 0x100 #define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 @@ -131,7 +152,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_OST_BRW_READ_BULK 0x20f #define OBD_FAIL_OST_SYNC_NET 0x210 #define OBD_FAIL_OST_ALL_REPLY_NET 0x211 -#define OBD_FAIL_OST_ALL_REQUESTS_NET 0x212 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 #define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 #define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 #define OBD_FAIL_OST_ENOSPC 0x215 @@ -147,6 +168,7 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f #define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 #define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 #define OBD_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 @@ -187,6 +209,8 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 #define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c #define OBD_FAIL_OBD_PING_NET 0x600 #define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 @@ -208,9 +232,9 @@ extern unsigned int obd_alloc_fail_rate; #define OBD_FAIL_MGS 0x900 #define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 #define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 -#define OBD_FAIL_MGC_PROCESS_LOG 0x903 -#define OBD_FAIL_MGS_SLOW_REQUEST_NET 0x904 -#define OBD_FAIL_MGS_SLOW_TARGET_REG 0x905 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 #define OBD_FAIL_QUOTA_QD_COUNT_32BIT 0xA00 diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index 2ea0d2635a..48229aad48 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -729,15 +729,15 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) OBD_FAIL_TIMEOUT(OBD_FAIL_TGT_DELAY_CONNECT, 2 * obd_timeout); } else if (req->rq_export == NULL && atomic_read(&export->exp_rpc_count) > 0) { - CWARN("%s: refuse connection from %s/%s to 0x%p/%d\n", - target->obd_name, cluuid.uuid, + CWARN("%s: refuse connection from %s/%s to 0x%p; still busy " + "with %d references\n", target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), export, atomic_read(&export->exp_refcount)); GOTO(out, rc = -EBUSY); } else if (req->rq_export != NULL && atomic_read(&export->exp_rpc_count) > 1) { - CWARN("%s: refuse reconnection from %s@%s to 0x%p/%d\n", - target->obd_name, cluuid.uuid, + CWARN("%s: refuse reconnection from %s@%s to 0x%p; still busy " + "with %d active RPCs\n", target->obd_name, cluuid.uuid, libcfs_nid2str(req->rq_peer.nid), export, atomic_read(&export->exp_rpc_count)); GOTO(out, rc = -EBUSY); @@ -763,7 +763,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) /* If this is the first client, start the recovery timer */ if (target->obd_recovering) { lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); - target_start_recovery_timer(target, handler); + target_start_recovery_timer(target, handler, req); } /* Tell the client if we support replayable requests */ @@ -862,8 +862,10 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) &export->exp_nid_hash); spin_unlock(&target->obd_dev_lock); - if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) - GOTO(out, rc = 0); + if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) { + revimp = class_import_get(export->exp_imp_reverse); + GOTO(set_flags, rc = 0); + } if (target->obd_recovering) target->obd_connected_clients++; @@ -880,10 +882,13 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) revimp->imp_remote_handle = conn; revimp->imp_dlm_fake = 1; revimp->imp_state = LUSTRE_IMP_FULL; - +set_flags: if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_NEXT_VER) { + /* Client wants v2 */ revimp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_NEXT_VER); + if (export->exp_connect_flags & OBD_CONNECT_AT) + revimp->imp_msg_flags |= MSG_AT_SUPPORT; } class_import_put(revimp); @@ -892,7 +897,7 @@ out: spin_lock(&export->exp_lock); export->exp_connecting = 0; spin_unlock(&export->exp_lock); - } + } if (targref) class_decref(targref); if (rc) @@ -934,11 +939,7 @@ void target_destroy_export(struct obd_export *exp) static void target_release_saved_req(struct ptlrpc_request *req) { - if (req->rq_reply_state != NULL) { - ptlrpc_rs_decref(req->rq_reply_state); - /* req->rq_reply_state = NULL; */ - } - + ptlrpc_req_drop_rs(req); class_export_put(req->rq_export); OBD_FREE(req->rq_reqmsg, req->rq_reqlen); OBD_FREE(req, sizeof *req); @@ -968,7 +969,7 @@ static void target_finish_recovery(struct obd_device *obd) ptlrpc_reply(req); target_release_saved_req(req); } - obd->obd_recovery_end = CURRENT_SECONDS; + obd->obd_recovery_end = cfs_time_current_sec(); } static void abort_recovery_queue(struct obd_device *obd) @@ -1080,38 +1081,44 @@ void target_cancel_recovery_timer(struct obd_device *obd) cfs_timer_disarm(&obd->obd_recovery_timer); } -static void reset_recovery_timer(struct obd_device *obd) +static void reset_recovery_timer(struct obd_device *obd, + struct ptlrpc_request *req, int first) { spin_lock_bh(&obd->obd_processing_task_lock); if (!obd->obd_recovering) { spin_unlock_bh(&obd->obd_processing_task_lock); return; } + /* Track the client's largest expected replay time */ + obd->obd_recovery_timeout = + max((first ? (int)OBD_RECOVERY_TIMEOUT : + obd->obd_recovery_timeout), + (int)lustre_msg_get_timeout(req->rq_reqmsg)); cfs_timer_arm(&obd->obd_recovery_timer, - cfs_time_shift(OBD_RECOVERY_TIMEOUT)); + cfs_time_shift(obd->obd_recovery_timeout)); spin_unlock_bh(&obd->obd_processing_task_lock); - CDEBUG(D_HA, "%s: timer will expire in %u seconds\n", obd->obd_name, - OBD_RECOVERY_TIMEOUT); - /* Only used for lprocfs_status */ - obd->obd_recovery_end = CURRENT_SECONDS + OBD_RECOVERY_TIMEOUT; + CDEBUG(D_HA, "%s: recovery timer will expire in %u seconds\n", + obd->obd_name, obd->obd_recovery_timeout); + obd->obd_recovery_end = cfs_time_current_sec() + + obd->obd_recovery_timeout; } /* Only start it the first time called */ -void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler) +void target_start_recovery_timer(struct obd_device *obd, svc_handler_t handler, + struct ptlrpc_request *req) { spin_lock_bh(&obd->obd_processing_task_lock); if (obd->obd_recovery_handler) { spin_unlock_bh(&obd->obd_processing_task_lock); return; } - CWARN("%s: starting recovery timer (%us)\n", obd->obd_name, - OBD_RECOVERY_TIMEOUT); + CWARN("%s: starting recovery timer\n", obd->obd_name); obd->obd_recovery_handler = handler; cfs_timer_init(&obd->obd_recovery_timer, target_recovery_expired, obd); spin_unlock_bh(&obd->obd_processing_task_lock); - reset_recovery_timer(obd); + reset_recovery_timer(obd, req, 1); } static int check_for_next_transno(struct obd_device *obd) @@ -1193,14 +1200,11 @@ static void process_recovery_queue(struct obd_device *obd) DEBUG_REQ(D_HA, req, "processing: "); (void)obd->obd_recovery_handler(req); obd->obd_replayed_requests++; - reset_recovery_timer(obd); + reset_recovery_timer(obd, req, 0); /* bug 1580: decide how to properly sync() in recovery */ //mds_fsync_super(obd->u.obt.obt_sb); class_export_put(req->rq_export); - if (req->rq_reply_state != NULL) { - ptlrpc_rs_decref(req->rq_reply_state); - /* req->rq_reply_state = NULL; */ - } + ptlrpc_req_drop_rs(req); OBD_FREE(req->rq_reqmsg, req->rq_reqlen); OBD_FREE(req, sizeof *req); spin_lock_bh(&obd->obd_processing_task_lock); @@ -1332,12 +1336,13 @@ int target_queue_final_reply(struct ptlrpc_request *req, int rc) struct lustre_msg *reqmsg; int recovery_done = 0; - LASSERT ((rc == 0) == (req->rq_reply_state != NULL)); + LASSERT ((rc == 0) == lustre_packed_reply(req)); - if (rc) { + if (!lustre_packed_reply(req)) { /* Just like ptlrpc_error, but without the sending. */ rc = lustre_pack_reply(req, 1, NULL, NULL); - LASSERT(rc == 0); /* XXX handle this */ + if (rc) + CERROR("pack error %d\n", rc); req->rq_type = PTL_RPC_MSG_ERR; } @@ -1417,7 +1422,7 @@ target_send_reply_msg (struct ptlrpc_request *req, int rc, int fail_id) DEBUG_REQ(D_NET, req, "sending reply"); } - return (ptlrpc_send_reply(req, 1)); + return (ptlrpc_send_reply(req, PTLRPC_REPLY_MAYBE_DIFFICULT)); } void diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index 7fd3d0b70d..15bd1b874e 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -58,10 +58,10 @@ inline cfs_time_t round_timeout(cfs_time_t timeout) return cfs_time_seconds((int)cfs_duration_sec(cfs_time_sub(timeout, 0)) + 1); } -/* timeout for initial callback (AST) reply */ -static inline unsigned int ldlm_get_rq_timeout(unsigned int ldlm_timeout, - unsigned int obd_timeout) +/* timeout for initial callback (AST) reply (bz10399) */ +static inline unsigned int ldlm_get_rq_timeout(void) { + /* Non-AT value */ unsigned int timeout = min(ldlm_timeout, obd_timeout / 3); return timeout < 1 ? 1 : timeout; @@ -204,15 +204,15 @@ static void waiting_locks_callback(unsigned long unused) lock = list_entry(waiting_locks_list.next, struct ldlm_lock, l_pending_chain); - if (cfs_time_after(lock->l_callback_timeout, cfs_time_current()) || - (lock->l_req_mode == LCK_GROUP)) + if (cfs_time_after(lock->l_callback_timeout, cfs_time_current()) + || (lock->l_req_mode == LCK_GROUP)) break; - LDLM_ERROR(lock, "lock callback timer expired: evicting client " - "%s@%s nid %s ",lock->l_export->exp_client_uuid.uuid, - lock->l_export->exp_connection->c_remote_uuid.uuid, - libcfs_nid2str(lock->l_export->exp_connection->c_peer.nid)); - + LDLM_ERROR(lock, "lock callback timer expired after %lds: " + "evicting client at %s ", + cfs_time_current_sec()- lock->l_enqueued_time.tv_sec, + libcfs_nid2str( + lock->l_export->exp_connection->c_peer.nid)); if (lock == last) { LDLM_ERROR(lock, "waiting on lock multiple times"); CERROR("wll %p n/p %p/%p, l_pending %p n/p %p/%p\n", @@ -270,20 +270,23 @@ static void waiting_locks_callback(unsigned long unused) */ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock) { + int timeout = obd_timeout / 2; /* Non-AT value */ cfs_time_t timeout_rounded; if (!list_empty(&lock->l_pending_chain)) return 0; - lock->l_callback_timeout =cfs_time_add(cfs_time_current(), - cfs_time_seconds(obd_timeout)/2); + if (lock->l_export && !AT_OFF) + timeout = import_at_get_ldlm(lock->l_export->exp_imp_reverse); + + lock->l_callback_timeout = cfs_time_shift(timeout); timeout_rounded = round_timeout(lock->l_callback_timeout); - if (cfs_time_before(timeout_rounded, cfs_timer_deadline(&waiting_locks_timer)) || + if (cfs_time_before(timeout_rounded, + cfs_timer_deadline(&waiting_locks_timer)) || !cfs_timer_is_armed(&waiting_locks_timer)) { cfs_timer_arm(&waiting_locks_timer, timeout_rounded); - } list_add_tail(&lock->l_pending_chain, &waiting_locks_list); /* FIFO */ return 1; @@ -614,7 +617,9 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, } req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); + /* ptlrpc_prep_req already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); if (lock->l_export && lock->l_export->exp_ldlm_stats) lprocfs_counter_incr(lock->l_export->exp_ldlm_stats, @@ -644,7 +649,8 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) total_enqueue_wait = cfs_timeval_sub(&granted_time, &lock->l_enqueued_time, NULL); - if (total_enqueue_wait / 1000000 > obd_timeout) + if (total_enqueue_wait / ONE_MILLION > obd_timeout) + /* non-fatal with AT - change to LDLM_DEBUG? */ LDLM_ERROR(lock, "enqueue wait took %luus from %lu", total_enqueue_wait, lock->l_enqueued_time.tv_sec); @@ -687,7 +693,9 @@ int ldlm_server_completion_ast(struct ldlm_lock *lock, int flags, void *data) ptlrpc_req_set_repsize(req, 1, NULL); req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); + /* ptlrpc_prep_req already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); /* We only send real blocking ASTs after the lock is granted */ lock_res_and_lock(lock); @@ -750,7 +758,9 @@ int ldlm_server_glimpse_ast(struct ldlm_lock *lock, void *data) ptlrpc_req_set_repsize(req, 2, size); req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_timeout = ldlm_get_rq_timeout(ldlm_timeout, obd_timeout); + /* ptlrpc_prep_req already set timeout */ + if (AT_OFF) + req->rq_timeout = ldlm_get_rq_timeout(); if (lock->l_export && lock->l_export->exp_ldlm_stats) lprocfs_counter_incr(lock->l_export->exp_ldlm_stats, @@ -1010,7 +1020,7 @@ existing_lock: EXIT; out: req->rq_status = rc ?: err; /* return either error - bug 11190 */ - if (req->rq_reply_state == NULL) { + if (!lustre_packed_reply(req)) { err = lustre_pack_reply(req, 1, NULL, NULL); if (rc == 0) rc = err; @@ -1340,7 +1350,7 @@ static void ldlm_handle_gl_callback(struct ptlrpc_request *req, static int ldlm_callback_reply(struct ptlrpc_request *req, int rc) { req->rq_status = rc; - if (req->rq_reply_state == NULL) { + if (!lustre_packed_reply(req)) { rc = lustre_pack_reply(req, 1, NULL, NULL); if (rc) return rc; @@ -1727,7 +1737,7 @@ static int ldlm_setup(void) ldlm_state->ldlm_cb_service = ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, LDLM_MAXREPSIZE, LDLM_CB_REQUEST_PORTAL, - LDLM_CB_REPLY_PORTAL, ldlm_timeout * 900, + LDLM_CB_REPLY_PORTAL, 1800, ldlm_callback_handler, "ldlm_cbd", ldlm_svc_proc_dir, NULL, LDLM_THREADS_AUTO_MIN, LDLM_THREADS_AUTO_MAX, @@ -1741,7 +1751,7 @@ static int ldlm_setup(void) ldlm_state->ldlm_cancel_service = ptlrpc_init_svc(LDLM_NBUFS, LDLM_BUFSIZE, LDLM_MAXREQSIZE, LDLM_MAXREPSIZE, LDLM_CANCEL_REQUEST_PORTAL, - LDLM_CANCEL_REPLY_PORTAL, ldlm_timeout * 6000, + LDLM_CANCEL_REPLY_PORTAL, 6000, ldlm_cancel_handler, "ldlm_canceld", ldlm_svc_proc_dir, NULL, LDLM_THREADS_AUTO_MIN, LDLM_THREADS_AUTO_MAX, diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index c1b0030269..d9f5dcf9bd 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -61,7 +61,8 @@ int ldlm_expired_completion_wait(void *data) LDLM_ERROR(lock, "lock timed out (enqueued at %lu, %lus ago); " "not entering recovery in server code, just going " "back to sleep", lock->l_enqueued_time.tv_sec, - CURRENT_SECONDS - lock->l_enqueued_time.tv_sec); + cfs_time_current_sec() - + lock->l_enqueued_time.tv_sec); if (cfs_time_after(cfs_time_current(), next_dump)) { last_dump = next_dump; next_dump = cfs_time_shift(300); @@ -91,6 +92,7 @@ int ldlm_completion_ast(struct ldlm_lock *lock, int flags, void *data) struct obd_device *obd; struct obd_import *imp = NULL; struct l_wait_info lwi; + __u32 timeout = obd_timeout; /* Non-AT value */ int rc = 0; ENTRY; @@ -115,8 +117,10 @@ noreproc: obd = class_exp2obd(lock->l_conn_export); /* if this is a local lock, then there is no import */ - if (obd != NULL) + if (obd != NULL) { imp = obd->u.cli.cl_import; + timeout = import_at_get_ldlm(imp); + } lwd.lwd_lock = lock; @@ -124,7 +128,7 @@ noreproc: LDLM_DEBUG(lock, "waiting indefinitely because of NO_TIMEOUT"); lwi = LWI_INTR(interrupted_completion_wait, &lwd); } else { - lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(obd_timeout), + lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), ldlm_expired_completion_wait, interrupted_completion_wait, &lwd); } diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index 4da1a09b4c..9c196657ba 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -155,7 +155,7 @@ int liblustre_process_log(struct config_llog_instance *cfg, if (ocd == NULL) GOTO(out_cleanup, rc = -ENOMEM); - ocd->ocd_connect_flags = OBD_CONNECT_VERSION; + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT; ocd->ocd_version = LUSTRE_VERSION_CODE; rc = obd_connect(&mgc_conn, obd, &mgc_uuid, ocd); @@ -286,7 +286,7 @@ int _sysio_lustre_init(void) obd_timeout); } - /* debug peer on timeout? */ + /* debug peer on timeout? */ envstr = getenv("LIBLUSTRE_DEBUG_PEER_ON_TIMEOUT"); if (envstr != NULL) { obd_debug_peer_on_timeout = diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index fa7ff8f5c0..eaa21ec895 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -1935,7 +1935,8 @@ llu_fsswop_mount(const char *source, obd_set_info_async(obd->obd_self_export, strlen("async"), "async", sizeof(async), &async, NULL); - ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION; + ocd.ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_VERSION | + OBD_CONNECT_AT; ocd.ocd_ibits_known = MDS_INODELOCK_FULL; ocd.ocd_version = LUSTRE_VERSION_CODE; @@ -1968,7 +1969,7 @@ llu_fsswop_mount(const char *source, obd->obd_upcall.onu_upcall = ll_ocd_update; ocd.ocd_connect_flags = OBD_CONNECT_SRVLOCK | OBD_CONNECT_REQPORTAL | - OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK; + OBD_CONNECT_VERSION | OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_AT; ocd.ocd_version = LUSTRE_VERSION_CODE; err = obd_connect(&osc_conn, obd, &sbi->ll_sb_uuid, &ocd); if (err) { diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index 1ef3295175..0719ac2019 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -156,10 +156,9 @@ static int client_common_fill_super(struct super_block *sb, } /* indicate the features supported by this client */ - data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH | - OBD_CONNECT_JOIN | - OBD_CONNECT_ATTRFID | OBD_CONNECT_VERSION | - OBD_CONNECT_CANCELSET; + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_IBITS | + OBD_CONNECT_JOIN | OBD_CONNECT_ATTRFID | OBD_CONNECT_NODEVOH | + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT; #ifdef CONFIG_FS_POSIX_ACL data->ocd_connect_flags |= OBD_CONNECT_ACL; #endif @@ -257,9 +256,9 @@ static int client_common_fill_super(struct super_block *sb, GOTO(out_mdc, err = -ENODEV); } - data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | - OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | - OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET; + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_GRANT | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_SRVLOCK | OBD_CONNECT_CANCELSET | OBD_CONNECT_AT; CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d " "ocd_grant: %d\n", data->ocd_connect_flags, diff --git a/lustre/mdc/lproc_mdc.c b/lustre/mdc/lproc_mdc.c index 27107cd12b..cfba05b226 100644 --- a/lustre/mdc/lproc_mdc.c +++ b/lustre/mdc/lproc_mdc.c @@ -74,11 +74,12 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "kbytesavail", lprocfs_rd_kbytesavail, 0, 0 }, { "filestotal", lprocfs_rd_filestotal, 0, 0 }, { "filesfree", lprocfs_rd_filesfree, 0, 0 }, - //{ "filegroups", lprocfs_rd_filegroups, 0, 0 }, + /*{ "filegroups", lprocfs_rd_filegroups, 0, 0 },*/ { "mds_server_uuid", lprocfs_rd_server_uuid, 0, 0 }, { "mds_conn_uuid", lprocfs_rd_conn_uuid, 0, 0 }, { "max_rpcs_in_flight", mdc_rd_max_rpcs_in_flight, mdc_wr_max_rpcs_in_flight, 0 }, + { "timeouts", lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 }, { 0 } }; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 4fb8b47ffd..db7b876998 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -79,6 +79,7 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, struct ptlrpc_bulk_desc *desc; struct l_wait_info lwi; struct page **pages; + int timeout; int rc = 0, npages, i, tmpcount, tmpsize = 0; ENTRY; @@ -130,7 +131,12 @@ static int mds_sendpage(struct ptlrpc_request *req, struct file *file, GOTO(abort_bulk, rc); } - lwi = LWI_TIMEOUT(obd_timeout * HZ / 4, NULL, NULL); + timeout = (int)req->rq_deadline - (int)cfs_time_current_sec(); + if (timeout < 0) { + CERROR("Req deadline already passed %lu (now: %lu)\n", + req->rq_deadline, cfs_time_current_sec()); + } + lwi = LWI_TIMEOUT(max(timeout, 1) * HZ, NULL, NULL); rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), &lwi); LASSERT (rc == 0 || rc == -ETIMEDOUT); @@ -1022,7 +1028,7 @@ static int mds_getattr_lock(struct ptlrpc_request *req, int offset, pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); default: mds_exit_ucred(&uc, mds); - if (req->rq_reply_state == NULL) { + if (!lustre_packed_reply(req)) { req->rq_status = rc; lustre_pack_reply(req, 1, NULL, NULL); } @@ -1072,7 +1078,7 @@ static int mds_getattr(struct ptlrpc_request *req, int offset) out_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, &uc); out_ucred: - if (req->rq_reply_state == NULL) { + if (!lustre_packed_reply(req)) { req->rq_status = rc; lustre_pack_reply(req, 1, NULL, NULL); } @@ -1097,13 +1103,15 @@ static int mds_obd_statfs(struct obd_device *obd, struct obd_statfs *osfs, static int mds_statfs(struct ptlrpc_request *req) { struct obd_device *obd = req->rq_export->exp_obd; + struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; int rc, size[2] = { sizeof(struct ptlrpc_body), sizeof(struct obd_statfs) }; ENTRY; /* This will trigger a watchdog timeout */ OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_STATFS_LCW_SLEEP, - (MDS_SERVICE_WATCHDOG_TIMEOUT / 1000) + 1); + (MDS_SERVICE_WATCHDOG_FACTOR * + at_get(&svc->srv_at_estimate) / 1000) + 1); OBD_COUNTER_INCREMENT(obd, statfs); rc = lustre_pack_reply(req, 2, size, NULL); @@ -2018,8 +2026,8 @@ static int mds_setup(struct obd_device *obd, obd_count len, void *buf) obd->obd_recoverable_clients, (obd->obd_recoverable_clients == 1) ? "client" : "clients", - (int)(OBD_RECOVERY_TIMEOUT) / 60, - (int)(OBD_RECOVERY_TIMEOUT) % 60, + obd->obd_recovery_timeout / 60, + obd->obd_recovery_timeout % 60, obd->obd_name); } else { LCONSOLE_INFO("MDT %s now serving %s (%s%s%s) with recovery " @@ -2539,7 +2547,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) mds->mds_service = ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_MAXREPSIZE, MDS_REQUEST_PORTAL, - MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, + MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR, mds_handle, LUSTRE_MDS_NAME, obd->obd_proc_entry, NULL, mds_min_threads, mds_max_threads, "ll_mdt"); @@ -2556,7 +2564,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) mds->mds_setattr_service = ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_MAXREPSIZE, MDS_SETATTR_PORTAL, - MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, + MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR, mds_handle, "mds_setattr", obd->obd_proc_entry, NULL, mds_min_threads, mds_max_threads, @@ -2573,7 +2581,7 @@ static int mdt_setup(struct obd_device *obd, obd_count len, void *buf) mds->mds_readpage_service = ptlrpc_init_svc(MDS_NBUFS, MDS_BUFSIZE, MDS_MAXREQSIZE, MDS_MAXREPSIZE, MDS_READPAGE_PORTAL, - MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_TIMEOUT, + MDC_REPLY_PORTAL, MDS_SERVICE_WATCHDOG_FACTOR, mds_handle, "mds_readpage", obd->obd_proc_entry, NULL, MDS_THREADS_MIN_READPAGE, mds_max_threads, diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index 70fb74285a..2c34203fa7 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -446,9 +446,9 @@ static int mds_init_server_data(struct obd_device *obd, struct file *file) obd->obd_next_recovery_transno = obd->obd_last_committed + 1; obd->obd_recovering = 1; obd->obd_recovery_start = CURRENT_SECONDS; - /* Only used for lprocfs_status */ + obd->obd_recovery_timeout = OBD_RECOVERY_TIMEOUT; obd->obd_recovery_end = obd->obd_recovery_start + - OBD_RECOVERY_TIMEOUT; + obd->obd_recovery_timeout; } mds->mds_mount_count = mount_count + 1; diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index c5355e9683..d7c0db5e80 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -26,7 +26,7 @@ struct mds_client_data { __u8 mcd_padding[LR_CLIENT_SIZE - 88]; }; -#define MDS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000) +#define MDS_SERVICE_WATCHDOG_FACTOR 2000 #define MAX_ATIME_DIFF 60 diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index e2cb1c8048..9633c97cb3 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -337,7 +337,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) if (data == NULL) RETURN(-ENOMEM); data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX | - OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64; + OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64 | OBD_CONNECT_AT; data->ocd_version = LUSTRE_VERSION_CODE; /* NB: lov_connect() needs to fill in .ocd_index for each OST */ rc = obd_connect(&conn, mds->mds_osc_obd, &obd->obd_uuid, data); @@ -575,8 +575,8 @@ int mds_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *handle; struct inode *inode = obd->u.obt.obt_sb->s_root->d_inode; BDEVNAME_DECLARE_STORAGE(tmp); - CERROR("*** setting device %s read-only ***\n", - ll_bdevname(obd->u.obt.obt_sb, tmp)); + LCONSOLE_WARN("*** setting obd %s device '%s' read-only ***\n", + obd->obd_name, ll_bdevname(obd->u.obt.obt_sb, tmp)); handle = fsfilt_start(obd, inode, FSFILT_OP_MKNOD, NULL); if (!IS_ERR(handle)) diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index 762cbd8bf7..6020017b68 100644 --- a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -1051,7 +1051,7 @@ static int mgc_process_log(struct obd_device *mgc, if (cld->cld_stopping) RETURN(0); - OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PROCESS_LOG, 20); + OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); lsi = s2lsi(cld->cld_cfg.cfg_sb); diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c index e93134dfc7..e17634d1da 100644 --- a/lustre/mgs/mgs_handler.c +++ b/lustre/mgs/mgs_handler.c @@ -173,7 +173,7 @@ static int mgs_setup(struct obd_device *obd, obd_count len, void *buf) mgs->mgs_service = ptlrpc_init_svc(MGS_NBUFS, MGS_BUFSIZE, MGS_MAXREQSIZE, MGS_MAXREPSIZE, MGS_REQUEST_PORTAL, - MGC_REPLY_PORTAL, MGS_SERVICE_WATCHDOG_TIMEOUT, + MGC_REPLY_PORTAL, 2000, mgs_handle, LUSTRE_MGS_NAME, obd->obd_proc_entry, NULL, MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX, @@ -379,7 +379,7 @@ static int mgs_handle_target_reg(struct ptlrpc_request *req) obd->obd_name, lockrc); } - OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_TARGET_REG, 10); + OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_PAUSE_TARGET_REG, 10); /* Log writing contention is handled by the fsdb_sem */ @@ -457,8 +457,8 @@ int mgs_handle(struct ptlrpc_request *req) int opc, rc = 0; ENTRY; - OBD_FAIL_TIMEOUT(OBD_FAIL_MGS_SLOW_REQUEST_NET, 2); - OBD_FAIL_RETURN(OBD_FAIL_MGS_ALL_REQUEST_NET | OBD_FAIL_ONCE, 0); + OBD_FAIL_TIMEOUT_MS(OBD_FAIL_MGS_PAUSE_REQ, obd_fail_val); + OBD_FAIL_RETURN(OBD_FAIL_MGS_ALL_REQUEST_NET, 0); LASSERT(current->journal_info == NULL); opc = lustre_msg_get_opc(req->rq_reqmsg); diff --git a/lustre/mgs/mgs_internal.h b/lustre/mgs/mgs_internal.h index 81c94ad99f..5647659d4e 100644 --- a/lustre/mgs/mgs_internal.h +++ b/lustre/mgs/mgs_internal.h @@ -16,9 +16,6 @@ #include <lustre_export.h> -/* in ms */ -#define MGS_SERVICE_WATCHDOG_TIMEOUT (obd_timeout * 1000) - /* mgs_llog.c */ int class_dentry_readdir(struct obd_device *obd, struct dentry *dir, struct vfsmount *inmnt, diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index fbfcd096c4..17e174b219 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -63,9 +63,10 @@ unsigned int obd_alloc_fail_rate; unsigned int obd_debug_peer_on_timeout; unsigned int obd_dump_on_timeout; unsigned int obd_dump_on_eviction; -unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ -unsigned int obd_health_check_timeout = HEALTH_CHECK_TIMEOUT_DEFAULT; /* seconds */ +unsigned int adaptive_timeout_min = 10; /* seconds */ +unsigned int adaptive_timeout_max = 600; /* seconds */ unsigned int obd_max_dirty_pages = 256; atomic_t obd_dirty_pages; @@ -384,7 +385,8 @@ EXPORT_SYMBOL(obd_dump_on_timeout); EXPORT_SYMBOL(obd_dump_on_eviction); EXPORT_SYMBOL(obd_timeout); EXPORT_SYMBOL(ldlm_timeout); -EXPORT_SYMBOL(obd_health_check_timeout); +EXPORT_SYMBOL(adaptive_timeout_min); +EXPORT_SYMBOL(adaptive_timeout_max); EXPORT_SYMBOL(obd_max_dirty_pages); EXPORT_SYMBOL(obd_dirty_pages); EXPORT_SYMBOL(ptlrpc_put_connection_superhack); diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c index f04e3ae31d..86cafe346f 100644 --- a/lustre/obdclass/genops.c +++ b/lustre/obdclass/genops.c @@ -670,7 +670,7 @@ struct obd_export *class_new_export(struct obd_device *obd, CFS_INIT_LIST_HEAD(&export->exp_handle.h_link); class_handle_hash(&export->exp_handle, export_handle_addref); - export->exp_last_request_time = CURRENT_SECONDS; + export->exp_last_request_time = cfs_time_current_sec(); spin_lock_init(&export->exp_lock); INIT_HLIST_NODE(&export->exp_uuid_hash); INIT_HLIST_NODE(&export->exp_nid_hash); @@ -791,6 +791,24 @@ void class_import_destroy(struct obd_import *import) } EXPORT_SYMBOL(class_import_put); +static void init_imp_at(struct imp_at *at) { + int i; + /* We need enough time to get an early response on a slow network. + Since we can't say for sure how slow a network might be, we use + a user-defined max expected network latency. We will adapt to slow + increases, but a sudden jump can still kill us. */ + at_init(&at->iat_net_latency, adaptive_timeout_min, AT_TIMEBASE_DEFAULT, + AT_FLG_MIN); + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + /* max service estimates are tracked on the server side, so + don't use the AT history here, just use the last reported + val. (But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_TIMEBASE_DEFAULT, AT_FLG_NOHIST); + } + at->iat_drain = 0; +} + struct obd_import *class_new_import(struct obd_device *obd) { struct obd_import *imp; @@ -816,6 +834,7 @@ struct obd_import *class_new_import(struct obd_device *obd) CFS_INIT_LIST_HEAD(&imp->imp_conn_list); CFS_INIT_LIST_HEAD(&imp->imp_handle.h_link); class_handle_hash(&imp->imp_handle, import_handle_addref); + init_imp_at(&imp->imp_at); /* the default magic is V1, will be used in connect RPC, and * then adjusted according to the flags in request/reply. */ @@ -946,9 +965,9 @@ static void class_disconnect_export_list(struct list_head *list, int flags) spin_unlock(&fake_exp->exp_lock); rc = obd_disconnect(fake_exp); + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); class_export_put(exp); - CDEBUG(D_HA, "disconnecting export %s (%p): rc %d\n", - exp->exp_client_uuid.uuid, exp, rc); } EXIT; } diff --git a/lustre/obdclass/linux/linux-module.c b/lustre/obdclass/linux/linux-module.c index 9d57d4f17c..c65126110a 100644 --- a/lustre/obdclass/linux/linux-module.c +++ b/lustre/obdclass/linux/linux-module.c @@ -286,27 +286,6 @@ static int obd_proc_read_health(char *page, char **start, off_t off, return rc; } -static int obd_proc_rd_health_timeout(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - *eof = 1; - return snprintf(page, count, "%d\n", obd_health_check_timeout); -} - -static int obd_proc_wr_health_timeout(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - int val, rc; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - obd_health_check_timeout = val; - - return count; -} - /* Root for /proc/fs/lustre */ struct proc_dir_entry *proc_lustre_root = NULL; @@ -314,8 +293,6 @@ struct lprocfs_vars lprocfs_base[] = { { "version", obd_proc_read_version, NULL, NULL }, { "pinger", obd_proc_read_pinger, NULL, NULL }, { "health_check", obd_proc_read_health, NULL, NULL }, - { "health_check_timeout", obd_proc_rd_health_timeout, - obd_proc_wr_health_timeout, NULL }, { 0 } }; #else diff --git a/lustre/obdclass/linux/linux-sysctl.c b/lustre/obdclass/linux/linux-sysctl.c index 7d919b8a96..89c0cb9b64 100644 --- a/lustre/obdclass/linux/linux-sysctl.c +++ b/lustre/obdclass/linux/linux-sysctl.c @@ -62,6 +62,8 @@ enum { OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */ OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */ OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */ + ADAPTIVE_MIN, /* Adaptive timeout lower limit */ + ADAPTIVE_MAX, /* Adaptive timeout upper limit */ }; int LL_PROC_PROTO(proc_fail_loc) @@ -177,7 +179,7 @@ static cfs_sysctl_table_t obd_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, - { + { .ctl_name = OBD_LDLM_TIMEOUT, .procname = "ldlm_timeout", .data = &ldlm_timeout, @@ -195,6 +197,22 @@ static cfs_sysctl_table_t obd_table[] = { .proc_handler = &proc_alloc_fail_rate }, #endif + { + .ctl_name = ADAPTIVE_MIN, + .procname = "adaptive_min", + .data = &adaptive_timeout_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = ADAPTIVE_MAX, + .procname = "adaptive_max", + .data = &adaptive_timeout_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { 0 } }; diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 1b2ba7ac9a..84a825155f 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -490,6 +490,111 @@ int lprocfs_rd_conn_uuid(char *page, char **start, off_t off, int count, return rc; } +int lprocfs_at_hist_helper(char *page, int count, int rc, + struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + rc += snprintf(page + rc, count - rc, "%3u ", at->at_hist[i]); + rc += snprintf(page + rc, count - rc, "\n"); + return rc; +} + +/* See also ptlrpc_lprocfs_rd_timeouts */ +int lprocfs_rd_timeouts(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + unsigned int cur, worst; + time_t now, worstt; + struct dhms ts; + int i, rc = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + *eof = 1; + + if (AT_OFF) + rc += snprintf(page + rc, count - rc, + "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + + rc += snprintf(page + rc, count - rc, + "%-10s : %ld sec\n", "timebase", + imp->imp_at.iat_net_latency.at_binlimit * AT_BINS); + + now = cfs_time_current_sec(); + + /* Some network health info for kicks */ + s2dhms(&ts, now - imp->imp_last_reply_time); + rc += snprintf(page + rc, count - rc, + "%-10s : %ld, "DHMS_FMT" ago\n", + "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts)); + + + cur = at_get(&imp->imp_at.iat_net_latency); + worst = imp->imp_at.iat_net_latency.at_worst_ever; + worstt = imp->imp_at.iat_net_latency.at_worst_time; + s2dhms(&ts, now - worstt); + rc += snprintf(page + rc, count - rc, + "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", + "network", cur, worst, worstt, DHMS_VARS(&ts)); + rc = lprocfs_at_hist_helper(page, count, rc, + &imp->imp_at.iat_net_latency); + + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (imp->imp_at.iat_portal[i] == 0) + break; + cur = at_get(&imp->imp_at.iat_service_estimate[i]); + worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; + worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; + s2dhms(&ts, now - worstt); + rc += snprintf(page + rc, count - rc, + "portal %-2d : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", imp->imp_at.iat_portal[i], + cur, worst, worstt, DHMS_VARS(&ts)); + rc = lprocfs_at_hist_helper(page, count, rc, + &imp->imp_at.iat_service_estimate[i]); + } + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} + +int lprocfs_wr_timeouts(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + time_t bval; + int val, i, rc; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + if (val <= 0) + return -ERANGE; + + bval = max(1, val / AT_BINS); + spin_lock(&imp->imp_at.iat_net_latency.at_lock); + imp->imp_at.iat_net_latency.at_binlimit = bval; + spin_unlock(&imp->imp_at.iat_net_latency.at_lock); + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + spin_lock(&imp->imp_at.iat_service_estimate[i].at_lock); + imp->imp_at.iat_service_estimate[i].at_binlimit = bval; + spin_unlock(&imp->imp_at.iat_service_estimate[i].at_lock); + } + + LPROCFS_CLIMP_EXIT(obd); + return count; +} + static const char *obd_connect_names[] = { "read_only", "lov_index", @@ -694,7 +799,7 @@ static int lprocfs_stats_seq_show(struct seq_file *p, void *v) struct lprocfs_stats *stats = p->private; struct lprocfs_counter *cntr = v; struct lprocfs_counter t, ret = { .lc_min = ~(__u64)0 }; - int i, idx, rc; + int i, idx, rc = 0; if (cntr == &(stats->ls_percpu[0])->lp_cntr[0]) { struct timeval now; @@ -729,6 +834,9 @@ static int lprocfs_stats_seq_show(struct seq_file *p, void *v) ret.lc_sumsquare += t.lc_sumsquare; } + if (ret.lc_count == 0) + goto out; + rc = seq_printf(p, "%-25s "LPU64" samples [%s]", cntr->lc_name, ret.lc_count, cntr->lc_units); if (rc < 0) @@ -1395,7 +1503,9 @@ EXPORT_SYMBOL(lprocfs_rd_server_uuid); EXPORT_SYMBOL(lprocfs_rd_conn_uuid); EXPORT_SYMBOL(lprocfs_rd_num_exports); EXPORT_SYMBOL(lprocfs_rd_numrefs); - +EXPORT_SYMBOL(lprocfs_at_hist_helper); +EXPORT_SYMBOL(lprocfs_rd_timeouts); +EXPORT_SYMBOL(lprocfs_wr_timeouts); EXPORT_SYMBOL(lprocfs_rd_blksize); EXPORT_SYMBOL(lprocfs_rd_kbytestotal); EXPORT_SYMBOL(lprocfs_rd_kbytesfree); diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c index 4c01e23432..a1d9fd739a 100644 --- a/lustre/obdclass/obd_config.c +++ b/lustre/obdclass/obd_config.c @@ -760,7 +760,6 @@ int class_process_config(struct lustre_cfg *lcfg) CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", obd_timeout, lcfg->lcfg_num); obd_timeout = max(lcfg->lcfg_num, 1U); - obd_health_check_timeout = HEALTH_CHECK_TIMEOUT; GOTO(out, err = 0); } case LCFG_SET_UPCALL: { diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 2843b54393..b0bc3faeb6 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -188,8 +188,10 @@ int server_put_mount(char *name, struct vfsmount *mnt) CERROR("Can't find mount for %s\n", name); RETURN(-ENOENT); } - lsi = s2lsi(lmi->lmi_sb); LASSERT(lmi->lmi_mnt == mnt); + /* Note if lsi is poisoned, it's probably because server_wait_finished + didn't wait long enough and the sb was freed at the umount. */ + lsi = s2lsi(lmi->lmi_sb); CDEBUG(D_MOUNT, "put_mnt %p from %s, refs=%d, vfscount=%d\n", lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts), count); @@ -518,7 +520,7 @@ DECLARE_MUTEX(mgc_start_lock); static int lustre_start_mgc(struct super_block *sb) { struct lustre_handle mgc_conn = {0, }; - struct obd_connect_data ocd = { 0 }; + struct obd_connect_data *data = NULL; struct lustre_sb_info *lsi = s2lsi(sb); struct obd_device *obd; struct obd_export *exp; @@ -698,8 +700,14 @@ static int lustre_start_mgc(struct super_block *sb) /* nonfatal */ CERROR("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc); + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out, rc = -ENOMEM); + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT; + data->ocd_version = LUSTRE_VERSION_CODE; /* We connect to the MGS at setup, and don't disconnect until cleanup */ - rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), &ocd); + rc = obd_connect(&mgc_conn, obd, &(obd->obd_uuid), data); + OBD_FREE_PTR(data); if (rc) { CERROR("connect failed %d\n", rc); GOTO(out, rc); @@ -1296,7 +1304,7 @@ out_free: RETURN(ERR_PTR(rc)); } -/* We have to wait for everything to finish, including lnet lnd expires, +/* We have to wait for everything to finish, including lnet lnd expires, before it is safe to free the sb */ static void server_wait_finished(struct vfsmount *mnt) { @@ -1307,9 +1315,9 @@ static void server_wait_finished(struct vfsmount *mnt) init_waitqueue_head(&waitq); while (atomic_read(&mnt->mnt_count) > 1) { - if (waited && (waited % 30 == 0)) + if (waited && (waited % 30 == 0)) LCONSOLE_WARN("Mount still busy with %d refs after " - "%d secs\n", atomic_read(&mnt->mnt_count), + "%d secs\n", atomic_read(&mnt->mnt_count), waited); /* Wait for a bit */ waited += 3; diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index f6251c30b5..423e3d2d3f 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -548,7 +548,7 @@ static int filter_init_export(struct obd_export *exp) { spin_lock_init(&exp->exp_filter_data.fed_lock); INIT_LIST_HEAD(&exp->exp_filter_data.fed_mod_list); - + spin_lock(&exp->exp_lock); exp->exp_connecting = 1; spin_unlock(&exp->exp_lock); @@ -805,9 +805,9 @@ static int filter_init_server_data(struct obd_device *obd, struct file * filp) obd->obd_next_recovery_transno = obd->obd_last_committed + 1; obd->obd_recovering = 1; obd->obd_recovery_start = CURRENT_SECONDS; - /* Only used for lprocfs_status */ + obd->obd_recovery_timeout = OBD_RECOVERY_TIMEOUT; obd->obd_recovery_end = obd->obd_recovery_start + - OBD_RECOVERY_TIMEOUT; + obd->obd_recovery_timeout; } out: @@ -1156,7 +1156,7 @@ struct dentry *filter_parent_lock(struct obd_device *obd, obd_gr group, return dparent; rc = filter_lock_dentry(obd, dparent); - fsfilt_check_slow(obd, now, obd_timeout, "parent lock"); + fsfilt_check_slow(obd, now, "parent lock"); return rc ? ERR_PTR(rc) : dparent; } @@ -1742,8 +1742,8 @@ int filter_common_setup(struct obd_device *obd, obd_count len, void *buf, obd->obd_recoverable_clients, (obd->obd_recoverable_clients == 1) ? "client" : "clients", - (int)(OBD_RECOVERY_TIMEOUT) / 60, - (int)(OBD_RECOVERY_TIMEOUT) % 60, + obd->obd_recovery_timeout / 60, + obd->obd_recovery_timeout % 60, obd->obd_name); } else { LCONSOLE_INFO("OST %s now serving %s (%s%s%s) with recovery " @@ -2821,7 +2821,7 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, struct dentry *dchild = NULL, *dparent = NULL; struct filter_obd *filter; int err = 0, rc = 0, recreate_obj = 0, i; - unsigned long enough_time = jiffies + min(obd_timeout * HZ / 4, 10U*HZ); + cfs_time_t enough_time = cfs_time_shift(DISK_TIMEOUT/2); obd_id next_id; void *handle = NULL; ENTRY; @@ -2952,7 +2952,7 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, if (rc) break; - if (time_after(jiffies, enough_time)) { + if (cfs_time_after(cfs_time_current(), enough_time)) { CDEBUG(D_RPCTRACE, "%s: precreate slow - want %d got %d \n", obd->obd_name, *num, i); @@ -3441,8 +3441,8 @@ int filter_iocontrol(unsigned int cmd, struct obd_export *exp, struct super_block *sb = obd->u.obt.obt_sb; struct inode *inode = sb->s_root->d_inode; BDEVNAME_DECLARE_STORAGE(tmp); - CERROR("*** setting device %s read-only ***\n", - ll_bdevname(sb, tmp)); + LCONSOLE_WARN("*** setting obd %s device '%s' read-only ***\n", + obd->obd_name, ll_bdevname(sb, tmp)); handle = fsfilt_start(obd, inode, FSFILT_OP_MKNOD, NULL); if (!IS_ERR(handle)) diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h index 65e54431eb..03eabfe5e9 100644 --- a/lustre/obdfilter/filter_internal.h +++ b/lustre/obdfilter/filter_internal.h @@ -29,8 +29,6 @@ #define FILTER_GRANT_CHUNK (2ULL * PTLRPC_MAX_BRW_SIZE) #define GRANT_FOR_LLOG(obd) 16 -#define FILTER_RECOVERY_TIMEOUT (obd_timeout * 5 * HZ / 2) /* *waves hands* */ - extern struct file_operations filter_per_export_stats_fops; /* Data stored per client in the last_rcvd file. In le32 order. */ @@ -67,6 +65,7 @@ struct filter_mod_data { #else #define FILTER_FMD_MAX_NUM_DEFAULT 32 #endif +/* Client cache seconds */ #define FILTER_FMD_MAX_AGE_DEFAULT ((obd_timeout + 10) * HZ) struct filter_mod_data *filter_fmd_find(struct obd_export *exp, diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index 72e3344868..c0f865177e 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -304,7 +304,7 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, inode = dentry->d_inode; obdo_to_inode(inode, oa, OBD_MD_FLATIME); - fsfilt_check_slow(obd, now, obd_timeout, "preprw_read setup"); + fsfilt_check_slow(obd, now, "preprw_read setup"); for (i = 0, lnb = res, rnb = nb; i < obj->ioo_bufcnt; i++, rnb++, lnb++) { @@ -337,7 +337,7 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, filter_iobuf_add_page(obd, iobuf, inode, lnb->page); } - fsfilt_check_slow(obd, now, obd_timeout, "start_page_read"); + fsfilt_check_slow(obd, now, "start_page_read"); rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp, NULL, NULL, NULL); @@ -558,7 +558,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, fso.fso_dentry = dentry; fso.fso_bufcnt = obj->ioo_bufcnt; - fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "preprw_write setup"); + fsfilt_check_slow(exp->exp_obd, now, "preprw_write setup"); /* Don't update inode timestamps if this write is older than a * setattr which modifies the timestamps. b=10150 */ @@ -661,7 +661,7 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, rc = filter_direct_io(OBD_BRW_READ, dentry, iobuf, exp, NULL, NULL, NULL); - fsfilt_check_slow(exp->exp_obd, now, obd_timeout, "start_page_write"); + fsfilt_check_slow(exp->exp_obd, now, "start_page_write"); lprocfs_counter_add(exp->exp_obd->obd_stats, LPROC_FILTER_WRITE_BYTES, tot_bytes); diff --git a/lustre/obdfilter/filter_io_24.c b/lustre/obdfilter/filter_io_24.c index 0ff1a0f201..fe5e6c6658 100644 --- a/lustre/obdfilter/filter_io_24.c +++ b/lustre/obdfilter/filter_io_24.c @@ -474,7 +474,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, GOTO(cleanup, rc); } - fsfilt_check_slow(obd, now, obd_timeout, "brw_start"); + fsfilt_check_slow(obd, now, "brw_start"); i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME; @@ -510,7 +510,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, if (rc == 0) obdo_from_inode(oa, inode, FILTER_VALID_FLAGS); - fsfilt_check_slow(obd, now, obd_timeout, "direct_io"); + fsfilt_check_slow(obd, now, "direct_io"); err = fsfilt_commit_wait(obd, inode, wait_handle); if (err) { @@ -521,7 +521,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int objcount, LASSERTF(oti->oti_transno <= obd->obd_last_committed, "oti_transno "LPU64" last_committed "LPU64"\n", oti->oti_transno, obd->obd_last_committed); - fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit"); + fsfilt_check_slow(obd, now, "commitrw commit"); cleanup: filter_grant_commit(exp, niocount, res); diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 652e6c1f36..20550d5ca9 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -694,7 +694,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, DQUOT_INIT(inode); LOCK_INODE_MUTEX(inode); - fsfilt_check_slow(obd, now, obd_timeout, "i_mutex"); + fsfilt_check_slow(obd, now, "i_mutex"); oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res, oti); if (IS_ERR(oti->oti_handle)) { @@ -707,7 +707,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, } /* have to call fsfilt_commit() from this point on */ - fsfilt_check_slow(obd, now, obd_timeout, "brw_start"); + fsfilt_check_slow(obd, now, "brw_start"); i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME; @@ -759,7 +759,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, lquota_getflag(filter_quota_interface_ref, obd, oa); - fsfilt_check_slow(obd, now, obd_timeout, "direct_io"); + fsfilt_check_slow(obd, now, "direct_io"); err = fsfilt_commit_wait(obd, inode, wait_handle); if (err) { @@ -772,7 +772,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, "oti_transno "LPU64" last_committed "LPU64"\n", oti->oti_transno, obd->obd_last_committed); - fsfilt_check_slow(obd, now, obd_timeout, "commitrw commit"); + fsfilt_check_slow(obd, now, "commitrw commit"); cleanup: filter_grant_commit(exp, niocount, res); diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 4a0f183cd9..7a50e66a11 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -329,6 +329,7 @@ static struct lprocfs_vars lprocfs_obd_vars[] = { { "prealloc_next_id", osc_rd_prealloc_next_id, 0, 0 }, { "prealloc_last_id", osc_rd_prealloc_last_id, 0, 0 }, { "checksums", osc_rd_checksum, osc_wr_checksum, 0 }, + { "timeouts", lprocfs_rd_timeouts, lprocfs_wr_timeouts, 0 }, { 0 } }; diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index 4221588ee7..671089dde9 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -362,8 +362,8 @@ int osc_create(struct obd_export *exp, struct obdo *oa, CDEBUG(D_HA,"%s: oscc recovery in progress, waiting\n", oscc->oscc_obd->obd_name); - lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds(obd_timeout/4)), - NULL, NULL); + lwi = LWI_TIMEOUT(cfs_timeout_cap(cfs_time_seconds( + obd_timeout / 4)), NULL, NULL); rc = l_wait_event(oscc->oscc_waitq, !oscc_recovering(oscc), &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 803bddcd6b..52262eca2b 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -728,13 +728,14 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) * If getting the lock took more time than * client was willing to wait, drop it. b=11330 */ - if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout || + if (cfs_time_current_sec() > req->rq_deadline || OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { no_reply = 1; CERROR("Dropping timed-out read from %s because locking" - "object "LPX64" took %ld seconds.\n", + "object "LPX64" took %ld seconds (limit was %ld).\n", libcfs_id2str(req->rq_peer), ioo->ioo_id, - cfs_time_current_sec() - req->rq_arrival_time.tv_sec); + cfs_time_current_sec() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); goto out_lock; } @@ -789,12 +790,24 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) else rc = ptlrpc_start_bulk_transfer(desc); if (rc == 0) { - lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 4, HZ, - ost_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, - !ptlrpc_bulk_active(desc) || - desc->bd_export->exp_failed, &lwi); - LASSERT(rc == 0 || rc == -ETIMEDOUT); + do { + long timeoutl = req->rq_deadline - + cfs_time_current_sec(); + cfs_duration_t timeout = (timeoutl <= 0 || rc) ? + CFS_TICK : cfs_time_seconds(timeoutl); + lwi = LWI_TIMEOUT_INTERVAL(timeout, + cfs_time_seconds(1), + ost_bulk_timeout, + desc); + rc = l_wait_event(desc->bd_waitq, + !ptlrpc_bulk_active(desc) || + desc->bd_export->exp_failed, + &lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + /* Wait again if we changed deadline */ + } while ((rc == -ETIMEDOUT) && + (req->rq_deadline > cfs_time_current_sec())); + if (rc == -ETIMEDOUT) { DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT"); ptlrpc_abort_bulk(desc); @@ -848,11 +861,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) req->rq_status = rc; ptlrpc_error(req); } else { - if (req->rq_reply_state != NULL) { - /* reply out callback would free */ - ptlrpc_rs_decref(req->rq_reply_state); - req->rq_reply_state = NULL; - } + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - " "client will retry\n", req->rq_export->exp_obd->obd_name, @@ -977,13 +987,14 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) * If getting the lock took more time than * client was willing to wait, drop it. b=11330 */ - if (cfs_time_current_sec() > req->rq_arrival_time.tv_sec + obd_timeout || + if (cfs_time_current_sec() > req->rq_deadline || OBD_FAIL_CHECK(OBD_FAIL_OST_DROP_REQ)) { no_reply = 1; - CERROR("Dropping timed-out write from %s because locking" - "object "LPX64" took %ld seconds.\n", + CERROR("Dropping timed-out write from %s because locking " + "object "LPX64" took %ld seconds (limit was %ld).\n", libcfs_id2str(req->rq_peer), ioo->ioo_id, - cfs_time_current_sec() - req->rq_arrival_time.tv_sec); + cfs_time_current_sec() - req->rq_arrival_time.tv_sec, + req->rq_deadline - req->rq_arrival_time.tv_sec); goto out_lock; } @@ -1019,11 +1030,21 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) else rc = ptlrpc_start_bulk_transfer (desc); if (rc == 0) { - lwi = LWI_TIMEOUT_INTERVAL(obd_timeout * HZ / 2, HZ, - ost_bulk_timeout, desc); - rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc) || - desc->bd_export->exp_failed, &lwi); - LASSERT(rc == 0 || rc == -ETIMEDOUT); + do { + long timeoutl = req->rq_deadline - + cfs_time_current_sec(); + cfs_duration_t timeout = (timeoutl <= 0 || rc) ? + CFS_TICK : cfs_time_seconds(timeoutl); + lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1), + ost_bulk_timeout, desc); + rc = l_wait_event(desc->bd_waitq, + !ptlrpc_bulk_active(desc) || + desc->bd_export->exp_failed, &lwi); + LASSERT(rc == 0 || rc == -ETIMEDOUT); + /* Wait again if we changed deadline */ + } while ((rc == -ETIMEDOUT) && + (req->rq_deadline > cfs_time_current_sec())); + if (rc == -ETIMEDOUT) { DEBUG_REQ(D_ERROR, req, "timeout on bulk GET"); ptlrpc_abort_bulk(desc); @@ -1149,11 +1170,8 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) req->rq_status = rc; ptlrpc_error(req); } else { - if (req->rq_reply_state != NULL) { - /* reply out callback would free */ - ptlrpc_rs_decref(req->rq_reply_state); - req->rq_reply_state = NULL; - } + /* reply out callback would free */ + ptlrpc_req_drop_rs(req); CWARN("%s: ignoring bulk IO comm error with %s@%s id %s - " "client will retry\n", req->rq_export->exp_obd->obd_name, @@ -1427,6 +1445,7 @@ static int ost_handle(struct ptlrpc_request *req) case OST_CREATE: CDEBUG(D_INODE, "create\n"); OBD_FAIL_RETURN(OBD_FAIL_OST_CREATE_NET, 0); + OBD_FAIL_TIMEOUT_MS(OBD_FAIL_OST_PAUSE_CREATE, obd_fail_val); if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_ENOSPC)) GOTO(out, rc = -ENOSPC); if (OBD_FAIL_CHECK_ONCE(OBD_FAIL_OST_EROFS)) @@ -1702,8 +1721,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) ost->ost_service = ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_MAXREPSIZE, OST_REQUEST_PORTAL, - OSC_REPLY_PORTAL, - obd_timeout * 1000, ost_handle, LUSTRE_OSS_NAME, + OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, + ost_handle, LUSTRE_OSS_NAME, obd->obd_proc_entry, ost_print_req, oss_min_threads, oss_max_threads, "ll_ost"); if (ost->ost_service == NULL) { @@ -1730,8 +1749,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) ost->ost_create_service = ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_MAXREPSIZE, OST_CREATE_PORTAL, - OSC_REPLY_PORTAL, - obd_timeout * 1000, ost_handle, "ost_create", + OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, + ost_handle, "ost_create", obd->obd_proc_entry, ost_print_req, oss_min_create_threads, oss_max_create_threads, @@ -1748,8 +1767,8 @@ static int ost_setup(struct obd_device *obd, obd_count len, void *buf) ost->ost_io_service = ptlrpc_init_svc(OST_NBUFS, OST_BUFSIZE, OST_MAXREQSIZE, OST_MAXREPSIZE, OST_IO_PORTAL, - OSC_REPLY_PORTAL, - obd_timeout * 1000, ost_handle, "ost_io", + OSC_REPLY_PORTAL, OSS_SERVICE_WATCHDOG_FACTOR, + ost_handle, "ost_io", obd->obd_proc_entry, ost_print_req, oss_min_threads, oss_max_threads, "ll_ost_io"); if (ost->ost_io_service == NULL) { diff --git a/lustre/ost/ost_internal.h b/lustre/ost/ost_internal.h index 82a40b90b4..e572eadb1f 100644 --- a/lustre/ost/ost_internal.h +++ b/lustre/ost/ost_internal.h @@ -11,6 +11,8 @@ extern void ost_print_req(void *seq_file, struct ptlrpc_request *req); # define ost_print_req NULL #endif +#define OSS_SERVICE_WATCHDOG_FACTOR 2000 + /* * tunables for per-thread page pool (bug 5137) */ @@ -37,7 +39,7 @@ struct ost_thread_local_cache { struct ost_thread_local_cache *ost_tls(struct ptlrpc_request *r); -#define OSS_DEF_CREATE_THREADS 1UL +#define OSS_DEF_CREATE_THREADS 2UL #define OSS_MAX_CREATE_THREADS 16UL /* Quota stuff */ diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index c3f3f20803..75b62b92fc 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -189,6 +189,190 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) EXIT; } +/* Set server timelimit for this req */ +static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) { + __u32 serv_est; + int idx; + struct imp_at *at; + LASSERT(req->rq_import); + + if (AT_OFF || + ((idx = import_at_get_index(req->rq_import, + req->rq_request_portal)) < 0)) { + /* non-AT settings */ + req->rq_timeout = req->rq_import->imp_server_timeout ? + obd_timeout / 2 : obd_timeout; + goto out; + } + + at = &req->rq_import->imp_at; + serv_est = at_get(&at->iat_service_estimate[idx]); + /* add an arbitrary minimum: 125% +5 sec */ + req->rq_timeout = serv_est + (serv_est >> 2) + 5; + /* We could get even fancier here, using history to predict increased + loading... */ + + if (at->iat_drain > req->rq_timeout) { + /* If we're trying to drain the network queues, give this + req a long timeout */ + req->rq_timeout = at->iat_drain; + CDEBUG(D_ADAPTTO, "waiting %ds to let queues drain\n", + req->rq_timeout); + } + +out: + /* Let the server know what this RPC timeout is by putting it in the + reqmsg*/ + lustre_msg_set_timeout(req->rq_reqmsg, req->rq_timeout); +} + +/* Adjust max service estimate based on server value */ +static void ptlrpc_at_adj_service(struct ptlrpc_request *req) { + int idx; + unsigned int serv_est, oldse; + struct imp_at *at = &req->rq_import->imp_at; + LASSERT(req->rq_import); + + /* service estimate is returned in the repmsg timeout field, + may be 0 on err */ + serv_est = lustre_msg_get_timeout(req->rq_repmsg); + + idx = import_at_get_index(req->rq_import, req->rq_request_portal); + if (idx < 0) + return; + + oldse = at_get(&at->iat_service_estimate[idx]); + /* max service estimates are tracked on the server side, + so just keep minimal history here */ + at_add(&at->iat_service_estimate[idx], serv_est); + + if (at_get(&at->iat_service_estimate[idx]) != oldse) + CDEBUG(D_ADAPTTO, "The RPC service estimate for %s ptl %d " + "has changed %ds to %u\n", + req->rq_import->imp_obd->obd_name, req->rq_request_portal, + at_get(&at->iat_service_estimate[idx]) - oldse, serv_est); +} + +/* Expected network latency per remote node (secs) */ +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req) { + return AT_OFF ? 0 : at_get(&req->rq_import->imp_at.iat_net_latency); +} + +/* Adjust expected network latency */ +static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req) { + unsigned int st, nl, oldnl; + struct imp_at *at = &req->rq_import->imp_at; + time_t now = cfs_time_current_sec(); + LASSERT(req->rq_import); + + st = lustre_msg_get_service_time(req->rq_repmsg); + + /* Network latency is total time less server processing time */ + nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/; + if (st > now - req->rq_sent + 1 /* rounding */) + CERROR("Reported service time %u > total measured time %ld\n", + st, now - req->rq_sent); + + oldnl = at_get(&at->iat_net_latency); + at_add(&at->iat_net_latency, nl); + + if (at_get(&at->iat_net_latency) != oldnl) + CDEBUG(D_ADAPTTO, "The network latency for %s (nid %s)" + " has changed %ds to %u\n", + req->rq_import->imp_obd->obd_name, + obd_uuid2str( + &req->rq_import->imp_connection->c_remote_uuid), + at_get(&at->iat_net_latency) - oldnl, nl); +} + +static int unpack_reply(struct ptlrpc_request *req) { + int rc; + + /* Clear reply swab mask; we may have already swabbed an early reply */ + req->rq_rep_swab_mask = 0; + + rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); + return(-EPROTO); + } + + rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); + return(-EPROTO); + } + return 0; +} + +/* Handle an early reply message. + We can't risk the real reply coming in and changing rq_repmsg, + so this fn must be called under the rq_lock */ +static int ptlrpc_at_early_reply(struct ptlrpc_request *req) { + struct lustre_msg *oldmsg, *msgcpy; + time_t olddl; + int oldlen, rc; + ENTRY; + + req->rq_early = 0; + + rc = unpack_reply(req); + if (rc) + /* Let's just ignore it - same as if it never got here */ + RETURN(rc); + + /* We've got to make sure another early reply doesn't land on + top of our current repbuf. Make a copy and verify checksum. */ + oldlen = req->rq_replen; + spin_unlock(&req->rq_lock); + OBD_ALLOC(msgcpy, oldlen); + if (!msgcpy) { + spin_lock(&req->rq_lock); + RETURN(-ENOMEM); + } + spin_lock(&req->rq_lock); + /* Another reply might have changed the repmsg and replen while + we dropped the lock; doesn't really matter, just use the latest. + If it doesn't fit in oldlen, checksum will be wrong. */ + oldmsg = req->rq_repmsg; + memcpy(msgcpy, oldmsg, oldlen); + if (lustre_msg_get_cksum(msgcpy) != + lustre_msg_calc_cksum(msgcpy)) { + CDEBUG(D_ADAPTTO, "Early reply checksum mismatch, " + "discarding %x != %x\n", lustre_msg_get_cksum(msgcpy), + lustre_msg_calc_cksum(msgcpy)); + GOTO(out, rc = -EINVAL); + } + + /* Our copied msg is valid, now we can adjust the timeouts without + worrying that a new reply will land on the copy. */ + req->rq_repmsg = msgcpy; + + /* Expecting to increase the service time estimate here */ + ptlrpc_at_adj_service(req); + ptlrpc_at_adj_net_latency(req); + + /* Adjust the local timeout for this req */ + ptlrpc_at_set_req_timeout(req); + + olddl = req->rq_deadline; + /* server assumes it now has rq_timeout from when it sent the + early reply, so client should give it at least that long. */ + req->rq_deadline = cfs_time_current_sec() + req->rq_timeout + + ptlrpc_at_get_net_latency(req); + + DEBUG_REQ(D_ADAPTTO, req, + "Early reply #%d, new deadline in %lds (%+lds)", + req->rq_early_count, req->rq_deadline - + cfs_time_current_sec(), req->rq_deadline - olddl); + + req->rq_repmsg = oldmsg; + +out: + OBD_FREE(msgcpy, oldlen); + RETURN(rc); +} + void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) { struct list_head *l, *tmp; @@ -342,11 +526,6 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode, } lustre_msg_add_version(request->rq_reqmsg, version); - - if (imp->imp_server_timeout) - request->rq_timeout = obd_timeout / 2; - else - request->rq_timeout = obd_timeout; request->rq_send_state = LUSTRE_IMP_FULL; request->rq_type = PTL_RPC_MSG_REQUEST; request->rq_import = class_import_get(imp); @@ -363,6 +542,8 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode, /* XXX FIXME bug 249 */ request->rq_request_portal = imp->imp_client->cli_request_portal; request->rq_reply_portal = imp->imp_client->cli_reply_portal; + + ptlrpc_at_set_req_timeout(request); spin_lock_init(&request->rq_lock); CFS_INIT_LIST_HEAD(&request->rq_list); @@ -373,7 +554,7 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode, atomic_set(&request->rq_refcount, 1); lustre_msg_set_opc(request->rq_reqmsg, opcode); - lustre_msg_set_flags(request->rq_reqmsg, 0); + lustre_msg_set_flags(request->rq_reqmsg, imp->imp_msg_flags); RETURN(request); } @@ -558,6 +739,12 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) if (req->rq_restart) GOTO(out, rc = 1); + + if (req->rq_early) { + ptlrpc_at_early_reply(req); + GOTO(out, rc = 0); /* keep waiting */ + } + EXIT; out: spin_unlock(&req->rq_lock); @@ -603,22 +790,14 @@ static int after_reply(struct ptlrpc_request *req) /* NB Until this point, the whole of the incoming message, * including buflens, status etc is in the sender's byte order. */ -#if SWAB_PARANOIA - /* Clear reply swab mask; this is a new reply in sender's byte order */ - req->rq_rep_swab_mask = 0; -#endif LASSERT (req->rq_nob_received <= req->rq_replen); - rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received); - if (rc) { - DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d", rc); - RETURN(-EPROTO); - } + rc = unpack_reply(req); + if (rc) + RETURN(rc); - rc = lustre_unpack_rep_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); - if (rc) { - DEBUG_REQ(D_ERROR, req, "unpack ptlrpc body failed: %d", rc); - RETURN(-EPROTO); - } + OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val); + ptlrpc_at_adj_service(req); + ptlrpc_at_adj_net_latency(req); if (lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_REPLY && lustre_msg_get_type(req->rq_repmsg) != PTL_RPC_MSG_ERR) { @@ -865,6 +1044,14 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) /* Still waiting for a reply? */ if (ptlrpc_client_receiving_reply(req)) continue; + + spin_lock(&req->rq_lock); + if (req->rq_early) { + ptlrpc_at_early_reply(req); + spin_unlock(&req->rq_lock); + continue; + } + spin_unlock(&req->rq_lock); /* Did we actually receive a reply? */ if (!ptlrpc_client_replied(req)) @@ -950,15 +1137,25 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) RETURN(set->set_remaining == 0 || force_timer_recalc); } +/* Return 1 if we should give up, else 0 */ int ptlrpc_expire_one_request(struct ptlrpc_request *req) { struct obd_import *imp = req->rq_import; int rc = 0; ENTRY; - DEBUG_REQ(D_ERROR|D_NETERROR, req, "%s (sent at %lu, %lus ago)", + DEBUG_REQ(D_NETERROR, req, "%s (sent at %lu, %lus ago)", req->rq_net_err ? "network error" : "timeout", - (long)req->rq_sent, CURRENT_SECONDS - req->rq_sent); + (long)req->rq_sent, cfs_time_current_sec() - req->rq_sent); + + if (imp) { + LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago" + " has timed out (limit %lus).\n", req->rq_xid, + req->rq_import->imp_obd->obd_name, + libcfs_nid2str(imp->imp_connection->c_peer.nid), + cfs_time_current_sec() - req->rq_sent, + req->rq_deadline - req->rq_sent); + } if (imp != NULL && obd_debug_peer_on_timeout) LNetCtl(IOC_LIBCFS_DEBUG_PEER, &imp->imp_connection->c_peer); @@ -988,6 +1185,9 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req) * then error it out here. */ if (req->rq_send_state != LUSTRE_IMP_FULL || imp->imp_obd->obd_no_recov) { + DEBUG_REQ(D_RPCTRACE, req, "err -110, sent_state=%s (now=%s)", + ptlrpc_import_state_name(req->rq_send_state), + ptlrpc_import_state_name(imp->imp_state)); spin_lock(&req->rq_lock); req->rq_status = -ETIMEDOUT; req->rq_err = 1; @@ -995,7 +1195,8 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req) RETURN(1); } - /* if request can't be resend we can't wait answer after timeout */ + /* if a request can't be resent we can't wait for an answer after + the timeout */ if (req->rq_no_resend) { DEBUG_REQ(D_RPCTRACE, req, "TIMEOUT-NORESEND:"); rc = 1; @@ -1010,7 +1211,7 @@ int ptlrpc_expired_set(void *data) { struct ptlrpc_request_set *set = data; struct list_head *tmp; - time_t now = CURRENT_SECONDS; + time_t now = cfs_time_current_sec(); ENTRY; LASSERT(set != NULL); @@ -1027,7 +1228,7 @@ int ptlrpc_expired_set(void *data) continue; if (req->rq_timedout || /* already dealt with */ - req->rq_sent + req->rq_timeout > now) /* not expired */ + req->rq_deadline > now) /* not expired */ continue; /* deal with this guy */ @@ -1067,11 +1268,11 @@ void ptlrpc_interrupted_set(void *data) } } +/* get the smallest timeout in the set; this does NOT set a timeout. */ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) { struct list_head *tmp; - time_t now = CURRENT_SECONDS; - time_t deadline; + time_t now = cfs_time_current_sec(); int timeout = 0; struct ptlrpc_request *req; ENTRY; @@ -1089,11 +1290,12 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) if (req->rq_timedout) /* already timed out */ continue; - deadline = req->rq_sent + req->rq_timeout; - if (deadline <= now) /* actually expired already */ + if (req->rq_deadline <= now) { /* actually expired already */ timeout = 1; /* ASAP */ - else if (timeout == 0 || timeout > deadline - now) - timeout = deadline - now; + break; + } else if (timeout == 0 || timeout > req->rq_deadline - now) { + timeout = req->rq_deadline - now; + } } RETURN(timeout); } @@ -1197,8 +1399,9 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) LBUG(); } - if (request->rq_repmsg != NULL) { - OBD_FREE(request->rq_repmsg, request->rq_replen); + if (request->rq_repbuf != NULL) { + OBD_FREE(request->rq_repbuf, request->rq_replen); + request->rq_repbuf = NULL; request->rq_repmsg = NULL; } if (request->rq_export != NULL) { @@ -1285,13 +1488,16 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request) LASSERT(!in_interrupt ()); /* might sleep */ - if (!ptlrpc_client_receiving_reply(request)) + if (!ptlrpc_client_receiving_reply(request) && + !ptlrpc_client_must_unlink(request)) + /* Nothing left to do */ return; LNetMDUnlink (request->rq_reply_md_h); /* We have to l_wait_event() whatever the result, to give liblustre - * a chance to run reply_in_callback() */ + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool */ if (request->rq_set != NULL) wq = &request->rq_set->set_waitq; @@ -1301,13 +1507,19 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request) for (;;) { /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL); - rc = l_wait_event (*wq, !ptlrpc_client_receiving_reply(request), &lwi); + lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL); + rc = l_wait_event (*wq, + !ptlrpc_client_receiving_reply(request) && + !ptlrpc_client_must_unlink(request), + &lwi); if (rc == 0) return; LASSERT (rc == -ETIMEDOUT); - DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout"); + DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout " + "rvcng=%d unlnk=%d", + ptlrpc_client_receiving_reply(request), + ptlrpc_client_must_unlink(request)); } EXIT; } @@ -1419,14 +1631,6 @@ void ptlrpc_restart_req(struct ptlrpc_request *req) spin_unlock(&req->rq_lock); } -static int expired_request(void *data) -{ - struct ptlrpc_request *req = data; - ENTRY; - - RETURN(ptlrpc_expire_one_request(req)); -} - static void interrupted_request(void *data) { struct ptlrpc_request *req = data; @@ -1495,7 +1699,8 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) int brc; struct l_wait_info lwi; struct obd_import *imp = req->rq_import; - cfs_duration_t timeout = 0; + cfs_duration_t timeout = CFS_TICK; + long timeoutl; ENTRY; LASSERT(req->rq_set == NULL); @@ -1543,6 +1748,7 @@ restart: list_del_init(&req->rq_list); if (req->rq_err) { + /* rq_status was set locally */ rc = -EIO; } else if (req->rq_intr) { @@ -1589,17 +1795,27 @@ restart: spin_unlock(&imp->imp_lock); rc = ptl_send_rpc(req, 0); - if (rc) { + if (rc) DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc); - timeout = CFS_TICK; - } else { - timeout = cfs_timeout_cap(cfs_time_seconds(req->rq_timeout)); + do { + timeoutl = req->rq_deadline - cfs_time_current_sec(); + timeout = (timeoutl <= 0 || rc) ? CFS_TICK : + cfs_time_seconds(timeoutl); DEBUG_REQ(D_NET, req, - "-- sleeping for "CFS_DURATION_T" jiffies", timeout); + "-- sleeping for "CFS_DURATION_T" ticks", timeout); + lwi = LWI_TIMEOUT_INTR(timeout, NULL, interrupted_request, req); + brc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), + &lwi); + /* Wait again if we changed deadline */ + } while ((brc == -ETIMEDOUT) && + (req->rq_deadline > cfs_time_current_sec())); + + if ((brc == -ETIMEDOUT) && !ptlrpc_expire_one_request(req)) { + /* Wait forever for reconnect / replay or failure */ + lwi = LWI_INTR(interrupted_request, req); + rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), + &lwi); } - lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request, - req); - l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi); CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:nid:opc " "%s:%s:%d:"LPU64":%s:%d\n", cfs_curproc_comm(), @@ -1607,7 +1823,6 @@ restart: lustre_msg_get_status(req->rq_reqmsg), req->rq_xid, libcfs_nid2str(imp->imp_connection->c_peer.nid), lustre_msg_get_opc(req->rq_reqmsg)); - spin_lock(&imp->imp_lock); list_del_init(&req->rq_list); spin_unlock(&imp->imp_lock); @@ -1617,16 +1832,11 @@ restart: * req->rq_receiving_reply is clear and returns. */ ptlrpc_unregister_reply (req); - if (req->rq_err) - GOTO(out, rc = -EIO); - /* Resend if we need to, unless we were interrupted. */ - if (req->rq_resend && !req->rq_intr) { - /* ...unless we were specifically told otherwise. */ - if (req->rq_no_resend) - GOTO(out, rc = -ETIMEDOUT); - spin_lock(&imp->imp_lock); - goto restart; + if (req->rq_err) { + DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d", + rc, req->rq_status); + GOTO(out, rc = -EIO); } if (req->rq_intr) { @@ -1637,6 +1847,15 @@ restart: GOTO(out, rc = -EINTR); } + /* Resend if we need to */ + if (req->rq_resend) { + /* ...unless we were specifically told otherwise. */ + if (req->rq_no_resend) + GOTO(out, rc = -ETIMEDOUT); + spin_lock(&imp->imp_lock); + goto restart; + } + if (req->rq_timedout) { /* non-recoverable timeout */ GOTO(out, rc = -ETIMEDOUT); } @@ -1657,25 +1876,33 @@ restart: out: if (req->rq_bulk != NULL) { - if (rc >= 0) { + while(rc >= 0) { /* success so far. Note that anything going wrong * with bulk now, is EXTREMELY strange, since the * server must have believed that the bulk * tranferred OK before she replied with success to * me. */ + timeoutl = req->rq_deadline - cfs_time_current_sec(); + timeout = (timeoutl <= 0) ? CFS_TICK : + cfs_time_seconds(timeoutl); + lwi = LWI_TIMEOUT(timeout, NULL, NULL); brc = l_wait_event(req->rq_reply_waitq, !ptlrpc_bulk_active(req->rq_bulk), &lwi); LASSERT(brc == 0 || brc == -ETIMEDOUT); + if ((brc == -ETIMEDOUT) && + (req->rq_deadline > cfs_time_current_sec())) + /* Wait again if we changed deadline */ + continue; if (brc != 0) { - LASSERT(brc == -ETIMEDOUT); DEBUG_REQ(D_ERROR, req, "bulk timed out"); rc = brc; } else if (!req->rq_bulk->bd_success) { DEBUG_REQ(D_ERROR, req, "bulk transfer failed"); rc = -EIO; } + break; } if (rc < 0) ptlrpc_unregister_bulk (req); @@ -1769,10 +1996,11 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) aa->praa_old_state = req->rq_send_state; req->rq_send_state = LUSTRE_IMP_REPLAY; req->rq_phase = RQ_PHASE_NEW; - aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); + if (req->rq_repmsg) + aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); req->rq_status = 0; - req->rq_interpret_reply = ptlrpc_replay_interpret; + atomic_inc(&req->rq_import->imp_replay_inflight); ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ @@ -1804,6 +2032,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) spin_lock (&req->rq_lock); if (req->rq_import_generation < imp->imp_generation) { req->rq_err = 1; + req->rq_status = -EINTR; ptlrpc_wake_client_req(req); } spin_unlock (&req->rq_lock); @@ -1818,6 +2047,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) spin_lock (&req->rq_lock); if (req->rq_import_generation < imp->imp_generation) { req->rq_err = 1; + req->rq_status = -EINTR; ptlrpc_wake_client_req(req); } spin_unlock (&req->rq_lock); @@ -1854,3 +2084,4 @@ __u64 ptlrpc_sample_next_xid(void) return tmp; } EXPORT_SYMBOL(ptlrpc_sample_next_xid); + diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 25f13301bf..9df83cb20e 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -82,30 +82,71 @@ void reply_in_callback(lnet_event_t *ev) struct ptlrpc_request *req = cbid->cbid_arg; ENTRY; - LASSERT (ev->type == LNET_EVENT_PUT || - ev->type == LNET_EVENT_UNLINK); - LASSERT (ev->unlinked); - LASSERT (ev->md.start == req->rq_repmsg); - LASSERT (ev->offset == 0); - LASSERT (ev->mlength <= req->rq_replen); - DEBUG_REQ((ev->status == 0) ? D_NET : D_ERROR, req, "type %d, status %d", ev->type, ev->status); + + LASSERT(ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_UNLINK); + LASSERT(ev->md.start == req->rq_repbuf); + LASSERT(ev->mlength <= req->rq_replen); spin_lock(&req->rq_lock); - LASSERT (req->rq_receiving_reply); req->rq_receiving_reply = 0; + req->rq_early = 0; + + if (ev->status) + goto out_wake; + if (ev->type == LNET_EVENT_UNLINK) { + req->rq_must_unlink = 0; + DEBUG_REQ(D_RPCTRACE, req, "unlink"); + goto out_wake; + } - if (ev->type == LNET_EVENT_PUT && ev->status == 0) { + /* We've set LNET_MD_MANAGE_REMOTE for all outgoing requests + for adaptive timeouts' early reply. */ + LASSERT(ev->md.options & LNET_MD_MANAGE_REMOTE); + + if ((ev->offset == 0) && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) { + /* Early reply */ + DEBUG_REQ(D_ADAPTTO, req, + "Early reply received: mlen=%u offset=%d replen=%d " + "replied=%d", ev->mlength, ev->offset, req->rq_replen, + req->rq_replied); + + LASSERT(ev->mlength == lustre_msg_early_size()); + req->rq_early_count++; /* number received, client side */ + if (req->rq_replied) + /* If we already got the real reply, ignore the early + reply, but signal so we can unlink */ + goto out_wake; + req->rq_early = 1; + req->rq_nob_received = ev->mlength; + /* repmsg points to early reply */ + req->rq_repmsg = req->rq_repbuf; + /* And we're still receiving */ + req->rq_receiving_reply = 1; + } else { + /* Real reply */ req->rq_replied = 1; req->rq_nob_received = ev->mlength; + /* repmsg points to real reply */ + req->rq_repmsg = (struct lustre_msg *)((char *)req->rq_repbuf + + ev->offset); + /* LNetMDUnlink can't be called under the LNET_LOCK, + so we must unlink in ptlrpc_unregister_reply */ + DEBUG_REQ(D_INFO, req, + "reply in flags=%x mlen=%u offset=%d replen=%d", + lustre_msg_get_flags(req->rq_reqmsg), + ev->mlength, ev->offset, req->rq_replen); } + req->rq_import->imp_last_reply_time = cfs_time_current_sec(); + +out_wake: /* NB don't unlock till after wakeup; req can disappear under us * since we don't have our own ref */ ptlrpc_wake_client_req(req); - spin_unlock(&req->rq_lock); EXIT; } @@ -212,6 +253,12 @@ void request_in_callback(lnet_event_t *ev) #ifdef CRAY_XT3 req->rq_uid = ev->uid; #endif + spin_lock_init(&req->rq_lock); + sema_init(&req->rq_rs_sem, 1); + CFS_INIT_LIST_HEAD(&req->rq_timed_list); + atomic_set(&req->rq_refcount, 1); + if (ev->type == LNET_EVENT_PUT) + DEBUG_REQ(D_RPCTRACE, req, "incoming req"); spin_lock(&service->srv_lock); @@ -237,7 +284,7 @@ void request_in_callback(lnet_event_t *ev) rqbd->rqbd_refcount++; } - list_add_tail(&req->rq_list, &service->srv_request_queue); + list_add_tail(&req->rq_list, &service->srv_req_in_queue); service->srv_n_queued_reqs++; /* NB everything can disappear under us once the request diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 66adf403cc..c5106ba65b 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -185,8 +185,11 @@ void ptlrpc_deactivate_import(struct obd_import *imp) */ void ptlrpc_invalidate_import(struct obd_import *imp) { + struct list_head *tmp, *n; + struct ptlrpc_request *req; struct l_wait_info lwi; - int rc; + time_t last = 0; + int timeout, rc = 0; atomic_inc(&imp->imp_inval_count); @@ -196,16 +199,41 @@ void ptlrpc_invalidate_import(struct obd_import *imp) LASSERT(imp->imp_invalid); /* wait for all requests to error out and call completion callbacks */ - lwi = LWI_TIMEOUT_INTERVAL(cfs_timeout_cap(cfs_time_seconds(obd_timeout)), - HZ, NULL, NULL); - rc = l_wait_event(imp->imp_recovery_waitq, - (atomic_read(&imp->imp_inflight) == 0), - &lwi); + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + last = max(last, req->rq_deadline); + } + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + last = max(last, req->rq_deadline); + } + spin_unlock(&imp->imp_lock); - if (rc) - CDEBUG(D_HA, "%s: rc = %d waiting for callback (%d != 0)\n", + timeout = (int)(last - cfs_time_current_sec()); + if (timeout > 0) { + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(timeout), + HZ, NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), + &lwi); + } + + if (atomic_read(&imp->imp_inflight)) { + CERROR("%s: rc = %d waiting for callback (%d != 0)\n", obd2cli_tgt(imp->imp_obd), rc, atomic_read(&imp->imp_inflight)); + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + DEBUG_REQ(D_ERROR, req, "still on sending list"); + } + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + DEBUG_REQ(D_ERROR, req, "still on delayed list"); + } + spin_unlock(&imp->imp_lock); + } obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); @@ -257,6 +285,7 @@ static int import_select_connection(struct obd_import *imp) { struct obd_import_conn *imp_conn = NULL, *conn; struct obd_export *dlmexp; + int tried_all = 1; ENTRY; spin_lock(&imp->imp_lock); @@ -273,44 +302,57 @@ static int import_select_connection(struct obd_import *imp) imp->imp_obd->obd_name, libcfs_nid2str(conn->oic_conn->c_peer.nid), conn->oic_last_attempt); - /* Throttle the reconnect rate to once per RECONNECT_INTERVAL */ - if (cfs_time_before_64(conn->oic_last_attempt + - RECONNECT_INTERVAL * HZ, - cfs_time_current_64())) { - /* If we have never tried this connection since the - the last successful attempt, go with this one */ - if (cfs_time_beforeq_64(conn->oic_last_attempt, - imp->imp_last_success_conn)) { - imp_conn = conn; - break; - } + + /* Don't thrash connections */ + if (cfs_time_before_64(cfs_time_current_64(), + conn->oic_last_attempt + + cfs_time_seconds(CONNECTION_SWITCH_MIN))) { + continue; + } - /* Both of these connections have already been tried - since the last successful connection; just choose the - least recently used */ - if (!imp_conn) - imp_conn = conn; - else if (cfs_time_before_64(conn->oic_last_attempt, - imp_conn->oic_last_attempt)) - imp_conn = conn; - } else { - /* Exceptionally unlikely case caused by the node - * booting and attempting to mount lustre faster than - * than RECONNECT_INTERVAL seconds. */ - if (unlikely(conn->oic_last_attempt == 0)) { - imp_conn = conn; - break; - } + /* If we have not tried this connection since the + the last successful attempt, go with this one */ + if ((conn->oic_last_attempt == 0) || + cfs_time_beforeq_64(conn->oic_last_attempt, + imp->imp_last_success_conn)) { + imp_conn = conn; + tried_all = 0; + break; } + + /* If all of the connections have already been tried + since the last successful connection; just choose the + least recently used */ + if (!imp_conn) + imp_conn = conn; + else if (cfs_time_before_64(conn->oic_last_attempt, + imp_conn->oic_last_attempt)) + imp_conn = conn; } /* if not found, simply choose the current one */ if (!imp_conn) { LASSERT(imp->imp_conn_current); imp_conn = imp->imp_conn_current; + tried_all = 0; } LASSERT(imp_conn->oic_conn); + /* If we've tried everything, and we're back to the beginning of the + list, wait for LND_TIMEOUT to give the queues a chance to drain. */ + if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) { + int must_wait; + LASSERT(imp_conn->oic_last_attempt); + must_wait = LND_TIMEOUT - + (int)cfs_duration_sec(cfs_time_current_64() - + imp_conn->oic_last_attempt); + imp->imp_at.iat_drain = max(0, must_wait); + CWARN("Tried all connections, %lus drain time\n", + imp->imp_at.iat_drain); + } else { + imp->imp_at.iat_drain = 0; + } + imp_conn->oic_last_attempt = cfs_time_current_64(); /* switch connection, don't mind if it's same as the current one */ @@ -455,21 +497,19 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) aa->pcaa_peer_committed = committed_before_reconnect; aa->pcaa_initial_connect = initial_connect; - if (aa->pcaa_initial_connect) { spin_lock(&imp->imp_lock); imp->imp_replayable = 1; spin_unlock(&imp->imp_lock); - /* On an initial connect, we don't know which one of a - failover server pair is up. Don't wait long. */ -#ifdef CRAY_XT3 - request->rq_timeout = max((int)(obd_timeout / 2), 5); -#else - request->rq_timeout = max((int)(obd_timeout / 20), 5); -#endif + if (AT_OFF) + /* AT will use INITIAL_CONNECT_TIMEOUT the first + time, adaptive after that. */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; } - DEBUG_REQ(D_RPCTRACE, request, "(re)connect request"); + DEBUG_REQ(D_RPCTRACE, request, "%sconnect request %d", + aa->pcaa_initial_connect ? "initial " : "re", + imp->imp_conn_cnt); ptlrpcd_add_req(request); rc = 0; out: @@ -528,6 +568,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, ENTRY; spin_lock(&imp->imp_lock); + imp->imp_at.iat_drain = 0; if (imp->imp_state == LUSTRE_IMP_CLOSED) { spin_unlock(&imp->imp_lock); RETURN(0); @@ -633,6 +674,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, imp->imp_last_replay_transno = 0; IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); } else { + DEBUG_REQ(D_HA, request, "evicting, flags=%x", msg_flags); imp->imp_remote_handle = *lustre_msg_get_handle(request->rq_repmsg); IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); @@ -745,6 +787,20 @@ finish: ocd->ocd_brw_size >> CFS_PAGE_SHIFT; } + if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) && + (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2)) + /* We need a per-message support flag, because + a. we don't know if the incoming connect reply + supports AT or not (in reply_in_callback) + until we unpack it. + b. failovered server means export and flags are gone + (in ptlrpc_send_reply). + Can only be set when we know AT is supported at + both ends */ + imp->imp_msg_flags |= MSG_AT_SUPPORT; + else + imp->imp_msg_flags &= ~MSG_AT_SUPPORT; + LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) && (cli->cl_max_pages_per_rpc > 0)); } @@ -991,13 +1047,19 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) if (ptlrpc_import_in_recovery(imp)) { struct l_wait_info lwi; - cfs_duration_t timeout = cfs_time_seconds(obd_timeout); + cfs_duration_t timeout; + int idx; + if (AT_OFF || (idx = import_at_get_index(imp, + imp->imp_client->cli_request_portal)) < 0) + timeout = cfs_time_seconds(obd_timeout); + else + timeout = cfs_time_seconds( + at_get(&imp->imp_at.iat_service_estimate[idx])); lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout), back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL); rc = l_wait_event(imp->imp_recovery_waitq, !ptlrpc_import_in_recovery(imp), &lwi); - } spin_lock(&imp->imp_lock); @@ -1012,11 +1074,18 @@ int ptlrpc_disconnect_import(struct obd_import *imp, int noclose) * it fails. We can get through the above with a down server * if the client doesn't know the server is gone yet. */ req->rq_no_resend = 1; -#ifdef CRAY_XT3 - req->rq_timeout = obd_timeout / 3; + +#ifndef CRAY_XT3 + /* We want client umounts to happen quickly, no matter the + server state... */ + req->rq_timeout = min(req->rq_timeout, INITIAL_CONNECT_TIMEOUT); #else - req->rq_timeout = 5; + /* ... but we always want liblustre clients to nicely + disconnect, so only use the adaptive value. */ + if (AT_OFF) + req->rq_timeout = obd_timeout / 3; #endif + IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING); req->rq_send_state = LUSTRE_IMP_CONNECTING; ptlrpc_req_set_repsize(req, 1, NULL); @@ -1031,6 +1100,8 @@ out: else IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED); memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle)); + /* Try all connections in the future - bz 12758 */ + imp->imp_last_recon = 0; spin_unlock(&imp->imp_lock); RETURN(rc); @@ -1044,3 +1115,142 @@ void ptlrpc_import_setasync(struct obd_import *imp, int count) LNetSetAsync(imp->imp_connection->c_peer, count); } + +/* Adaptive Timeout utils */ + +/* Bin into timeslices using AT_BINS bins. + This gives us a max of the last binlimit*AT_BINS secs without the storage, + but still smoothing out a return to normalcy from a slow response. + (E.g. remember the maximum latency in each minute of the last 4 minutes.) */ +void at_add(struct adaptive_timeout *at, unsigned int val) { + /*unsigned int old = at->at_current;*/ + time_t now = cfs_time_current_sec(); + + LASSERT(at); +#if 0 + CDEBUG(D_INFO, "add %u to %p time=%lu tb=%lu v=%u (%u %u %u %u)\n", + val, at, now - at->at_binstart, at->at_binlimit, at->at_current, + at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]); +#endif + if (val == 0) + /* 0's don't count, because we never want our timeout to + drop to 0, and because 0 could mean an error */ + return; + + spin_lock(&at->at_lock); + + if (unlikely(at->at_binstart == 0)) { + /* Special case to remove default from history */ + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = now; + at->at_hist[0] = val; + at->at_binstart = now; + } else if (now - at->at_binstart < at->at_binlimit ) { + /* in bin 0 */ + at->at_hist[0] = max(val, at->at_hist[0]); + at->at_current = max(val, at->at_current); + } else { + int i, shift; + unsigned int maxv = val; + /* move bins over */ + shift = (now - at->at_binstart) / at->at_binlimit; + LASSERT(shift > 0); + for(i = AT_BINS - 1; i >= 0; i--) { + if (i >= shift) { + at->at_hist[i] = at->at_hist[i - shift]; + maxv = max(maxv, at->at_hist[i]); + } else { + at->at_hist[i] = 0; + } + } + at->at_hist[0] = val; + at->at_current = maxv; + at->at_binstart += shift * at->at_binlimit; + } + + if ((at->at_flags & AT_FLG_MIN) && + (at->at_current < adaptive_timeout_min)) + at->at_current = adaptive_timeout_min; + + if (at->at_current > at->at_worst_ever) { + at->at_worst_ever = at->at_current; + at->at_worst_time = now; + } + + if (at->at_flags & AT_FLG_NOHIST) + /* Only keep last reported val; keeping the rest of the history + for proc only */ + at->at_current = val; + +#if 0 + if (at->at_current != old) + CDEBUG(D_ADAPTTO, "AT change: old=%u new=%u delta=%d (val=%u) " + "hist %u %u %u %u\n", + old, at->at_current, at->at_current - old, val, + at->at_hist[0], at->at_hist[1], at->at_hist[2], + at->at_hist[3]); +#endif + spin_unlock(&at->at_lock); +} + +/* Find the imp_at index for a given portal; assign if space available */ +int import_at_get_index(struct obd_import *imp, int portal) { + struct imp_at *at = &imp->imp_at; + int i; + + for (i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + return i; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + /* Not found in list, add it under a lock */ + spin_lock(&imp->imp_lock); + + /* Check unused under lock */ + for (; i < IMP_AT_MAX_PORTALS; i++) { + if (at->iat_portal[i] == portal) + goto out; + if (at->iat_portal[i] == 0) + /* unused */ + break; + } + + if (i >= IMP_AT_MAX_PORTALS) { + CERROR("Tried to use more than %d portals, not enough room " + "in adaptive timeout stats.\n", IMP_AT_MAX_PORTALS); + i = -1; + goto out; + } + at->iat_portal[i] = portal; + +out: + spin_unlock(&imp->imp_lock); + return i; +} + +/* Get total expected lock callback time (net + service). + Since any early reply will only affect the RPC wait time, and not + any local lock timer we set based on the return value here, + we should be conservative. */ +int import_at_get_ldlm(struct obd_import *imp) { + int idx, tot; + + if (!imp || !imp->imp_client || AT_OFF) + return obd_timeout; + + tot = at_get(&imp->imp_at.iat_net_latency); + idx = import_at_get_index(imp, imp->imp_client->cli_request_portal); + if (idx < 0) + tot += obd_timeout; + else + tot += at_get(&imp->imp_at.iat_service_estimate[idx]); + + /* add an arbitrary minimum: 150% + 10 sec */ + tot += (tot >> 1) + 10; + return tot; +} + diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 9d9ca6cc7b..3901854043 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -153,6 +153,8 @@ void ptlrpc_lprocfs_register(struct proc_dir_entry *root, char *dir, svc_counter_config, "req_qdepth", "reqs"); lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, svc_counter_config, "req_active", "reqs"); + lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, + svc_counter_config, "adaptive_timeout", "sec"); lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, svc_counter_config, "reqbuf_avail", "bufs"); for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { @@ -385,6 +387,56 @@ ptlrpc_lprocfs_svc_req_history_open(struct inode *inode, struct file *file) return 0; } +/* See also lprocfs_rd_timeouts */ +static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + unsigned int cur, worst; + time_t worstt; + struct dhms ts; + int rc = 0; + + *eof = 1; + cur = at_get(&svc->srv_at_estimate); + worst = svc->srv_at_estimate.at_worst_ever; + worstt = svc->srv_at_estimate.at_worst_time; + s2dhms(&ts, cfs_time_current_sec() - worstt); + if (AT_OFF) + rc += snprintf(page + rc, count - rc, + "adaptive timeouts off, using obd_timeout %u\n", + obd_timeout); + rc += snprintf(page + rc, count - rc, + "%10s : %ld sec\n", "timebase", + svc->srv_at_estimate.at_binlimit * AT_BINS); + rc += snprintf(page + rc, count - rc, + "%10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", + "service", cur, worst, worstt, + DHMS_VARS(&ts)); + rc = lprocfs_at_hist_helper(page, count, rc, + &svc->srv_at_estimate); + return rc; +} + +static int ptlrpc_lprocfs_wr_timeouts(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ptlrpc_service *svc = data; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + if (val <= 0) + return -ERANGE; + + spin_lock(&svc->srv_at_estimate.at_lock); + svc->srv_at_estimate.at_binlimit = max(1, val / AT_BINS); + spin_unlock(&svc->srv_at_estimate.at_lock); + + return count; +} + void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, struct ptlrpc_service *svc) { @@ -397,6 +449,10 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, .write_fptr = ptlrpc_lprocfs_write_req_history_max, .read_fptr = ptlrpc_lprocfs_read_req_history_max, .data = svc}, + {.name = "timeouts", + .write_fptr = ptlrpc_lprocfs_wr_timeouts, + .read_fptr = ptlrpc_lprocfs_rd_timeouts, + .data = svc}, {NULL} }; static struct file_operations req_history_fops = { diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index 5a863ecf67..49bbcdef3b 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -35,7 +35,8 @@ static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len, lnet_ack_req_t ack, struct ptlrpc_cb_id *cbid, - struct ptlrpc_connection *conn, int portal, __u64 xid) + struct ptlrpc_connection *conn, int portal, __u64 xid, + unsigned int offset) { int rc; lnet_md_t md; @@ -65,11 +66,11 @@ static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len, RETURN (-ENOMEM); } - CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64"\n", - len, portal, xid); + CDEBUG(D_NET, "Sending %d bytes to portal %d, xid "LPD64", offset %u\n", + len, portal, xid, offset); rc = LNetPut (conn->c_self, *mdh, ack, - conn->c_peer, portal, xid, 0, 0); + conn->c_peer, portal, xid, offset, 0); if (rc != 0) { int rc2; /* We're going to get an UNLINK event when I unlink below, @@ -305,11 +306,13 @@ void ptlrpc_unregister_bulk (struct ptlrpc_request *req) } } -int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) +int ptlrpc_send_reply (struct ptlrpc_request *req, int flags) { struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; struct ptlrpc_reply_state *rs = req->rq_reply_state; struct ptlrpc_connection *conn; + int service_time; + unsigned int offset = 0; int rc; /* We must already have a reply buffer (only ptlrpc_error() may be @@ -320,10 +323,12 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) LASSERT (req->rq_repmsg != NULL); LASSERT (rs != NULL); LASSERT (req->rq_repmsg == rs->rs_msg); - LASSERT (may_be_difficult || !rs->rs_difficult); + LASSERT ((flags & PTLRPC_REPLY_MAYBE_DIFFICULT) || !rs->rs_difficult); LASSERT (rs->rs_cb_id.cbid_fn == reply_out_callback); LASSERT (rs->rs_cb_id.cbid_arg == rs); + /* There may be no rq_export during failover */ + if (req->rq_export && req->rq_export->exp_obd && req->rq_export->exp_obd->obd_fail) { /* Failed obd's only send ENODEV */ @@ -339,6 +344,44 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) lustre_msg_set_type(req->rq_repmsg, req->rq_type); lustre_msg_set_status(req->rq_repmsg, req->rq_status); lustre_msg_set_opc(req->rq_repmsg, lustre_msg_get_opc(req->rq_reqmsg)); + + service_time = max_t(int, cfs_time_current_sec() - + req->rq_arrival_time.tv_sec, 1); + if (!(flags & PTLRPC_REPLY_EARLY) && + (req->rq_type != PTL_RPC_MSG_ERR)) { + int oldse = at_get(&svc->srv_at_estimate); + /* early replies and errors don't count toward our service + time estimate */ + at_add(&svc->srv_at_estimate, service_time); + if (service_time > oldse) + DEBUG_REQ(D_ADAPTTO, req, + "svc %s increased estimate from %d to %d", + svc->srv_name, oldse, service_time); + } + /* Report actual service time for client latency calc */ + lustre_msg_set_service_time(req->rq_repmsg, service_time); + /* Report service time estimate for future client reqs */ + lustre_msg_set_timeout(req->rq_repmsg, at_get(&svc->srv_at_estimate)); + + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) { + /* early replies go to offset 0, regular replies go after that*/ + if (flags & PTLRPC_REPLY_EARLY) { + offset = 0; + /* Include a checksum on early replies - must be done + after all other lustre_msg_set_* */ + lustre_msg_set_cksum(req->rq_repmsg, + lustre_msg_calc_cksum(req->rq_repmsg)); + } else { + offset = lustre_msg_early_size(); + } + } else { + CDEBUG(D_ADAPTTO, "No early reply support: flags=%#x " + "req_flags=%#x magic=%d:%x/%x len=%d\n", + flags, lustre_msg_get_flags(req->rq_reqmsg), + lustre_msg_is_v1(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_reqmsg), + lustre_msg_get_magic(req->rq_repmsg), req->rq_replen); + } if (req->rq_export == NULL || req->rq_export->exp_connection == NULL) conn = ptlrpc_get_connection(req->rq_peer, req->rq_self, NULL); @@ -349,16 +392,18 @@ int ptlrpc_send_reply (struct ptlrpc_request *req, int may_be_difficult) CERROR("not replying on NULL connection\n"); /* bug 9635 */ return -ENOTCONN; } + atomic_inc (&svc->srv_outstanding_replies); ptlrpc_rs_addref(rs); /* +1 ref for the network */ - + req->rq_sent = cfs_time_current_sec(); + rc = ptl_send_buf (&rs->rs_md_h, req->rq_repmsg, req->rq_replen, rs->rs_difficult ? LNET_ACK_REQ : LNET_NOACK_REQ, - &rs->rs_cb_id, conn, - svc->srv_rep_portal, req->rq_xid); + &rs->rs_cb_id, conn, svc->srv_rep_portal, + req->rq_xid, offset); if (rc != 0) { atomic_dec (&svc->srv_outstanding_replies); - ptlrpc_rs_decref(rs); + ptlrpc_req_drop_rs(req); } ptlrpc_put_connection(conn); return rc; @@ -428,10 +473,11 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (!noreply) { LASSERT (request->rq_replen != 0); - if (request->rq_repmsg == NULL) - OBD_ALLOC(request->rq_repmsg, request->rq_replen); - if (request->rq_repmsg == NULL) + if (request->rq_repbuf == NULL) + OBD_ALLOC(request->rq_repbuf, request->rq_replen); + if (request->rq_repbuf == NULL) GOTO(cleanup_bulk, rc = -ENOMEM); + request->rq_repmsg = NULL; rc = LNetMEAttach(request->rq_reply_portal,/*XXX FIXME bug 249*/ connection->c_peer, request->rq_xid, 0, @@ -446,6 +492,8 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) spin_lock(&request->rq_lock); /* If the MD attach succeeds, there _will_ be a reply_in callback */ request->rq_receiving_reply = !noreply; + /* We are responsible for unlinking the reply buffer */ + request->rq_must_unlink = !noreply; /* Clear any flags that may be present from previous sends. */ request->rq_replied = 0; request->rq_err = 0; @@ -456,14 +504,19 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) spin_unlock(&request->rq_lock); if (!noreply) { - reply_md.start = request->rq_repmsg; + reply_md.start = request->rq_repbuf; reply_md.length = request->rq_replen; - reply_md.threshold = 1; - reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT; + /* Allow multiple early replies */ + reply_md.threshold = LNET_MD_THRESH_INF; + /* Manage remote for early replies */ + reply_md.options = PTLRPC_MD_OPTIONS | LNET_MD_OP_PUT | + LNET_MD_MANAGE_REMOTE; reply_md.user_ptr = &request->rq_reply_cbid; reply_md.eq_handle = ptlrpc_eq_h; - rc = LNetMDAttach(reply_me_h, reply_md, LNET_UNLINK, + /* We must see the unlink callback to unset rq_must_unlink, + so we can't auto-unlink */ + rc = LNetMDAttach(reply_me_h, reply_md, LNET_RETAIN, &request->rq_reply_md_h); if (rc != 0) { CERROR("LNetMDAttach failed: %d\n", rc); @@ -487,14 +540,22 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_SEND, request->rq_timeout + 5); - request->rq_sent = CURRENT_SECONDS; + request->rq_sent = cfs_time_current_sec(); + /* We give the server rq_timeout secs to process the req, and + add the network latency for our local timeout. */ + request->rq_deadline = request->rq_sent + request->rq_timeout + + ptlrpc_at_get_net_latency(request); + ptlrpc_pinger_sending_on_import(request->rq_import); + + DEBUG_REQ(D_INFO, request, "send flg=%x", + lustre_msg_get_flags(request->rq_reqmsg)); rc = ptl_send_buf(&request->rq_req_md_h, request->rq_reqmsg, request->rq_reqlen, LNET_NOACK_REQ, &request->rq_req_cbid, connection, request->rq_request_portal, - request->rq_xid); + request->rq_xid, 0); if (rc == 0) { ptlrpc_lprocfs_rpc_sent(request); RETURN(rc); @@ -506,8 +567,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) if (noreply) RETURN(rc); - else - GOTO(cleanup_me, rc); + cleanup_me: /* MEUnlink is safe; the PUT didn't even get off the ground, and * nobody apart from the PUT's target has the right nid+XID to @@ -518,8 +578,9 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) LASSERT (!request->rq_receiving_reply); cleanup_repmsg: - OBD_FREE(request->rq_repmsg, request->rq_replen); - request->rq_repmsg = NULL; + OBD_FREE(request->rq_repbuf, request->rq_replen); + request->rq_repbuf = NULL; + request->rq_repmsg = NULL; //remove cleanup_bulk: if (request->rq_bulk != NULL) diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 6f5306371b..09ec8e1455 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -83,6 +83,15 @@ int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) } } +/* early reply size */ +int lustre_msg_early_size() { + static int size = 0; + if (!size) + size = lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL); + return size; +} +EXPORT_SYMBOL(lustre_msg_early_size); + static inline int lustre_msg_size_v1(int count, int *lengths) { int size; @@ -244,6 +253,7 @@ static int lustre_pack_request_v2(struct ptlrpc_request *req, lustre_init_msg_v2(req->rq_reqmsg, count, lens, bufs); lustre_msg_add_version(req->rq_reqmsg, PTLRPC_MSG_VERSION); lustre_set_req_swabbed(req, MSG_PTLRPC_BODY_OFF); + return 0; } @@ -329,22 +339,31 @@ out: } static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count, - int *lens, char **bufs) + int *lens, char **bufs, int flags) { struct ptlrpc_reply_state *rs; int msg_len; int size; ENTRY; - LASSERT (req->rq_reply_state == NULL); + /* Insure only 1 reply at a time: hold sem until ptlrpc_req_drop_rs */ + down(&req->rq_rs_sem); + LASSERT(req->rq_reply_state == NULL); + if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) { + /* Already packed final, no more early */ + up(&req->rq_rs_sem); + RETURN(-EALREADY); + } msg_len = lustre_msg_size_v1(count, lens); size = sizeof(struct ptlrpc_reply_state) + msg_len; OBD_ALLOC(rs, size); if (unlikely(rs == NULL)) { rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size); - if (!rs) + if (!rs) { + up(&req->rq_rs_sem); RETURN (-ENOMEM); + } } atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ rs->rs_cb_id.cbid_fn = reply_out_callback; @@ -358,6 +377,9 @@ static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count, req->rq_replen = msg_len; req->rq_reply_state = rs; req->rq_repmsg = rs->rs_msg; + if (!flags) + req->rq_final = 1; /* checked in lustre_packed_reply */ + lustre_init_msg_v1(rs->rs_msg, count, lens, bufs); PTLRPC_RS_DEBUG_LRU_ADD(rs); @@ -366,22 +388,31 @@ static int lustre_pack_reply_v1(struct ptlrpc_request *req, int count, } static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, - int *lens, char **bufs) + int *lens, char **bufs, int flags) { struct ptlrpc_reply_state *rs; int msg_len; int size; ENTRY; + /* Insure only 1 reply at a time: hold sem until ptlrpc_req_drop_rs */ + down(&req->rq_rs_sem); LASSERT(req->rq_reply_state == NULL); + if (req->rq_final && (flags & LPRFL_EARLY_REPLY)) { + /* Already packed final, no more early */ + up(&req->rq_rs_sem); + RETURN(-EALREADY); + } msg_len = lustre_msg_size_v2(count, lens); size = sizeof(struct ptlrpc_reply_state) + msg_len; OBD_ALLOC(rs, size); if (unlikely(rs == NULL)) { rs = lustre_get_emerg_rs(req->rq_rqbd->rqbd_service, size); - if (!rs) + if (!rs) { + up(&req->rq_rs_sem); RETURN (-ENOMEM); + } } atomic_set(&rs->rs_refcount, 1); /* 1 ref for rq_reply_state */ rs->rs_cb_id.cbid_fn = reply_out_callback; @@ -395,6 +426,10 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, req->rq_replen = msg_len; req->rq_reply_state = rs; req->rq_repmsg = rs->rs_msg; + if (!(flags & LPRFL_EARLY_REPLY)) + req->rq_final = 1; /* checked in lustre_packed_reply */ + + /* server side, no rq_repbuf */ lustre_init_msg_v2(rs->rs_msg, count, lens, bufs); lustre_msg_add_version(rs->rs_msg, PTLRPC_MSG_VERSION); lustre_set_rep_swabbed(req, MSG_PTLRPC_BODY_OFF); @@ -404,8 +439,8 @@ static int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, RETURN(0); } -int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens, - char **bufs) +int lustre_pack_reply_flags(struct ptlrpc_request *req, int count, int *lens, + char **bufs, int flags) { int size[] = { sizeof(struct ptlrpc_body) }; @@ -421,10 +456,10 @@ int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens, case LUSTRE_MSG_MAGIC_V1: case LUSTRE_MSG_MAGIC_V1_SWABBED: return lustre_pack_reply_v1(req, count - 1, lens + 1, - bufs ? bufs + 1 : NULL); + bufs ? bufs + 1 : NULL, flags); case LUSTRE_MSG_MAGIC_V2: case LUSTRE_MSG_MAGIC_V2_SWABBED: - return lustre_pack_reply_v2(req, count, lens, bufs); + return lustre_pack_reply_v2(req, count, lens, bufs, flags); default: LASSERTF(0, "incorrect message magic: %08x\n", req->rq_reqmsg->lm_magic); @@ -432,6 +467,13 @@ int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens, } } +int lustre_pack_reply(struct ptlrpc_request *req, int count, int *lens, + char **bufs) +{ + return lustre_pack_reply_flags(req, count, lens, bufs, 0); +} + + void *lustre_msg_buf_v1(void *msg, int n, int min_size) { struct lustre_msg_v1 *m = (struct lustre_msg_v1 *)msg; @@ -717,7 +759,7 @@ static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) __swab32s(&m->lm_bufcount); __swab32s(&m->lm_secflvr); __swab32s(&m->lm_repsize); - __swab32s(&m->lm_timeout); + __swab32s(&m->lm_cksum); CLASSERT(offsetof(typeof(*m), lm_padding_1) != 0); CLASSERT(offsetof(typeof(*m), lm_padding_2) != 0); CLASSERT(offsetof(typeof(*m), lm_padding_3) != 0); @@ -1406,6 +1448,17 @@ __u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg) } } +int lustre_msg_is_v1(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 1; + default: + return 0; + } +} + __u32 lustre_msg_get_magic(struct lustre_msg *msg) { switch (msg->lm_magic) { @@ -1420,6 +1473,88 @@ __u32 lustre_msg_get_magic(struct lustre_msg *msg) } } +__u32 lustre_msg_get_timeout(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + case LUSTRE_MSG_MAGIC_V2_SWABBED: { + struct ptlrpc_body *pb; + + pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_timeout; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_get_service_time(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + case LUSTRE_MSG_MAGIC_V2_SWABBED: { + struct ptlrpc_body *pb; + + pb = lustre_msg_buf(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + if (!pb) { + CERROR("invalid msg %p: no ptlrpc body!\n", msg); + return 0; + + } + return pb->pb_service_time; + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_get_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + case LUSTRE_MSG_MAGIC_V2_SWABBED: + return msg->lm_cksum; + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + case LUSTRE_MSG_MAGIC_V1_SWABBED: + return 0; + case LUSTRE_MSG_MAGIC_V2: + case LUSTRE_MSG_MAGIC_V2_SWABBED: { + struct ptlrpc_body *pb; + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + return crc32_le(~(__u32)0, (char *)pb, sizeof(*pb)); + } + default: + CERROR("incorrect message magic: %08x\n", msg->lm_magic); + return 0; + } +} + void lustre_msg_set_handle(struct lustre_msg *msg, struct lustre_handle *handle) { switch (msg->lm_magic) { @@ -1572,6 +1707,56 @@ void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt) } } +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_timeout = timeout; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: { + struct ptlrpc_body *pb; + + pb = lustre_msg_buf_v2(msg, MSG_PTLRPC_BODY_OFF, sizeof(*pb)); + LASSERTF(pb, "invalid msg %p: no ptlrpc body!\n", msg); + pb->pb_service_time = service_time; + return; + } + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum) +{ + switch (msg->lm_magic) { + case LUSTRE_MSG_MAGIC_V1: + return; + case LUSTRE_MSG_MAGIC_V2: + msg->lm_cksum = cksum; + return; + default: + LASSERTF(0, "incorrect message magic: %08x\n", msg->lm_magic); + } +} + + /* byte flipping routines for all wire types declared in * lustre_idl.h implemented here. */ @@ -1588,11 +1773,11 @@ void lustre_swab_ptlrpc_body(struct ptlrpc_body *b) __swab32s (&b->pb_flags); __swab32s (&b->pb_op_flags); __swab32s (&b->pb_conn_cnt); + __swab32s (&b->pb_timeout); + __swab32s (&b->pb_service_time); CLASSERT(offsetof(typeof(*b), pb_padding_1) != 0); CLASSERT(offsetof(typeof(*b), pb_padding_2) != 0); CLASSERT(offsetof(typeof(*b), pb_padding_3) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding_4) != 0); - CLASSERT(offsetof(typeof(*b), pb_padding_5) != 0); } void lustre_swab_connect(struct obd_connect_data *ocd) @@ -2134,8 +2319,8 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, va_start(args, fmt); libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file, data->msg_fn, data->msg_line, fmt, args, - " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d ref %d fl " - REQ_FLAGS_FMT"/%x/%x rc %d/%d\n", + " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d " + "e %d ref %d fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n", req, req->rq_xid, req->rq_transno, req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1, req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) : @@ -2147,8 +2332,8 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, (char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "<?>", (req->rq_import && req->rq_import->imp_client) ? req->rq_import->imp_client->cli_request_portal : -1, - req->rq_reqlen, req->rq_replen, atomic_read(&req->rq_refcount), - DEBUG_REQ_FLAGS(req), + req->rq_reqlen, req->rq_replen, req->rq_early_count, + atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req), req->rq_reqmsg ? lustre_msg_get_flags(req->rq_reqmsg) : 0, req->rq_repmsg ? lustre_msg_get_flags(req->rq_repmsg) : 0, req->rq_status, diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 52d95743e9..4f320c2665 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -65,9 +65,12 @@ int ptlrpc_ping(struct obd_import *imp) static void ptlrpc_update_next_ping(struct obd_import *imp) { - imp->imp_next_ping = cfs_time_shift( - (imp->imp_state == LUSTRE_IMP_DISCON ? - RECONNECT_INTERVAL : PING_INTERVAL)); + int time = (imp->imp_state != LUSTRE_IMP_DISCON) ? PING_INTERVAL : + /* FIXME should this be limited to LND_TIMEOUT so we don't + build up pings in LND output queues? */ + max_t(int, CONNECTION_SWITCH_MIN, + at_get(&imp->imp_at.iat_net_latency)); + imp->imp_next_ping = cfs_time_shift(time); } void ptlrpc_ping_import_soon(struct obd_import *imp) @@ -364,7 +367,7 @@ static int ping_evictor_main(void *arg) obd = pet_exp->exp_obd; spin_unlock(&pet_lock); - expire_time = CURRENT_SECONDS - (3 * obd_timeout / 2); + expire_time = cfs_time_current_sec() - PING_EVICT_TIMEOUT; CDEBUG(D_HA, "evicting all exports of obd %s older than %ld\n", obd->obd_name, expire_time); @@ -386,7 +389,7 @@ static int ping_evictor_main(void *arg) " it.\n", obd->obd_name, obd_uuid2str(&exp->exp_client_uuid), obd_export_nid2str(exp), - (long)(CURRENT_SECONDS - + (long)(cfs_time_current_sec() - exp->exp_last_request_time)); CDEBUG(D_HA, "Last request was at %ld\n", exp->exp_last_request_time); diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index 8648fd741c..25ec036c45 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -115,6 +115,7 @@ enum { PTLRPC_REQWAIT_CNTR = 0, PTLRPC_REQQDEPTH_CNTR, PTLRPC_REQACTIVE_CNTR, + PTLRPC_TIMEOUT, PTLRPC_REQBUF_AVAIL_CNTR, PTLRPC_LAST_CNTR }; diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 2ecf093c61..ba4c3792c3 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -176,6 +176,7 @@ EXPORT_SYMBOL(lustre_msg_swabbed); EXPORT_SYMBOL(lustre_msg_check_version); EXPORT_SYMBOL(lustre_pack_request); EXPORT_SYMBOL(lustre_pack_reply); +EXPORT_SYMBOL(lustre_pack_reply_flags); EXPORT_SYMBOL(lustre_shrink_reply); EXPORT_SYMBOL(lustre_free_reply_state); EXPORT_SYMBOL(lustre_msg_size); @@ -231,6 +232,7 @@ EXPORT_SYMBOL(lustre_msg_get_last_committed); EXPORT_SYMBOL(lustre_msg_get_transno); EXPORT_SYMBOL(lustre_msg_get_status); EXPORT_SYMBOL(lustre_msg_get_conn_cnt); +EXPORT_SYMBOL(lustre_msg_is_v1); EXPORT_SYMBOL(lustre_msg_get_magic); EXPORT_SYMBOL(lustre_msg_set_handle); EXPORT_SYMBOL(lustre_msg_set_type); diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index af8ffbf327..42918aaf86 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -251,10 +251,20 @@ ptlrpc_server_post_idle_rqbds (struct ptlrpc_service *svc) return (-1); } +static void ptlrpc_at_timer(unsigned long castmeharder) +{ + struct ptlrpc_service *svc = (struct ptlrpc_service *)castmeharder; + CDEBUG(D_ADAPTTO, "at timer %s hit at %ld%s\n", + svc->srv_name, cfs_time_current_sec(), + list_empty(&svc->srv_at_list) ? ", empty" : ""); + svc->srv_at_check = 1; + cfs_waitq_signal(&svc->srv_waitq); +} + /* @threadname should be 11 characters or less - 3 will be added on */ struct ptlrpc_service * ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, - int req_portal, int rep_portal, int watchdog_timeout, + int req_portal, int rep_portal, int watchdog_factor, svc_handler_t handler, char *name, cfs_proc_dir_entry_t *proc_entry, svcreq_printfn_t svcreq_printfn, @@ -283,7 +293,7 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, service->srv_buf_size = bufsize; service->srv_rep_portal = rep_portal; service->srv_req_portal = req_portal; - service->srv_watchdog_timeout = watchdog_timeout; + service->srv_watchdog_factor = watchdog_factor; service->srv_handler = handler; service->srv_request_history_print_fn = svcreq_printfn; service->srv_request_seq = 1; /* valid seq #s start at 1 */ @@ -305,6 +315,14 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, CFS_INIT_LIST_HEAD(&service->srv_free_rs_list); cfs_waitq_init(&service->srv_free_rs_waitq); + spin_lock_init(&service->srv_at_lock); + CFS_INIT_LIST_HEAD(&service->srv_req_in_queue); + CFS_INIT_LIST_HEAD(&service->srv_at_list); + cfs_timer_init(&service->srv_at_timer, ptlrpc_at_timer, service); + /* At SOW, service time should be quick; 10s seems generous. If client + timeout is less than this, we'll be sending an early reply. */ + at_init(&service->srv_at_estimate, 10, AT_TIMEBASE_DEFAULT, 0); + spin_lock (&ptlrpc_all_services_lock); list_add (&service->srv_list, &ptlrpc_all_services); spin_unlock (&ptlrpc_all_services_lock); @@ -334,27 +352,37 @@ failed: return NULL; } -static void __ptlrpc_server_free_request(struct ptlrpc_request *req) +static void ptlrpc_server_decref(struct ptlrpc_request *req) { struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; - list_del(&req->rq_list); - - if (req->rq_reply_state != NULL) { - ptlrpc_rs_decref(req->rq_reply_state); - req->rq_reply_state = NULL; - } + if (!atomic_dec_and_test(&req->rq_refcount)) + return; if (req != &rqbd->rqbd_req) { /* NB request buffers use an embedded * req if the incoming req unlinked the * MD; this isn't one of them! */ OBD_FREE(req, sizeof(*req)); + } else { + struct ptlrpc_service *svc = rqbd->rqbd_service; + /* schedule request buffer for re-use. + * NB I can only do this after I've disposed of their + * reqs; particularly the embedded req */ + spin_lock(&svc->srv_lock); + list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds); + spin_unlock(&svc->srv_lock); } } -static void -ptlrpc_server_free_request(struct ptlrpc_request *req) +static void __ptlrpc_server_free_request(struct ptlrpc_request *req) +{ + list_del(&req->rq_list); + ptlrpc_req_drop_rs(req); + ptlrpc_server_decref(req); +} + +static void ptlrpc_server_free_request(struct ptlrpc_request *req) { struct ptlrpc_request_buffer_desc *rqbd = req->rq_rqbd; struct ptlrpc_service *svc = rqbd->rqbd_service; @@ -362,6 +390,11 @@ ptlrpc_server_free_request(struct ptlrpc_request *req) struct list_head *tmp; struct list_head *nxt; + DEBUG_REQ(D_INFO, req, "free req"); + spin_lock(&svc->srv_at_lock); + list_del_init(&req->rq_timed_list); + spin_unlock(&svc->srv_at_lock); + spin_lock(&svc->srv_lock); svc->srv_n_active_reqs--; @@ -407,11 +440,6 @@ ptlrpc_server_free_request(struct ptlrpc_request *req) } spin_lock(&svc->srv_lock); - - /* schedule request buffer for re-use. - * NB I can only do this after I've disposed of their - * reqs; particularly the embedded req */ - list_add_tail(&rqbd->rqbd_list, &svc->srv_idle_rqbds); } } else if (req->rq_reply_state && req->rq_reply_state->rs_prealloc) { /* If we are low on memory, we are not interested in @@ -421,7 +449,6 @@ ptlrpc_server_free_request(struct ptlrpc_request *req) } spin_unlock(&svc->srv_lock); - } /* This function makes sure dead exports are evicted in a timely manner. @@ -442,7 +469,7 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) at the exact right moment. Eventually, all silent exports will make it to the top of the list. */ exp->exp_last_request_time = max(exp->exp_last_request_time, - (time_t)CURRENT_SECONDS + extra_delay); + cfs_time_current_sec() + extra_delay); CDEBUG(D_INFO, "updating export %s at %ld\n", exp->exp_client_uuid.uuid, @@ -477,21 +504,21 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) /* Note - racing to start/reset the obd_eviction timer is safe */ if (exp->exp_obd->obd_eviction_timer == 0) { /* Check if the oldest entry is expired. */ - if (CURRENT_SECONDS > (oldest_time + - (3 * obd_timeout / 2) + extra_delay)) { + if (cfs_time_current_sec() > (oldest_time + PING_EVICT_TIMEOUT + + extra_delay)) { /* We need a second timer, in case the net was down and * it just came back. Since the pinger may skip every * other PING_INTERVAL (see note in ptlrpc_pinger_main), * we better wait for 3. */ - exp->exp_obd->obd_eviction_timer = CURRENT_SECONDS + - 3 * PING_INTERVAL; + exp->exp_obd->obd_eviction_timer = + cfs_time_current_sec() + 3 * PING_INTERVAL; CDEBUG(D_HA, "%s: Think about evicting %s from %ld\n", exp->exp_obd->obd_name, obd_export_nid2str(exp), oldest_time); } } else { - if (CURRENT_SECONDS > (exp->exp_obd->obd_eviction_timer + - extra_delay)) { + if (cfs_time_current_sec() > + (exp->exp_obd->obd_eviction_timer + extra_delay)) { /* The evictor won't evict anyone who we've heard from * recently, so we don't have to check before we start * it. */ @@ -503,6 +530,344 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) EXIT; } +static int ptlrpc_check_req(struct ptlrpc_request *req) +{ + if (lustre_msg_get_conn_cnt(req->rq_reqmsg) < + req->rq_export->exp_conn_cnt) { + DEBUG_REQ(D_ERROR, req, + "DROPPING req from old connection %d < %d", + lustre_msg_get_conn_cnt(req->rq_reqmsg), + req->rq_export->exp_conn_cnt); + return -EEXIST; + } + if (req->rq_export->exp_obd && req->rq_export->exp_obd->obd_fail) { + /* Failing over, don't handle any more reqs, send + error response instead. */ + CDEBUG(D_RPCTRACE, "Dropping req %p for failed obd %s\n", + req, req->rq_export->exp_obd->obd_name); + req->rq_status = -ENODEV; + ptlrpc_error(req); + return -ENODEV; + } + return 0; +} + +/* If closest exipiration is within EARLY_MIN, send early replies to everybody + expiring within EARLY_MAX, asking for AT_EXTRA time */ +#define AT_EARLY_MIN 2 /* Min time needed to send an early reply */ +#define AT_EARLY_MAX 5 /* Dont send early replies if deadline is beyond */ +#define AT_EXTRA 10 /* Early replies add this time to client timeout */ + +static void ptlrpc_at_set_timer(struct ptlrpc_service *svc) +{ + struct ptlrpc_request *rq; + time_t next; + + spin_lock(&svc->srv_at_lock); + if (list_empty(&svc->srv_at_list)) { + cfs_timer_disarm(&svc->srv_at_timer); + spin_unlock(&svc->srv_at_lock); + return; + } + + /* Set timer for closest deadline */ + rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, + rq_timed_list); + next = rq->rq_deadline - cfs_time_current_sec() - AT_EARLY_MIN; + if (next <= 0) + ptlrpc_at_timer((unsigned long)svc); + else + cfs_timer_arm(&svc->srv_at_timer, cfs_time_shift(next)); + spin_unlock(&svc->srv_at_lock); + CDEBUG(D_ADAPTTO, "armed %s at %+lds\n", svc->srv_name, next); +} + +/* Add rpc to early reply check list */ +static int ptlrpc_at_add_timed(struct ptlrpc_request *req) +{ + struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; + struct ptlrpc_request *rq; + int found = 0; + + if (AT_OFF) + return(0); + + if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) == 0) + return(-ENOSYS); + + DEBUG_REQ(D_ADAPTTO, req, "add timed %lds", + req->rq_deadline - cfs_time_current_sec()); + + spin_lock(&svc->srv_at_lock); + + LASSERT(list_empty(&req->rq_timed_list)); + /* Add to sorted list. Presumably latest rpcs will have the latest + deadlines, so search backward. */ + list_for_each_entry_reverse(rq, &svc->srv_at_list, rq_timed_list) { + if (req->rq_deadline > rq->rq_deadline) { + list_add(&req->rq_timed_list, &rq->rq_timed_list); + found++; + break; + } + } + if (!found) + /* Add to front if shortest deadline or list empty */ + list_add(&req->rq_timed_list, &svc->srv_at_list); + + /* Check if we're the head of the list */ + found = (svc->srv_at_list.next == &req->rq_timed_list); + + spin_unlock(&svc->srv_at_lock); + + if (found) + ptlrpc_at_set_timer(svc); + + return 0; +} + +static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req, + int extra_time) +{ + struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; + long deadline = req->rq_deadline - cfs_time_current_sec(); + int rc; + ENTRY; + + /* deadline is when the client expects us to reply, margin is the + difference between clients' and servers' expectations */ + DEBUG_REQ(D_ADAPTTO, req, + "%ssending early reply (deadline %+lds, margin %+lds) for " + "%d+%d", AT_OFF ? "AT off - not " : "", + deadline, deadline - at_get(&svc->srv_at_estimate), + at_get(&svc->srv_at_estimate), extra_time); + + if (AT_OFF) + RETURN(0); + + if ((lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) == 0) { + CERROR("Wanted to ask client for more time, but no AT " + "support\n"); + RETURN(-ENOSYS); + } + + rc = lustre_pack_reply_flags(req, 1, NULL, NULL, LPRFL_EARLY_REPLY); + if (rc) + /* EALREADY means final reply was already packed */ + RETURN(rc); + + if (extra_time) { + /* Fake our processing time into the future to ask the + clients for some extra amount of time */ + extra_time += cfs_time_current_sec() - + req->rq_arrival_time.tv_sec; + at_add(&svc->srv_at_estimate, extra_time); + } + + req->rq_early_count++; /* number sent, server side */ + rc = ptlrpc_send_reply(req, PTLRPC_REPLY_EARLY); + + if (!rc) { + /* Adjust our own deadline to what we told the client */ + req->rq_deadline = req->rq_arrival_time.tv_sec + + at_get(&svc->srv_at_estimate); + } else { + DEBUG_REQ(D_ERROR, req, "Early reply send failed %d", rc); + } + + /* Reset reply */ + req->rq_rep_swab_mask = 0; + /* Free the (early) reply state from lustre_pack_reply. + (ptlrpc_send_reply takes it's own rs ref, so this is safe here) */ + ptlrpc_req_drop_rs(req); + + RETURN(rc); +} + +/* Check if we need to send any early replies, and send them */ +static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) +{ + struct ptlrpc_request *rq, *n; + struct list_head work_list; + time_t now = cfs_time_current_sec(); + int first, counter = 0; + ENTRY; + + if (AT_OFF) + RETURN(0); + + spin_lock(&svc->srv_at_lock); + if (svc->srv_at_check == 0) { + spin_unlock(&svc->srv_at_lock); + RETURN(0); + } + svc->srv_at_check = 0; + + if (list_empty(&svc->srv_at_list)) { + spin_unlock(&svc->srv_at_lock); + RETURN(0); + } + + /* The timer went off, but maybe the nearest rpc already completed. */ + rq = list_entry(svc->srv_at_list.next, struct ptlrpc_request, + rq_timed_list); + first = (int)(rq->rq_deadline - now); + if (first > AT_EARLY_MIN) { + /* We've still got plenty of time. Reset the timer. */ + spin_unlock(&svc->srv_at_lock); + ptlrpc_at_set_timer(svc); + RETURN(0); + } + + /* We're close to a timeout, and we don't know how much longer the + server will take. Send early replies to everyone expiring soon. */ + CFS_INIT_LIST_HEAD(&work_list); + list_for_each_entry_safe(rq, n, &svc->srv_at_list, rq_timed_list) { + if (rq->rq_deadline <= now + AT_EARLY_MAX) { + list_move(&rq->rq_timed_list, &work_list); + counter++; + } else { + break; + } + } + + spin_unlock(&svc->srv_at_lock); + + CDEBUG(D_ADAPTTO, "timeout in %+ds, asking for %d secs on %d early " + "replies\n", first, AT_EXTRA, counter); + if (first < 0) + /* We're already past request deadlines before we even get a + chance to send early replies */ + LCONSOLE_WARN("%s: This server is not able to keep up with " + "request traffic (cpu-bound).\n", svc->srv_name); + + /* ptlrpc_server_free_request may delete an entry out of the work + list */ + counter = 0; + spin_lock(&svc->srv_at_lock); + while (!list_empty(&work_list)) { + rq = list_entry(work_list.next, struct ptlrpc_request, + rq_timed_list); + list_del_init(&rq->rq_timed_list); + /* if the entry is still in the worklist, it hasn't been + deleted, and is safe to take a ref to keep the req around */ + atomic_inc(&rq->rq_refcount); + spin_unlock(&svc->srv_at_lock); + if (!rq->rq_final && + (ptlrpc_at_send_early_reply(rq, AT_EXTRA) == 0)) { + counter++; + ptlrpc_at_add_timed(rq); + } else if (rq->rq_final) { + DEBUG_REQ(D_ADAPTTO, rq, "already packed final reply, " + "not sending early"); + } + ptlrpc_server_decref(rq); + spin_lock(&svc->srv_at_lock); + } + spin_unlock(&svc->srv_at_lock); + + if (!counter) + /* Nothing added to timed list, so we have to kick the timer + ourselves. */ + ptlrpc_at_set_timer(svc); + + RETURN(0); +} + +/* Handle freshly incoming reqs, check timeout, send early reply if needed, + pass on to regular request queue */ +static int +ptlrpc_server_handle_req_in(struct ptlrpc_service *svc) +{ + struct ptlrpc_request *req; + __u32 deadline; + int rc; + ENTRY; + + LASSERT(svc); + + spin_lock(&svc->srv_lock); + if (list_empty(&svc->srv_req_in_queue)) { + spin_unlock(&svc->srv_lock); + RETURN(0); + } + + req = list_entry(svc->srv_req_in_queue.next, + struct ptlrpc_request, rq_list); + list_del_init (&req->rq_list); + /* Consider this still a "queued" request as far as stats are + concerned */ + spin_unlock(&svc->srv_lock); + +#if SWAB_PARANOIA + /* Clear request swab mask; this is a new request */ + req->rq_req_swab_mask = 0; +#endif + rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen); + if (rc != 0) { + CERROR ("error unpacking request: ptl %d from %s" + " xid "LPU64"\n", svc->srv_req_portal, + libcfs_id2str(req->rq_peer), req->rq_xid); + goto err_req; + } + + rc = lustre_unpack_req_ptlrpc_body(req, MSG_PTLRPC_BODY_OFF); + if (rc) { + CERROR ("error unpacking ptlrpc body: ptl %d from %s" + " xid "LPU64"\n", svc->srv_req_portal, + libcfs_id2str(req->rq_peer), req->rq_xid); + goto err_req; + } + + rc = -EINVAL; + if (lustre_msg_get_type(req->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { + CERROR("wrong packet type received (type=%u) from %s\n", + lustre_msg_get_type(req->rq_reqmsg), + libcfs_id2str(req->rq_peer)); + goto err_req; + } + + CDEBUG(D_NET, "got req "LPD64"\n", req->rq_xid); + + req->rq_export = class_conn2export( + lustre_msg_get_handle(req->rq_reqmsg)); + if (req->rq_export) { + rc = ptlrpc_check_req(req); + class_export_put(req->rq_export); + if (rc) + goto err_req; + } + + /* req_in handling should/must be fast */ + if (cfs_time_current_sec() - req->rq_arrival_time.tv_sec > 5) + DEBUG_REQ(D_WARNING, req, "Slow req_in handling %lus", + cfs_time_current_sec() - req->rq_arrival_time.tv_sec); + + /* Set rpc server deadline and add it to the timed list */ + deadline = (lustre_msg_get_flags(req->rq_reqmsg) & MSG_AT_SUPPORT) ? + /* The max time the client expects us to take */ + lustre_msg_get_timeout(req->rq_reqmsg) : obd_timeout; + LASSERT(deadline > 0); + req->rq_deadline = req->rq_arrival_time.tv_sec + deadline; + + ptlrpc_at_add_timed(req); + + /* Move it over to the request processing queue */ + spin_lock(&svc->srv_lock); + list_add_tail(&req->rq_list, &svc->srv_request_queue); + cfs_waitq_signal(&svc->srv_waitq); + spin_unlock(&svc->srv_lock); + RETURN(1); + +err_req: + spin_lock(&svc->srv_lock); + svc->srv_n_queued_reqs--; + svc->srv_n_active_reqs++; + spin_unlock(&svc->srv_lock); + ptlrpc_server_free_request(req); + + RETURN(1); +} + static int ptlrpc_server_handle_request(struct ptlrpc_service *svc, struct ptlrpc_thread *thread) @@ -512,18 +877,23 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, struct timeval work_start; struct timeval work_end; long timediff; - int rc, reply; + int rc; ENTRY; LASSERT(svc); spin_lock(&svc->srv_lock); if (list_empty (&svc->srv_request_queue) || - (svc->srv_n_difficult_replies != 0 && + ( +#ifndef __KERNEL__ + /* !@%$# liblustre only has 1 thread */ + svc->srv_n_difficult_replies != 0 && +#endif svc->srv_n_active_reqs >= (svc->srv_threads_running - 1))) { - /* If all the other threads are handling requests, I must - * remain free to handle any 'difficult' reply that might - * block them */ + /* Don't handle regular requests in the last thread, in order * remain free to handle any 'difficult' replies (that might + * to handle difficult replies (which might block other threads) + * as well as handle any incoming reqs, early replies, etc. + * That means we always need at least 2 service threads. */ spin_unlock(&svc->srv_lock); RETURN(0); } @@ -545,78 +915,23 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, svc->srv_n_queued_reqs); lprocfs_counter_add(svc->srv_stats, PTLRPC_REQACTIVE_CNTR, svc->srv_n_active_reqs); + lprocfs_counter_add(svc->srv_stats, PTLRPC_TIMEOUT, + at_get(&svc->srv_at_estimate)); } - -#if SWAB_PARANOIA - /* Clear request swab mask; this is a new request */ - request->rq_req_swab_mask = 0; -#endif - rc = lustre_unpack_msg(request->rq_reqmsg, request->rq_reqlen); - if (rc != 0) { - CERROR ("error unpacking request: ptl %d from %s" - " xid "LPU64"\n", svc->srv_req_portal, - libcfs_id2str(request->rq_peer), request->rq_xid); - goto out_req; - } - - rc = lustre_unpack_req_ptlrpc_body(request, MSG_PTLRPC_BODY_OFF); - if (rc) { - CERROR ("error unpacking ptlrpc body: ptl %d from %s" - " xid "LPU64"\n", svc->srv_req_portal, - libcfs_id2str(request->rq_peer), request->rq_xid); - goto out_req; - } - - rc = -EINVAL; - if (lustre_msg_get_type(request->rq_reqmsg) != PTL_RPC_MSG_REQUEST) { - CERROR("wrong packet type received (type=%u) from %s\n", - lustre_msg_get_type(request->rq_reqmsg), - libcfs_id2str(request->rq_peer)); - goto out_req; - } - + CDEBUG(D_NET, "got req "LPD64"\n", request->rq_xid); - + request->rq_svc_thread = thread; request->rq_export = class_conn2export( lustre_msg_get_handle(request->rq_reqmsg)); if (request->rq_export) { - if (lustre_msg_get_conn_cnt(request->rq_reqmsg) < - request->rq_export->exp_conn_cnt) { - DEBUG_REQ(D_ERROR, request, - "DROPPING req from old connection %d < %d", - lustre_msg_get_conn_cnt(request->rq_reqmsg), - request->rq_export->exp_conn_cnt); - goto put_conn; - } - if (request->rq_export->exp_obd && - request->rq_export->exp_obd->obd_fail) { - /* Failing over, don't handle any more reqs, send - error response instead. */ - CDEBUG(D_RPCTRACE,"Dropping req %p for failed obd %s\n", - request, request->rq_export->exp_obd->obd_name); - request->rq_status = -ENODEV; - ptlrpc_error(request); + if (ptlrpc_check_req(request)) goto put_conn; - } - - ptlrpc_update_export_timer(request->rq_export, timediff/500000); + ptlrpc_update_export_timer(request->rq_export, timediff >> 19); export = class_export_rpc_get(request->rq_export); } - /* Discard requests queued for longer than my timeout. If the - * client's timeout is similar to mine, she'll be timing out this - * REQ anyway (bug 1502) */ - if (timediff / 1000000 > (long)obd_timeout) { - CERROR("Dropping timed-out opc %d request from %s" - ": %ld seconds old\n", - lustre_msg_get_opc(request->rq_reqmsg), - libcfs_id2str(request->rq_peer), - timediff / 1000000); - goto put_rpc_export; - } - request->rq_phase = RQ_PHASE_INTERPRET; CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc " @@ -629,8 +944,10 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, libcfs_id2str(request->rq_peer), lustre_msg_get_opc(request->rq_reqmsg)); - rc = svc->srv_handler(request); + OBD_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_PAUSE_REQ, obd_fail_val); + rc = svc->srv_handler(request); + request->rq_phase = RQ_PHASE_COMPLETE; CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc " @@ -643,7 +960,6 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, libcfs_id2str(request->rq_peer), lustre_msg_get_opc(request->rq_reqmsg)); -put_rpc_export: if (export != NULL) class_export_rpc_put(export); @@ -651,31 +967,23 @@ put_conn: if (request->rq_export != NULL) class_export_put(request->rq_export); - reply = request->rq_reply_state && request->rq_repmsg; /* bug 11169 */ + if (cfs_time_current_sec() > request->rq_deadline) { + DEBUG_REQ(D_WARNING, request, "Request took longer than" + " estimated (%+lds); client may timeout.", + cfs_time_current_sec() - request->rq_deadline); + } do_gettimeofday(&work_end); timediff = cfs_timeval_sub(&work_end, &work_start, NULL); - if (timediff / 1000000 > (long)obd_timeout) - CERROR("request "LPU64" opc %u from %s processed in %lds " - "trans "LPU64" rc %d/%d\n", - request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg), - libcfs_id2str(request->rq_peer), - cfs_timeval_sub(&work_end, &request->rq_arrival_time, - NULL) / 1000000, - reply ? lustre_msg_get_transno(request->rq_repmsg) : - request->rq_transno, - request->rq_status, - reply ? lustre_msg_get_status(request->rq_repmsg): -999); - else - CDEBUG(D_RPCTRACE,"request "LPU64" opc %u from %s processed in " - "%ldus (%ldus total) trans "LPU64" rc %d/%d\n", - request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg), - libcfs_id2str(request->rq_peer), timediff, - cfs_timeval_sub(&work_end, &request->rq_arrival_time, - NULL), - request->rq_transno, request->rq_status, - reply ? lustre_msg_get_status(request->rq_repmsg): -999); - + CDEBUG(D_RPCTRACE, "request "LPU64" opc %u from %s processed in " + "%ldus (%ldus total) trans "LPU64" rc %d/%d\n", + request->rq_xid, lustre_msg_get_opc(request->rq_reqmsg), + libcfs_id2str(request->rq_peer), timediff, + cfs_timeval_sub(&work_end, &request->rq_arrival_time, NULL), + request->rq_repmsg ? lustre_msg_get_transno(request->rq_repmsg) : + request->rq_transno, request->rq_status, + request->rq_repmsg ? lustre_msg_get_status(request->rq_repmsg): + -999); if (svc->srv_stats != NULL) { int opc = opcode_offset(lustre_msg_get_opc(request->rq_reqmsg)); if (opc > 0) { @@ -685,8 +993,13 @@ put_conn: timediff); } } + if (request->rq_early_count) { + DEBUG_REQ(D_ADAPTTO, request, + "sent %d early replies before finishing in %lds", + request->rq_early_count, + work_end.tv_sec - request->rq_arrival_time.tv_sec); + } -out_req: ptlrpc_server_free_request(request); RETURN(1); @@ -813,7 +1126,9 @@ liblustre_check_services (void *arg) svc->srv_threads_running++; do { - rc = ptlrpc_server_handle_reply(svc); + rc = ptlrpc_server_handle_req_in(svc); + rc |= ptlrpc_server_handle_reply(svc); + rc |= ptlrpc_at_check_timed(svc); rc |= ptlrpc_server_handle_request(svc, NULL); rc |= (ptlrpc_server_post_idle_rqbds(svc) > 0); did_something |= rc; @@ -882,7 +1197,7 @@ static int ptlrpc_main(void *arg) #ifdef WITH_GROUP_INFO struct group_info *ginfo = NULL; #endif - int rc = 0; + int counter, rc = 0; ENTRY; ptlrpc_daemonize(data->name); @@ -936,7 +1251,9 @@ static int ptlrpc_main(void *arg) */ cfs_waitq_signal(&thread->t_ctl_waitq); - watchdog = lc_watchdog_add(svc->srv_watchdog_timeout, NULL, NULL); + watchdog = lc_watchdog_add((AT_OFF ? obd_timeout : + at_get(&svc->srv_at_estimate)) * + svc->srv_watchdog_factor, NULL, NULL); spin_lock(&svc->srv_lock); svc->srv_threads_running++; @@ -964,14 +1281,17 @@ static int ptlrpc_main(void *arg) svc->srv_n_difficult_replies == 0) || (!list_empty(&svc->srv_idle_rqbds) && svc->srv_rqbd_timeout == 0) || - !list_empty (&svc->srv_reply_queue) || - (!list_empty (&svc->srv_request_queue) && - (svc->srv_n_difficult_replies == 0 || - svc->srv_n_active_reqs < - (svc->srv_threads_running - 1))), + !list_empty(&svc->srv_req_in_queue) || + !list_empty(&svc->srv_reply_queue) || + (!list_empty(&svc->srv_request_queue) && + (svc->srv_n_active_reqs < + (svc->srv_threads_running - 1))) || + svc->srv_at_check, &lwi); - lc_watchdog_touch(watchdog); + lc_watchdog_touch_ms(watchdog, (AT_OFF ? obd_timeout : + at_get(&svc->srv_at_estimate)) * + svc->srv_watchdog_factor); ptlrpc_check_rqbd_pool(svc); @@ -981,15 +1301,24 @@ static int ptlrpc_main(void *arg) ptlrpc_start_thread(dev, svc); } - if (!list_empty (&svc->srv_reply_queue)) + if (!list_empty(&svc->srv_reply_queue)) ptlrpc_server_handle_reply (svc); - /* only handle requests if there are no difficult replies - * outstanding, or I'm not the last thread handling - * requests */ + counter = 0; + while(!list_empty(&svc->srv_req_in_queue)) { + /* Process all incoming reqs before handling any */ + ptlrpc_server_handle_req_in(svc); + /* but limit ourselves in case of flood */ + if (counter++ > 1000) + break; + } + + if (svc->srv_at_check) + ptlrpc_at_check_timed(svc); + + /* don't handle requests in the last thread */ if (!list_empty (&svc->srv_request_queue) && - (svc->srv_n_difficult_replies == 0 || - svc->srv_n_active_reqs < (svc->srv_threads_running - 1))) + (svc->srv_n_active_reqs < (svc->srv_threads_running - 1))) ptlrpc_server_handle_request(svc, thread); if (!list_empty(&svc->srv_idle_rqbds) && @@ -1068,7 +1397,9 @@ int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc) int i, rc = 0; ENTRY; - LASSERT(svc->srv_threads_min > 0); + /* We require 2 threads min - see note in + ptlrpc_server_handle_request */ + LASSERT(svc->srv_threads_min >= 2); for (i = 0; i < svc->srv_threads_min; i++) { rc = ptlrpc_start_thread(dev, svc); if (rc) { @@ -1149,6 +1480,8 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) struct list_head *tmp; struct ptlrpc_reply_state *rs, *t; + cfs_timer_disarm(&service->srv_at_timer); + ptlrpc_stop_all_threads(service); LASSERT(list_empty(&service->srv_threads)); @@ -1190,7 +1523,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT(cfs_time_seconds(300), NULL, NULL); + lwi = LWI_TIMEOUT(cfs_time_seconds(FOREVER), NULL, NULL); rc = l_wait_event(service->srv_waitq, service->srv_nrqbd_receiving == 0, &lwi); @@ -1212,6 +1545,17 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) /* purge the request queue. NB No new replies (rqbds all unlinked) * and no service threads, so I'm the only thread noodling the * request queue now */ + while (!list_empty(&service->srv_req_in_queue)) { + struct ptlrpc_request *req = + list_entry(service->srv_req_in_queue.next, + struct ptlrpc_request, + rq_list); + + list_del(&req->rq_list); + service->srv_n_queued_reqs--; + service->srv_n_active_reqs++; + ptlrpc_server_free_request(req); + } while (!list_empty(&service->srv_request_queue)) { struct ptlrpc_request *req = list_entry(service->srv_request_queue.next, @@ -1221,7 +1565,6 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) list_del(&req->rq_list); service->srv_n_queued_reqs--; service->srv_n_active_reqs++; - ptlrpc_server_free_request(req); } LASSERT(service->srv_n_queued_reqs == 0); @@ -1261,6 +1604,9 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) OBD_FREE(rs, service->srv_max_reply_size); } + /* In case somebody rearmed this in the meantime */ + cfs_timer_disarm(&service->srv_at_timer); + OBD_FREE(service, sizeof(*service)); return 0; } @@ -1274,31 +1620,31 @@ int ptlrpc_service_health_check(struct ptlrpc_service *svc) { struct ptlrpc_request *request; struct timeval right_now; - long timediff, cutoff; - int rc = 0; + long timediff; if (svc == NULL) return 0; - spin_lock(&svc->srv_lock); - - if (list_empty(&svc->srv_request_queue)) - goto out; + do_gettimeofday(&right_now); + spin_lock(&svc->srv_lock); + if (list_empty(&svc->srv_request_queue)) { + spin_unlock(&svc->srv_lock); + return 0; + } + + /* How long has the next entry been waiting? */ request = list_entry(svc->srv_request_queue.next, struct ptlrpc_request, rq_list); - - do_gettimeofday(&right_now); timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL); + spin_unlock(&svc->srv_lock); - cutoff = obd_health_check_timeout; - - if (timediff / 1000000 > cutoff) { - rc = -1; - goto out; + if ((timediff / ONE_MILLION) > (AT_OFF ? obd_timeout * 3/2 : + adaptive_timeout_max)) { + CERROR("%s: unhealthy - request has been waiting %lds\n", + svc->srv_name, timediff / ONE_MILLION); + return (-1); } - out: - spin_unlock(&svc->srv_lock); - return rc; + return 0; } diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index c29cb6f21e..f9ca561f6f 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -12,7 +12,7 @@ void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' * (make -C lustre/utils newwirecheck) - * running on Linux pancake 2.6.18-skas3-v9-pre9 #2 Tue Oct 17 13:08:24 PDT 2006 i686 i686 i3 + * running on Linux pancake 2.6.18-skas3-v9-pre9 #1 Tue Feb 20 10:37:58 PST 2007 i686 i686 i3 * with gcc version 3.4.4 */ @@ -35,6 +35,8 @@ void lustre_assert_wire_constants(void) (long long)MSG_RESENT); LASSERTF(MSG_REPLAY == 4, " found %lld\n", (long long)MSG_REPLAY); + LASSERTF(MSG_AT_SUPPORT == 8, " found %lld\n", + (long long)MSG_AT_SUPPORT); LASSERTF(MSG_CONNECT_RECOVERING == 1, " found %lld\n", (long long)MSG_CONNECT_RECOVERING); LASSERTF(MSG_CONNECT_RECONNECT == 2, " found %lld\n", @@ -298,10 +300,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n", (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n", (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1)); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n", @@ -371,26 +373,26 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n", + LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 76, " found %lld\n", (long long)(int)offsetof(struct ptlrpc_body, pb_padding_1)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n", + LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 80, " found %lld\n", (long long)(int)offsetof(struct ptlrpc_body, pb_padding_2)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 76, " found %lld\n", + LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 84, " found %lld\n", (long long)(int)offsetof(struct ptlrpc_body, pb_padding_3)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_4) == 80, " found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body, pb_padding_4)); - LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_5) == 84, " found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body, pb_padding_5)); - LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5)); /* Checks for struct obd_connect_data */ LASSERTF((int)sizeof(struct obd_connect_data) == 72, " found %lld\n", @@ -465,7 +467,9 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_CONNECT_FID_CAPA == 0x100000ULL); CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL); CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL); - + CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL); + CLASSERT(OBD_CONNECT_AT == 0x01000000ULL); + /* Checks for struct obdo */ LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n", (long long)(int)sizeof(struct obdo)); diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index c5aedced85..34214c180f 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -15,8 +15,8 @@ ONLY=${ONLY:-"$*"} # xml xml xml xml xml xml dumb FIXME MOUNTCONFSKIP="10 11 12 13 13b 14 15 18" -# bug number for skipped test: -ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT $MOUNTCONFSKIP" +# bug number for skipped test: 13369 +ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT $MOUNTCONFSKIP 34a" # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! SRCDIR=`dirname $0` @@ -1066,7 +1066,7 @@ test_31() { # bug 10734 mount -t lustre 4.3.2.1@tcp:/lustre $MOUNT || true cleanup } -run_test 31 "Connect to non-existent node (shouldn't crash)" +run_test 31 "Connect to non-existent node (returns errors, should not crash)" test_32a() { # XXX - make this run on client-only systems with real hardware on @@ -1108,6 +1108,7 @@ test_32a() { load_modules # mount a second time to make sure we didnt leave upgrade flag on + load_modules $TUNEFS --dryrun $TMP/$tdir/mds || error "tunefs failed" load_modules start mds $TMP/$tdir/mds "-o loop,exclude=lustre-OST0000" || return 12 @@ -1128,32 +1129,36 @@ test_32b() { unzip -o -j -d $TMP/$tdir disk1_4.zip || { skip "Cant unzip disk1_4, skipping" && return ; } load_modules sysctl lnet.debug=$PTLDEBUG + NEWNAME=sofia # writeconf will cause servers to register with their current nids - $TUNEFS --writeconf $TMP/$tdir/mds || error "tunefs failed" + $TUNEFS --writeconf --fsname=$NEWNAME $TMP/$tdir/mds || error "tunefs failed" start mds $TMP/$tdir/mds "-o loop" || return 3 - local UUID=$(cat $LPROC/mds/lustre-MDT0000/uuid) + local UUID=$(cat $LPROC/mds/${NEWNAME}-MDT0000/uuid) echo MDS uuid $UUID [ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID" - $TUNEFS --mgsnode=`hostname` $TMP/$tdir/ost1 || error "tunefs failed" + $TUNEFS --mgsnode=`hostname` --fsname=$NEWNAME --writeconf $TMP/$tdir/ost1 || error "tunefs failed" start ost1 $TMP/$tdir/ost1 "-o loop" || return 5 - UUID=$(cat $LPROC/obdfilter/lustre-OST0000/uuid) + UUID=$(cat $LPROC/obdfilter/${NEWNAME}-OST0000/uuid) echo OST uuid $UUID - [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID" + [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID" echo "OSC changes should succeed:" - $LCTL conf_param lustre-OST0000.osc.max_dirty_mb=15 || return 7 - $LCTL conf_param lustre-OST0000.failover.node=$NID || return 8 + $LCTL conf_param ${NEWNAME}-OST0000.osc.max_dirty_mb=15 || return 7 + $LCTL conf_param ${NEWNAME}-OST0000.failover.node=$NID || return 8 echo "ok." echo "MDC changes should succeed:" - $LCTL conf_param lustre-MDT0000.mdc.max_rpcs_in_flight=9 || return 9 + $LCTL conf_param ${NEWNAME}-MDT0000.mdc.max_rpcs_in_flight=9 || return 9 echo "ok." # MDT and OST should have registered with new nids, so we should have # a fully-functioning client echo "Check client and old fs contents" + OLDFS=$FSNAME + FSNAME=$NEWNAME mount_client $MOUNT + FSNAME=$OLDFS set_and_check client "cat $LPROC/mdc/*/max_rpcs_in_flight" "lustre-MDT0000.mdc.max_rpcs_in_flight" || return 11 [ "$(cksum $MOUNT/passwd | cut -d' ' -f 1,2)" == "2479747619 779" ] || return 12 echo "ok." diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 89c2acb0b5..f927fb683a 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -5,7 +5,7 @@ set -e # bug 5493 ALWAYS_EXCEPT="52 $RECOVERY_SMALL_EXCEPT" -PTLDEBUG=${PTLDEBUG:--1} +#PTLDEBUG=${PTLDEBUG:--1} LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ @@ -253,8 +253,9 @@ test_18b() { do_facet client cp $SAMPLE_FILE $f sync ost_evict_client - # allow recovery to complete - sleep $((TIMEOUT + 2)) + # force reconnect + df $MOUNT > /dev/null 2>&1 + sleep 2 # my understanding is that there should be nothing in the page # cache after the client reconnects? rc=0 @@ -593,12 +594,13 @@ test_26b() { # bug 10140 - evict dead exports by pinger OST_FILE=$LPROC/obdfilter/${ost1_svc}/num_exports OST_NEXP1="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`" echo starting with $OST_NEXP1 OST and $MDS_NEXP1 MDS exports + #force umount a client; exports should get evicted zconf_umount `hostname` $MOUNT2 -f - # evictor takes up to 2.25x to evict. But if there's a - # race to start the evictor from various obds, the loser - # might have to wait for the next ping. - echo Waiting for $(($TIMEOUT * 4)) secs - sleep $(($TIMEOUT * 4)) + # evictor takes PING_EVICT_TIMEOUT + 3 * PING_INTERVAL to evict. + # But if there's a race to start the evictor from various obds, + # the loser might have to wait for the next ping. + echo Waiting for $(($TIMEOUT * 8)) secs + sleep $(($TIMEOUT * 8)) OST_NEXP2="`do_facet ost1 cat $OST_FILE | cut -d' ' -f2`" MDS_NEXP2="`do_facet mds cat $MDS_FILE | cut -d' ' -f2`" echo ending with $OST_NEXP2 OST and $MDS_NEXP2 MDS exports @@ -646,25 +648,38 @@ run_test 28 "handle error adding new clients (bug 6086)" test_50() { mkdir -p $DIR/$tdir + debugsave + sysctl -w lnet.debug="-dlmtrace -ha" # put a load of file creates/writes/deletes - writemany -q $DIR/$tdir/$tfile 0 5 & + writemany -a -q $DIR/$tdir/$tfile 0 5 & CLIENT_PID=$! echo writemany pid $CLIENT_PID sleep 10 + ps $CLIENT_PID > /dev/null || error "Writemany died 1.1" FAILURE_MODE="SOFT" + $LCTL mark "$TESTNAME fail mds 1" fail mds + ps $CLIENT_PID > /dev/null || error "Writemany died 2.1" # wait for client to reconnect to MDS sleep 60 + ps $CLIENT_PID > /dev/null || error "Writemany died 2.2" + $LCTL mark "$TESTNAME fail mds 2" fail mds + ps $CLIENT_PID > /dev/null || error "Writemany died 3.1" sleep 60 + ps $CLIENT_PID > /dev/null || error "Writemany died 3.2" + $LCTL mark "$TESTNAME fail mds 3" fail mds # client process should see no problems even though MDS went down + ps $CLIENT_PID > /dev/null || error "Writemany died 4.1" sleep $TIMEOUT + ps $CLIENT_PID > /dev/null || error "Writemany died 4.2" kill -USR1 $CLIENT_PID wait $CLIENT_PID rc=$? echo writemany returned $rc #these may fail because of eviction due to slow AST response. + debugrestore return $rc } run_test 50 "failover MDS under load" @@ -672,7 +687,7 @@ run_test 50 "failover MDS under load" test_51() { mkdir -p $DIR/$tdir # put a load of file creates/writes/deletes - writemany -q $DIR/$tdir/$tfile 0 5 & + writemany -a -q $DIR/$tdir/$tfile 0 5 & CLIENT_PID=$! sleep 1 FAILURE_MODE="SOFT" @@ -683,7 +698,10 @@ test_51() { for i in $SEQ do echo failover in $i sec + ps $CLIENT_PID > /dev/null || error "Writemany died $i.1" sleep $i + ps $CLIENT_PID > /dev/null || error "Writemany died $i.1" + $LCTL mark "$TESTNAME fail mds $i" facet_failover mds done # client process should see no problems even though MDS went down @@ -703,6 +721,7 @@ test_52_guts() { echo writemany pid $CLIENT_PID sleep 10 FAILURE_MODE="SOFT" + $LCTL mark "$TESTNAME fail ost $1" fail ost1 rc=0 wait $CLIENT_PID || rc=$? @@ -716,16 +735,16 @@ test_52_guts() { test_52() { mkdir -p $DIR/$tdir - test_52_guts + test_52_guts 1 rc=$? [ $rc -ne 0 ] && { return $rc; } # wait for client to reconnect to OST sleep 30 - test_52_guts + test_52_guts 2 rc=$? [ $rc -ne 0 ] && { return $rc; } sleep 30 - test_52_guts + test_52_guts 3 rc=$? client_reconnect #return $rc diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index ab25f6b3cd..8fa5dc1c47 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -204,7 +204,7 @@ test_12() { #define OBD_FAIL_LDLM_ENQUEUE 0x302 do_facet mds sysctl -w lustre.fail_loc=0x80000302 facet_failover mds - df $MOUNT || return 1 + df $MOUNT || { kill -USR1 $MULTIPID && return 1; } do_facet mds sysctl -w lustre.fail_loc=0 ls $DIR/$tfile diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index da1b1d3d9b..5ddab23917 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -916,13 +916,19 @@ run_test 43 "mds osc import failure during recovery; don't LBUG" test_44() { mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices` [ "$mdcdev" ] || exit 2 + # adaptive timeouts slow this way down + MDS_AT_MAX=$(do_facet mds "sysctl -n lustre.adaptive_max") + do_facet mds "sysctl -w lustre.adaptive_max=40" for i in `seq 1 10`; do + echo "$i of 10 ($(date +%s))" + do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts" #define OBD_FAIL_TGT_CONN_RACE 0x701 do_facet mds "sysctl -w lustre.fail_loc=0x80000701" $LCTL --device $mdcdev recover df $MOUNT done do_facet mds "sysctl -w lustre.fail_loc=0" + do_facet mds "sysctl -w lustre.adaptive_max=$MDS_AT_MAX" return 0 } run_test 44 "race in target handle connect" @@ -931,6 +937,8 @@ test_44b() { mdcdev=`awk '/-mdc-/ {print $1}' $LPROC/devices` [ "$mdcdev" ] || exit 2 for i in `seq 1 10`; do + echo "$i of 10 ($(date +%s))" + do_facet mds "grep service $LPROC/mdt/MDS/mds/timeouts" #define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 do_facet mds "sysctl -w lustre.fail_loc=0x80000704" $LCTL --device $mdcdev recover @@ -1130,7 +1138,7 @@ test_60() { run_test 60 "test llog post recovery init vs llog unlink" #test race llog recovery thread vs llog cleanup -test_61() { +test_61a() { mkdir $DIR/$tdir createmany -o $DIR/$tdir/$tfile-%d 800 replay_barrier ost1 @@ -1145,7 +1153,7 @@ test_61() { $CHECKSTAT -t file $DIR/$tdir/$tfile-* && return 1 rmdir $DIR/$tdir } -run_test 61 "test race llog recovery vs llog cleanup" +run_test 61a "test race llog recovery vs llog cleanup" #test race mds llog sync vs llog cleanup test_61b() { @@ -1169,6 +1177,138 @@ test_61c() { } run_test 61c "test race mds llog sync vs llog cleanup" + +at_start() #bug 3055 +{ + if [ -z "$ATOLDBASE" ]; then + ATOLDBASE=$(do_facet mds "grep timebase $LPROC/mdt/MDS/mds/timeouts" | awk '{print $3}' ) + # speed up the timebase so we can check decreasing AT + do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds/timeouts" + do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_readpage/timeouts" + do_facet mds "echo 8 >> $LPROC/mdt/MDS/mds_setattr/timeouts" + do_facet ost1 "echo 8 >> $LPROC/ost/OSS/ost/timeouts" + fi +} + +test_65() #bug 3055 +{ + at_start + $LCTL dk > /dev/null + # slow down a request + sysctl -w lustre.fail_val=30000 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a + sysctl -w lustre.fail_loc=0x8000050a + createmany -o $DIR/$tfile 10 > /dev/null + unlinkmany $DIR/$tfile 10 > /dev/null + # check for log message + $LCTL dk | grep "Early reply #" || error "No early reply" + # client should show 30s timeouts + grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + sleep 9 + grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts +} +run_test 65 "AT: verify early replies" + +test_66a() #bug 3055 +{ + at_start + grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + # adjust 5s at a time so no early reply is sent (within deadline) + do_facet mds "sysctl -w lustre.fail_val=5000" +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a + do_facet mds "sysctl -w lustre.fail_loc=0x8000050a" + createmany -o $DIR/$tfile 20 > /dev/null + unlinkmany $DIR/$tfile 20 > /dev/null + grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + do_facet mds "sysctl -w lustre.fail_val=10000" + do_facet mds "sysctl -w lustre.fail_loc=0x8000050a" + createmany -o $DIR/$tfile 20 > /dev/null + unlinkmany $DIR/$tfile 20 > /dev/null + grep "portal 12" $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts + do_facet mds "sysctl -w lustre.fail_loc=0" + sleep 9 + createmany -o $DIR/$tfile 20 > /dev/null + unlinkmany $DIR/$tfile 20 > /dev/null + grep portal $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts | grep "portal 12" + CUR=$(awk '/portal 12/ {print $5}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts) + WORST=$(awk '/portal 12/ {print $7}' $LPROC/mdc/${FSNAME}-MDT0000-mdc-*/timeouts) + echo "Current MDT timeout $CUR, worst $WORST" + [ $CUR -lt $WORST ] || error "Current $CUR should be less than worst $WORST" +} +run_test 66a "AT: verify MDT service time adjusts with no early replies" + +test_66b() #bug 3055 +{ + at_start + ORIG=$(awk '/network/ {print $4}' $LPROC/mdc/lustre-*/timeouts) + sysctl -w lustre.fail_val=$(($ORIG + 5)) +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c + sysctl -w lustre.fail_loc=0x50c + ls $DIR/$tfile > /dev/null 2>&1 + sysctl -w lustre.fail_loc=0 + CUR=$(awk '/network/ {print $4}' $LPROC/mdc/${FSNAME}-*/timeouts) + WORST=$(awk '/network/ {print $6}' $LPROC/mdc/${FSNAME}-*/timeouts) + echo "network timeout orig $ORIG, cur $CUR, worst $WORST" + [ $WORST -gt $ORIG ] || error "Worst $WORST should be worse than orig $ORIG" +} +run_test 66b "AT: verify net latency adjusts" + +test_67a() #bug 3055 +{ + at_start + CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + # sleeping threads may drive values above this + do_facet ost1 "sysctl -w lustre.fail_val=400" +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a + do_facet ost1 "sysctl -w lustre.fail_loc=0x50a" + createmany -o $DIR/$tfile 20 > /dev/null + unlinkmany $DIR/$tfile 20 > /dev/null + do_facet ost1 "sysctl -w lustre.fail_loc=0" + CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + ATTEMPTS=$(($CONN2 - $CONN1)) + echo "$ATTEMPTS osc reconnect attemps on gradual slow" + [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect" + return 0 +} +run_test 67a "AT: verify slow request processing doesn't induce reconnects" + +test_67b() #bug 3055 +{ + at_start + CONN1=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 + do_facet ost1 "sysctl -w lustre.fail_val=20000" + do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223" + cp /etc/profile $DIR/$tfile || error "cp failed" + client_reconnect + cat $LPROC/ost/OSS/ost_create/timeouts + log "phase 2" + CONN2=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + ATTEMPTS=$(($CONN2 - $CONN1)) + echo "$ATTEMPTS osc reconnect attemps on instant slow" + # do it again; should not timeout + do_facet ost1 "sysctl -w lustre.fail_loc=0x80000223" + cp /etc/profile $DIR/$tfile || error "cp failed" + do_facet ost1 "sysctl -w lustre.fail_loc=0" + client_reconnect + cat $LPROC/ost/OSS/ost_create/timeouts + CONN3=$(awk '/_connect/ {total+=$2} END {print total}' $LPROC/osc/*/stats) + ATTEMPTS=$(($CONN3 - $CONN2)) + echo "$ATTEMPTS osc reconnect attemps on 2nd slow" + [ $ATTEMPTS -gt 0 ] && error "AT should have prevented reconnect" + return 0 +} +run_test 67b "AT: verify instant slowdown doesn't induce reconnects" + +if [ -n "$ATOLDBASE" ]; then + do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds/timeouts" + do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_readpage/timeouts" + do_facet mds "echo $ATOLDBASE >> $LPROC/mdt/MDS/mds_setattr/timeouts" + do_facet ost1 "echo $ATOLDBASE >> $LPROC/ost/OSS/ost/timeouts" +fi +# end of AT tests includes above lines + + equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index f09d249507..2a7c03aa15 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -783,7 +783,7 @@ test_27a() { $SETSTRIPE $DIR/d27/f0 65536 0 1 || error "lstripe failed" $CHECKSTAT -t file $DIR/d27/f0 || error "checkstat failed" pass - log "== test_27b: write to one stripe file =========================" + log "== test 27b: write to one stripe file =========================" cp /etc/hosts $DIR/d27/f0 || error } run_test 27a "one stripe file ==================================" @@ -795,7 +795,7 @@ test_27c() { [ `$GETSTRIPE $DIR/d27/f01 | grep -A 10 obdidx | wc -l` -eq 4 ] || error "two-stripe file doesn't have two stripes" pass - log "== test_27d: write to two stripe file file f01 ================" + log "== test 27d: write to two stripe file file f01 ================" dd if=/dev/zero of=$DIR/d27/f01 bs=4k count=4 || error "dd failed" } run_test 27c "create two stripe file f01 =======================" @@ -3211,7 +3211,7 @@ test_99b() { TOIGNORE=$(find . -type l -printf '-I %f\n' -o \ ! -perm +4 -printf '-I %f\n') $RUNAS cvs -d $DIR/d99cvsroot import -m "nomesg" $TOIGNORE \ - d99reposname vtag rtag || error "cvs import failed" + d99reposname vtag rtag > /dev/null || error "cvs import failed" } run_test 99b "cvs import =======================================" @@ -3221,7 +3221,7 @@ test_99c() { mkdir -p $DIR/d99reposname || error "mkdir $DIR/d99reposname failed" chown $RUNAS_ID $DIR/d99reposname || \ error "chown $DIR/d99reposname failed" - $RUNAS cvs -d $DIR/d99cvsroot co d99reposname || \ + $RUNAS cvs -d $DIR/d99cvsroot co d99reposname > /dev/null || \ error "cvs co d99reposname failed" } run_test 99c "cvs checkout =====================================" @@ -3622,20 +3622,20 @@ test_103 () { echo "performing cp ..." run_acl_subtest cp || error echo "performing getfacl-noacl..." - run_acl_subtest getfacl-noacl || error + run_acl_subtest getfacl-noacl > /dev/null || error echo "performing misc..." - run_acl_subtest misc || error + run_acl_subtest misc > /dev/null || error # XXX add back permission test when we support supplementary groups. # echo "performing permissions..." # run_acl_subtest permissions || error echo "performing setfacl..." - run_acl_subtest setfacl || error + run_acl_subtest setfacl > /dev/null || error # inheritance test got from HP echo "performing inheritance..." cp $SAVE_PWD/acl/make-tree . || error chmod +x make-tree || error - run_acl_subtest inheritance || error + run_acl_subtest inheritance > /dev/null || error rm -f make-tree cd $SAVE_PWD diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 5cd12861b0..3b87e0945c 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -4,7 +4,7 @@ set -e ONLY=${ONLY:-"$*"} # bug number for skipped test: 3192 -ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b"} +ALWAYS_EXCEPT=${ALWAYS_EXCEPT:-"14b $SANITYN_EXCEPT"} # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT! [ "$SLOW" = "no" ] && EXCEPT="$EXCEPT 16" diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 7c5dc63a6b..8fed513238 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -225,8 +225,8 @@ start() { do_facet ${facet} mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} RC=${PIPESTATUS[0]} if [ $RC -ne 0 ]; then - echo mount -t lustre $@ ${device} ${MOUNT%/*}/${facet} - echo Start of ${device} on ${facet} failed ${RC} + echo "mount -t lustre $@ ${device} ${MOUNT%/*}/${facet}" + echo "Start of ${device} on ${facet} failed ${RC}" else do_facet ${facet} sync label=$(do_facet ${facet} "e2label ${device}") @@ -586,7 +586,7 @@ do_facet() { shift HOST=`facet_active_host $facet` [ -z $HOST ] && echo No host defined for facet ${facet} && exit 1 - do_node $HOST $@ + do_node $HOST "$@" } add() { diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index dfd2604c23..a843db8fed 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -109,7 +109,7 @@ check_lustre_msg_v2(void) CHECK_MEMBER(lustre_msg_v2, lm_secflvr); CHECK_MEMBER(lustre_msg_v2, lm_magic); CHECK_MEMBER(lustre_msg_v2, lm_repsize); - CHECK_MEMBER(lustre_msg_v2, lm_timeout); + CHECK_MEMBER(lustre_msg_v2, lm_cksum); CHECK_MEMBER(lustre_msg_v2, lm_padding_1); CHECK_MEMBER(lustre_msg_v2, lm_padding_2); CHECK_MEMBER(lustre_msg_v2, lm_padding_3); @@ -133,11 +133,11 @@ check_ptlrpc_body(void) CHECK_MEMBER(ptlrpc_body, pb_flags); CHECK_MEMBER(ptlrpc_body, pb_op_flags); CHECK_MEMBER(ptlrpc_body, pb_conn_cnt); + CHECK_MEMBER(ptlrpc_body, pb_timeout); + CHECK_MEMBER(ptlrpc_body, pb_service_time); CHECK_MEMBER(ptlrpc_body, pb_padding_1); CHECK_MEMBER(ptlrpc_body, pb_padding_2); CHECK_MEMBER(ptlrpc_body, pb_padding_3); - CHECK_MEMBER(ptlrpc_body, pb_padding_4); - CHECK_MEMBER(ptlrpc_body, pb_padding_5); } static void check_obd_connect_data(void) @@ -1069,6 +1069,7 @@ main(int argc, char **argv) CHECK_VALUE(MSG_LAST_REPLAY); CHECK_VALUE(MSG_RESENT); CHECK_VALUE(MSG_REPLAY); + CHECK_VALUE(MSG_AT_SUPPORT); CHECK_VALUE(MSG_CONNECT_RECOVERING); CHECK_VALUE(MSG_CONNECT_RECONNECT); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index e07146fded..992292caf4 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -28,7 +28,7 @@ void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' * (make -C lustre/utils newwirecheck) - * running on Linux pancake 2.6.18-skas3-v9-pre9 #2 Tue Oct 17 13:08:24 PDT 2006 i686 i686 i3 + * running on Linux pancake 2.6.18-skas3-v9-pre9 #1 Tue Feb 20 10:37:58 PST 2007 i686 i686 i3 * with gcc version 3.4.4 */ @@ -51,6 +51,8 @@ void lustre_assert_wire_constants(void) (long long)MSG_RESENT); LASSERTF(MSG_REPLAY == 4, " found %lld\n", (long long)MSG_REPLAY); + LASSERTF(MSG_AT_SUPPORT == 8, " found %lld\n", + (long long)MSG_AT_SUPPORT); LASSERTF(MSG_CONNECT_RECOVERING == 1, " found %lld\n", (long long)MSG_CONNECT_RECOVERING); LASSERTF(MSG_CONNECT_RECONNECT == 2, " found %lld\n", @@ -314,10 +316,10 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct lustre_msg_v2, lm_repsize)); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize) == 4, " found %lld\n", (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_repsize)); - LASSERTF((int)offsetof(struct lustre_msg_v2, lm_timeout) == 16, " found %lld\n", - (long long)(int)offsetof(struct lustre_msg_v2, lm_timeout)); - LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout) == 4, " found %lld\n", - (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_timeout)); + LASSERTF((int)offsetof(struct lustre_msg_v2, lm_cksum) == 16, " found %lld\n", + (long long)(int)offsetof(struct lustre_msg_v2, lm_cksum)); + LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lustre_msg_v2 *)0)->lm_cksum)); LASSERTF((int)offsetof(struct lustre_msg_v2, lm_padding_1) == 20, " found %lld\n", (long long)(int)offsetof(struct lustre_msg_v2, lm_padding_1)); LASSERTF((int)sizeof(((struct lustre_msg_v2 *)0)->lm_padding_1) == 4, " found %lld\n", @@ -387,26 +389,26 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct ptlrpc_body, pb_conn_cnt)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_conn_cnt)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 68, " found %lld\n", + LASSERTF((int)offsetof(struct ptlrpc_body, pb_timeout) == 68, " found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body, pb_timeout)); + LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_timeout) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_timeout)); + LASSERTF((int)offsetof(struct ptlrpc_body, pb_service_time) == 72, " found %lld\n", + (long long)(int)offsetof(struct ptlrpc_body, pb_service_time)); + LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_service_time) == 4, " found %lld\n", + (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_service_time)); + LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_1) == 76, " found %lld\n", (long long)(int)offsetof(struct ptlrpc_body, pb_padding_1)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_1)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 72, " found %lld\n", + LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_2) == 80, " found %lld\n", (long long)(int)offsetof(struct ptlrpc_body, pb_padding_2)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_2)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 76, " found %lld\n", + LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_3) == 84, " found %lld\n", (long long)(int)offsetof(struct ptlrpc_body, pb_padding_3)); LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3) == 4, " found %lld\n", (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_3)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_4) == 80, " found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body, pb_padding_4)); - LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_4)); - LASSERTF((int)offsetof(struct ptlrpc_body, pb_padding_5) == 84, " found %lld\n", - (long long)(int)offsetof(struct ptlrpc_body, pb_padding_5)); - LASSERTF((int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5) == 4, " found %lld\n", - (long long)(int)sizeof(((struct ptlrpc_body *)0)->pb_padding_5)); /* Checks for struct obd_connect_data */ LASSERTF((int)sizeof(struct obd_connect_data) == 72, " found %lld\n", @@ -481,7 +483,9 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_CONNECT_FID_CAPA == 0x100000ULL); CLASSERT(OBD_CONNECT_OSS_CAPA == 0x200000ULL); CLASSERT(OBD_CONNECT_CANCELSET == 0x400000ULL); - + CLASSERT(OBD_CONNECT_SOM == 0x00800000ULL); + CLASSERT(OBD_CONNECT_AT == 0x01000000ULL); + /* Checks for struct obdo */ LASSERTF((int)sizeof(struct obdo) == 208, " found %lld\n", (long long)(int)sizeof(struct obdo)); -- GitLab