From 874f562aebeb86c9cc3c9b19d3c60e919489d905 Mon Sep 17 00:00:00 2001 From: alex <alex> Date: Fri, 9 Jul 2004 07:15:17 +0000 Subject: [PATCH] b=3405 robert's fixes: - add MSG_CONNECT_INITIAL flag for the initial connect message from a client. When server sees an old client reconnect with that flag, then it needs to set the handles as if it was a new client. - set the initial imp_conn_cnt from the server's conn_cnt. When client connects with INITIAL set, the server sends back the old conn_cnt+1. - timeout for inter-MDS requests is set to obd_timeout / 2 --- lustre/include/linux/lustre_idl.h | 1 + lustre/include/linux/lustre_lib.h | 2 +- lustre/ldlm/ldlm_lib.c | 32 +++++++++++++++++-------------- lustre/ptlrpc/client.c | 5 ++++- lustre/ptlrpc/import.c | 15 +++++++++++---- 5 files changed, 35 insertions(+), 20 deletions(-) diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 4ab5ce813e..588882627f 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -212,6 +212,7 @@ static inline void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags) #define MSG_CONNECT_REPLAYABLE 0x4 //#define MSG_CONNECT_PEER 0x8 #define MSG_CONNECT_LIBCLIENT 0x10 +#define MSG_CONNECT_INITIAL 0x20 /* * OST requests: OBDO & OBD request records diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index f354ecb6d5..b57146d228 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -79,7 +79,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler); int target_handle_disconnect(struct ptlrpc_request *req); void target_destroy_export(struct obd_export *exp); int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, - struct obd_uuid *cluuid); + struct obd_uuid *cluuid, int); int target_handle_ping(struct ptlrpc_request *req); void target_cancel_recovery_timer(struct obd_device *obd); diff --git a/lustre/ldlm/ldlm_lib.c
b/lustre/ldlm/ldlm_lib.c index b6ae61c98f..d764484870 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -333,19 +333,16 @@ int client_disconnect_export(struct obd_export *exp, int failover) * -------------------------------------------------------------------------- */ int target_handle_reconnect(struct lustre_handle *conn, struct obd_export *exp, - struct obd_uuid *cluuid) + struct obd_uuid *cluuid, int initial_conn) { - if (exp->exp_connection) { + if (exp->exp_connection && !initial_conn) { struct lustre_handle *hdl; hdl = &exp->exp_imp_reverse->imp_remote_handle; /* Might be a re-connect after a partition. */ -#warning "FIXME ASAP" - memcpy(&hdl->cookie, &conn->cookie, sizeof(conn->cookie)); - if (1 || !memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) { + if (!memcmp(&conn->cookie, &hdl->cookie, sizeof conn->cookie)) { CERROR("%s reconnecting\n", cluuid->uuid); conn->cookie = exp->exp_handle.h_cookie; - /*RETURN(EALREADY);*/ - RETURN(0); + RETURN(EALREADY); } else { CERROR("%s reconnecting from %s, " "handle mismatch (ours "LPX64", theirs " @@ -377,6 +374,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) char *str, *tmp; int rc = 0, abort_recovery; unsigned long flags; + int initial_conn = 0; ENTRY; OBD_RACE(OBD_FAIL_TGT_CONN_RACE); @@ -444,6 +442,9 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) if (obd_uuid_equals(&cluuid, &target->obd_uuid)) goto dont_check_exports; + if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_INITIAL) + initial_conn = 1; + spin_lock(&target->obd_dev_lock); list_for_each(p, &target->obd_exports) { export = list_entry(p, struct obd_export, exp_obd_chain); @@ -451,7 +452,8 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) spin_unlock(&target->obd_dev_lock); LASSERT(export->exp_obd == target); - rc = target_handle_reconnect(&conn, export, &cluuid); + rc = target_handle_reconnect(&conn, export, &cluuid, + 
initial_conn); break; } export = NULL; @@ -459,15 +461,16 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) /* If we found an export, we already unlocked. */ if (!export) { spin_unlock(&target->obd_dev_lock); - } else if (req->rq_reqmsg->conn_cnt == 1) { + } else if (req->rq_reqmsg->conn_cnt == 1 && !initial_conn) { CERROR("%s reconnected with 1 conn_cnt; cookies not random?\n", cluuid.uuid); -#warning "FIXME ASAP" - /*GOTO(out, rc = -EALREADY);*/ + GOTO(out, rc = -EALREADY); } /* Tell the client if we're in recovery. */ /* If this is the first client, start the recovery timer */ + CWARN("%s: connection from %s %s\n", target->obd_name, cluuid.uuid, + target->obd_recovering ? "(recovering)" : ""); if (target->obd_recovering) { lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECOVERING); target_start_recovery_timer(target, handler); @@ -516,14 +519,15 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler) LASSERT(export != NULL); spin_lock_irqsave(&export->exp_lock, flags); -#warning "FIXME ASAP" - if (0 && export->exp_conn_cnt >= req->rq_reqmsg->conn_cnt) { + if (initial_conn) { + req->rq_repmsg->conn_cnt = export->exp_conn_cnt + 1; + } else if (export->exp_conn_cnt >= req->rq_reqmsg->conn_cnt) { CERROR("%s: already connected at a higher conn_cnt: %d > %d\n", cluuid.uuid, export->exp_conn_cnt, req->rq_reqmsg->conn_cnt); spin_unlock_irqrestore(&export->exp_lock, flags); GOTO(out, rc = -EALREADY); - } + } export->exp_conn_cnt = req->rq_reqmsg->conn_cnt; spin_unlock_irqrestore(&export->exp_lock, flags); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 452f3edd43..5a4f9ae9cf 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -203,7 +203,10 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, RETURN(NULL); } - request->rq_timeout = obd_timeout; + if (imp->imp_server_timeout) + request->rq_timeout = obd_timeout / 2; + else + request->rq_timeout = 
obd_timeout; request->rq_send_state = LUSTRE_IMP_FULL; request->rq_type = PTL_RPC_MSG_REQUEST; request->rq_import = class_import_get(imp); diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index e2106b480b..86f3adf64c 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -255,14 +255,13 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING); - imp->imp_conn_cnt++; imp->imp_resend_replay = 0; if (imp->imp_remote_handle.cookie == 0) { initial_connect = 1; } else { committed_before_reconnect = imp->imp_peer_committed_transno;; - + imp->imp_conn_cnt++; } @@ -322,8 +321,11 @@ int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid) aa->pcaa_peer_committed = committed_before_reconnect; aa->pcaa_initial_connect = initial_connect; - if (aa->pcaa_initial_connect) - imp->imp_replayable = 1; + if (aa->pcaa_initial_connect) { + lustre_msg_add_op_flags(request->rq_reqmsg, + MSG_CONNECT_INITIAL); + imp->imp_replayable = 1; + } ptlrpcd_add_req(request); rc = 0; @@ -366,6 +368,10 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, } else { imp->imp_replayable = 0; } + LASSERTF(imp->imp_conn_cnt < request->rq_repmsg->conn_cnt, + "imp conn_cnt %d req conn_cnt %d", + imp->imp_conn_cnt, request->rq_repmsg->conn_cnt); + imp->imp_conn_cnt = request->rq_repmsg->conn_cnt; imp->imp_remote_handle = request->rq_repmsg->handle; IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL); GOTO(finish, rc = 0); @@ -412,6 +418,7 @@ static int ptlrpc_connect_interpret(struct ptlrpc_request *request, imp->imp_last_replay_transno = 0; IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY); } else { + CWARN("oops! we get evicted from %s\n", imp->imp_target_uuid.uuid); imp->imp_remote_handle = request->rq_repmsg->handle; IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED); } -- GitLab