diff --git a/lustre/contrib/adio_driver_mpich2-1.0.7.patch b/lustre/contrib/adio_driver_mpich2-1.0.7.patch index 5f1daa365d6bba529ebffc62fc7050c262f8054f..f05269adef50949848a9eedb8e3ed2518f9700e2 100644 --- a/lustre/contrib/adio_driver_mpich2-1.0.7.patch +++ b/lustre/contrib/adio_driver_mpich2-1.0.7.patch @@ -1,7 +1,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c --- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_aggregate.c 2008-10-15 22:26:35.000000000 +0800 -@@ -0,0 +1,514 @@ ++++ ad_lustre/ad_lustre_aggregate.c 2008-10-17 17:30:00.000000000 +0800 +@@ -0,0 +1,502 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -16,10 +16,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c +#include "adio_extern.h" + +void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, -+ int mode, int nprocs, -+ ADIO_Offset *st_offsets, -+ ADIO_Offset *end_offsets, -+ ADIO_Offset *min_st_offset_ptr) ++ int mode) +{ + int *striping_info = NULL; + /* get striping information: @@ -27,10 +24,8 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + * striping_info[1]: stripe_count + * striping_info[2]: avail_cb_nodes + */ -+ int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag, i; -+ int user_cb_nodes = 0, avail_cb_nodes; -+ int nprocs_for_coll = fd->hints->cb_nodes; -+ ADIO_Offset min_st_offset, max_end_offset; ++ int stripe_size, stripe_count, CO = 1, CO_max = 1, CO_nodes, lflag; ++ int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes; + char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + + /* Get hints value */ @@ -66,48 +61,35 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + CO = atoi(value); + CO = ADIOI_MIN(CO_max, CO); + } ++ /* Calculate how many IO clients we need */ ++ /* To avoid extent lock conflicts, ++ * avail_cb_nodes should divide (stripe_count*CO) exactly, ++ * so that each OST is accessed by only one or more constant clients. */ + avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO); -+ -+ /* user_cb_nodes*/ -+ MPI_Info_get(fd->info, "user_cb_nodes", MPI_MAX_INFO_VAL, value, &lflag); -+ if (lflag) -+ user_cb_nodes = atoi(value); -+ /* If the user doesn't change the cb_nodes and -+ * the whole file access portion is no larger than stripe size, -+ * we will perform the IO by the same process (rank0 by default). -+ */ -+ /* calculate the whole file access portion */ -+ min_st_offset = st_offsets[0]; -+ max_end_offset = end_offsets[0]; -+ for (i = 0; i < nprocs; i ++) { -+ min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]); -+ max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]); -+ } -+ if (!user_cb_nodes) { -+ /* Check the whole file access portion -+ * if (whole_range <= stripe_size) -+ * then always collect data to the same process; -+ * set avail_cb_nodes=1; (rank0 by default). -+ * This pattern can make good use of Lustre client cache and -+ * avoid extent lock assigning and revoking. -+ * -+ * The recent experiments show good performance. We still need more -+ * validation. -+ */ -+ if ((max_end_offset > min_st_offset) && -+ (max_end_offset - min_st_offset) <= (ADIO_Offset) stripe_size) -+ avail_cb_nodes = 1; ++ if (avail_cb_nodes == nprocs_for_coll) { ++ CO_nodes = stripe_count * CO; ++ do { ++ /* find the divisor of CO_nodes */ ++ divisor = 1; ++ do { ++ divisor ++; ++ } while (CO_nodes % divisor); ++ CO_nodes = CO_nodes / divisor; ++ /* if stripe_count*CO is a prime number, change nothing */ ++ if ((CO_nodes <= avail_cb_nodes) && (CO_nodes != 1)) { ++ avail_cb_nodes = CO_nodes; ++ break; ++ } ++ } while (CO_nodes != 1); + } + -+ ADIOI_Free(value); -+ + *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int)); + striping_info = *striping_info_ptr; + striping_info[0] = stripe_size; + striping_info[1] = stripe_count; + striping_info[2] = avail_cb_nodes; + -+ *min_st_offset_ptr = min_st_offset; ++ ADIOI_Free(value); +} + +int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, @@ -319,8 +301,8 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + int *count_my_req_per_proc, + ADIOI_Access * my_req, + int nprocs, int myrank, -+ ADIO_Offset req_len, -+ ADIO_Offset min_st_offset, ++ ADIO_Offset start_offset, ++ ADIO_Offset end_offset, + int *striping_info, + int *count_others_req_procs_ptr, + ADIOI_Access ** others_req_ptr) @@ -334,7 +316,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + MPI_Status *statuses; + ADIOI_Access *others_req; + char *value = NULL; -+ ADIO_Offset off, avail_len, rem_len, *all_lens; ++ ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens; + + /* There are two hints, which could reduce some MPI communication overhead, + * if the users knows the I/O pattern and set them correctly. */ @@ -365,20 +347,26 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c + for (i = 0; i < nprocs; i++) { + others_req[i].count = 0; + } ++ req_len = end_offset - start_offset + 1; + all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); + + /* same req size ? */ + if (samesize == 0) { ++ /* calculate the min_st_offset */ ++ MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG, ++ MPI_MIN, fd->comm); + /* exchange request length */ + MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET, + fd->comm); + } else { /* same request size */ ++ /* calculate the 1st request's offset */ ++ min_st_offset = start_offset - myrank * req_len; + /* assign request length to all_lens[] */ + for (i = 0; i < nprocs; i ++) + all_lens[i] = req_len; + } + if (myrank < avail_cb_nodes) { -+ /* It's a IO client and it will receive data from others */ ++ /* This is a IO client and it will receive data from others */ + off = min_st_offset; + /* calcaulte other_req[i].count */ + for (i = 0; i < nprocs; i++) { @@ -518,7 +506,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c +} diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c --- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800 ++++ ad_lustre/ad_lustre.c 2008-10-17 17:03:42.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -533,73 +521,24 @@ diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c */ #include "ad_lustre.h" -@@ -13,13 +15,13 @@ +@@ -13,12 +15,12 @@ ADIOI_LUSTRE_ReadContig, /* ReadContig */ ADIOI_LUSTRE_WriteContig, /* WriteContig */ ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ - ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */ + ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */ ADIOI_GEN_SeekIndividual, /* SeekIndividual */ -- ADIOI_GEN_Fcntl, /* Fcntl */ -+ ADIOI_LUSTRE_Fcntl, /* Fcntl */ + ADIOI_GEN_Fcntl, /* Fcntl */ ADIOI_LUSTRE_SetInfo, /* SetInfo */ ADIOI_GEN_ReadStrided, /* ReadStrided */ - ADIOI_GEN_WriteStrided, /* WriteStrided */ -- ADIOI_GEN_Close, /* Close */ + ADIOI_LUSTRE_WriteStrided, /* WriteStrided */ -+ ADIOI_LUSTRE_Close, /* Close */ + ADIOI_GEN_Close, /* Close */ #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE) ADIOI_GEN_IreadContig, /* IreadContig */ - ADIOI_GEN_IwriteContig, /* IwriteContig */ -diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c ---- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800 -@@ -0,0 +1,42 @@ -+/* -*- Mode: C; c-basic-offset:4 ; -*- */ -+/* -+ * -+ * Copyright (C) 1997 University of Chicago. -+ * See COPYRIGHT notice in top-level directory. -+ * -+ * Copyright (C) 2007 Oak Ridge National Laboratory -+ * -+ * Copyright (C) 2008 Sun Microsystems, Lustre group -+ */ -+ -+#include "ad_lustre.h" -+ -+#ifdef PROFILE -+#include "mpe.h" -+#endif -+ -+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code) -+{ -+ int err, derr = 0; -+ static char myname[] = "ADIOI_LUSTRE_CLOSE"; -+ -+#ifdef PROFILE -+ MPE_Log_event(9, 0, "start close"); -+#endif -+ -+ err = close(fd->fd_sys); -+ -+#ifdef PROFILE -+ MPE_Log_event(10, 0, "end close"); -+#endif -+ -+ fd->fd_sys = -1; -+ -+ if (err == -1 || derr == -1) { -+ *error_code = -+ MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, -+ __LINE__, MPI_ERR_IO, "**io", "**io %s", -+ strerror(errno)); -+ } else -+ *error_code = MPI_SUCCESS; -+} diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h --- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/ad_lustre.h 2008-10-15 21:22:52.000000000 +0800 ++++ ad_lustre/ad_lustre.h 2008-10-17 17:11:11.000000000 +0800 @@ -1,9 +1,11 @@ /* -*- Mode: C; c-basic-offset:4 ; -*- */ -/* @@ -647,7 +586,7 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h #include "adio.h" /*#include "adioi.h"*/ -@@ -41,24 +68,56 @@ +@@ -41,24 +68,31 @@ void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code); void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); @@ -693,31 +632,6 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h int *error_code); void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); - -+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, -+ int mode, int nprocs, -+ ADIO_Offset *st_offsets, -+ ADIO_Offset *end_offsets, -+ ADIO_Offset *min_st_offset); -+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, -+ ADIO_Offset *len, int *striping_info); -+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, -+ int *len_list, int contig_access_count, -+ int *striping_info, int nprocs, -+ int *count_my_req_procs_ptr, -+ int **count_my_req_per_proc_ptr, -+ ADIOI_Access ** my_req_ptr, -+ int **buf_idx_ptr); -+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count, -+ int *len_list, int nprocs); -+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs, -+ int *count_my_req_per_proc, -+ ADIOI_Access * my_req, -+ int nprocs, int myrank, -+ ADIO_Offset req_len, -+ ADIO_Offset min_st_offset, -+ int *striping_info, -+ int *count_others_req_procs_ptr, -+ ADIOI_Access ** others_req_ptr); #endif /* End of AD_UNIX_INCLUDE */ diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c --- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800 @@ -1269,8 +1183,8 @@ diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c #define _XOPEN_SOURCE 600 diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c --- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800 -+++ ad_lustre/ad_lustre_wrcoll.c 2008-10-15 22:02:53.000000000 +0800 -@@ -0,0 +1,883 @@ ++++ ad_lustre/ad_lustre_wrcoll.c 2008-10-17 16:34:36.000000000 +0800 +@@ -0,0 +1,880 @@ +/* -*- Mode: C; c-basic-offset:4 ; -*- */ +/* + * Copyright (C) 1997 University of Chicago. @@ -1346,7 +1260,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + int i, filetype_is_contig, nprocs, myrank, do_collect = 0; + int contig_access_count = 0, buftype_is_contig, interleave_count = 0; + int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; -+ ADIO_Offset orig_fp, start_offset, end_offset, off, min_st_offset; ++ ADIO_Offset orig_fp, start_offset, end_offset, off; + ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL; + int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL; + int old_error, tmp_error; @@ -1427,9 +1341,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + } + + /* Get Lustre hints information */ -+ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs, -+ st_offsets, end_offsets, -+ &min_st_offset); ++ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1); + /* calculate what portions of the access requests of this process are + * located in which process + */ @@ -1440,8 +1352,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c + ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs, + count_my_req_per_proc, + my_req, nprocs, myrank, -+ end_offset - start_offset + 1, -+ min_st_offset, striping_info, ++ start_offset, end_offset, striping_info, + &count_others_req_procs, &others_req); + ADIOI_Free(count_my_req_per_proc); + @@ -2632,7 +2543,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c +} diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in --- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800 ++++ ad_lustre/Makefile.in 2008-10-17 17:03:06.000000000 +0800 @@ -16,7 +16,9 @@ @VPATH@ @@ -2646,8 +2557,8 @@ diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in @if [ "@ENABLE_SHLIB@" != "none" ] ; then \ diff -ruN ad_lustre_orig/README ad_lustre/README --- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800 -+++ ad_lustre/README 2008-10-15 22:43:07.000000000 +0800 -@@ -5,6 +5,25 @@ ++++ ad_lustre/README 2008-10-17 16:50:15.000000000 +0800 +@@ -5,6 +5,23 @@ o To post the code for ParColl (Partitioned collective IO) ----------------------------------------------------- @@ -2664,10 +2575,8 @@ diff -ruN ad_lustre_orig/README ad_lustre/README + same_io_size to remove unnecessary MPI_Alltoall() + o Control read-modify-write in data sieving in collective IO + by hint ds_in_coll. -+ o Optimize the IO pattern. -+ - If the whole access size <= stripe size, we suggest all the -+ IO data will be performed by the same client, to avoid the -+ extent lock revoking and reassignment. ++ o Reduce extent lock conflicts by make each OST accessed by one or ++ more constant clients. + +----------------------------------------------------- V04: