Skip to content
Snippets Groups Projects
Commit 4ba4fa33 authored by Liu Ying's avatar Liu Ying
Browse files

Branch HEAD

 b=12521
 To avoid extent lock conflicts, if avail_cb_nodes < stripe_count*CO,
 avail_cb_nodes should divide (stripe_count*CO) exactly. So that each OST
 can be accessed by one or more constant clients.
parent b04e3e23
No related branches found
No related tags found
No related merge requests found
diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
--- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800
+++ ad_lustre/ad_lustre_aggregate.c 2008-10-15 22:26:35.000000000 +0800
@@ -0,0 +1,514 @@
+++ ad_lustre/ad_lustre_aggregate.c 2008-10-17 17:30:00.000000000 +0800
@@ -0,0 +1,502 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (C) 1997 University of Chicago.
......@@ -16,10 +16,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
+#include "adio_extern.h"
+
+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
+ int mode, int nprocs,
+ ADIO_Offset *st_offsets,
+ ADIO_Offset *end_offsets,
+ ADIO_Offset *min_st_offset_ptr)
+ int mode)
+{
+ int *striping_info = NULL;
+ /* get striping information:
......@@ -27,10 +24,8 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
+ * striping_info[1]: stripe_count
+ * striping_info[2]: avail_cb_nodes
+ */
+ int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag, i;
+ int user_cb_nodes = 0, avail_cb_nodes;
+ int nprocs_for_coll = fd->hints->cb_nodes;
+ ADIO_Offset min_st_offset, max_end_offset;
+ int stripe_size, stripe_count, CO = 1, CO_max = 1, CO_nodes, lflag;
+ int avail_cb_nodes, divisor, nprocs_for_coll = fd->hints->cb_nodes;
+ char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char));
+
+ /* Get hints value */
......@@ -66,48 +61,35 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
+ CO = atoi(value);
+ CO = ADIOI_MIN(CO_max, CO);
+ }
+ /* Calculate how many IO clients we need */
+ /* To avoid extent lock conflicts,
+ * avail_cb_nodes should divide (stripe_count*CO) exactly,
+ * so that each OST is accessed by only one or more constant clients. */
+ avail_cb_nodes = ADIOI_MIN(nprocs_for_coll, stripe_count * CO);
+
+ /* user_cb_nodes*/
+ MPI_Info_get(fd->info, "user_cb_nodes", MPI_MAX_INFO_VAL, value, &lflag);
+ if (lflag)
+ user_cb_nodes = atoi(value);
+ /* If the user doesn't change the cb_nodes and
+ * the whole file access portion is no larger than stripe size,
+ * we will perform the IO by the same process (rank0 by default).
+ */
+ /* calculate the whole file access portion */
+ min_st_offset = st_offsets[0];
+ max_end_offset = end_offsets[0];
+ for (i = 0; i < nprocs; i ++) {
+ min_st_offset = ADIOI_MIN(min_st_offset, st_offsets[i]);
+ max_end_offset = ADIOI_MAX(max_end_offset, end_offsets[i]);
+ }
+ if (!user_cb_nodes) {
+ /* Check the whole file access portion
+ * if (whole_range <= stripe_size)
+ * then always collect data to the same process;
+ * set avail_cb_nodes=1; (rank0 by default).
+ * This pattern can make good use of Lustre client cache and
+ * avoid extent lock assigning and revoking.
+ *
+ * The recent experiments show good performance. We still need more
+ * validation.
+ */
+ if ((max_end_offset > min_st_offset) &&
+ (max_end_offset - min_st_offset) <= (ADIO_Offset) stripe_size)
+ avail_cb_nodes = 1;
+ if (avail_cb_nodes == nprocs_for_coll) {
+ CO_nodes = stripe_count * CO;
+ do {
+ /* find the divisor of CO_nodes */
+ divisor = 1;
+ do {
+ divisor ++;
+ } while (CO_nodes % divisor);
+ CO_nodes = CO_nodes / divisor;
+ /* if stripe_count*CO is a prime number, change nothing */
+ if ((CO_nodes <= avail_cb_nodes) && (CO_nodes != 1)) {
+ avail_cb_nodes = CO_nodes;
+ break;
+ }
+ } while (CO_nodes != 1);
+ }
+
+ ADIOI_Free(value);
+
+ *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int));
+ striping_info = *striping_info_ptr;
+ striping_info[0] = stripe_size;
+ striping_info[1] = stripe_count;
+ striping_info[2] = avail_cb_nodes;
+
+ *min_st_offset_ptr = min_st_offset;
+ ADIOI_Free(value);
+}
+
+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
......@@ -319,8 +301,8 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
+ int *count_my_req_per_proc,
+ ADIOI_Access * my_req,
+ int nprocs, int myrank,
+ ADIO_Offset req_len,
+ ADIO_Offset min_st_offset,
+ ADIO_Offset start_offset,
+ ADIO_Offset end_offset,
+ int *striping_info,
+ int *count_others_req_procs_ptr,
+ ADIOI_Access ** others_req_ptr)
......@@ -334,7 +316,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
+ MPI_Status *statuses;
+ ADIOI_Access *others_req;
+ char *value = NULL;
+ ADIO_Offset off, avail_len, rem_len, *all_lens;
+ ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens;
+
+ /* There are two hints, which could reduce some MPI communication overhead,
+ * if the users knows the I/O pattern and set them correctly. */
......@@ -365,20 +347,26 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
+ for (i = 0; i < nprocs; i++) {
+ others_req[i].count = 0;
+ }
+ req_len = end_offset - start_offset + 1;
+ all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset));
+
+ /* same req size ? */
+ if (samesize == 0) {
+ /* calculate the min_st_offset */
+ MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG,
+ MPI_MIN, fd->comm);
+ /* exchange request length */
+ MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET,
+ fd->comm);
+ } else { /* same request size */
+ /* calculate the 1st request's offset */
+ min_st_offset = start_offset - myrank * req_len;
+ /* assign request length to all_lens[] */
+ for (i = 0; i < nprocs; i ++)
+ all_lens[i] = req_len;
+ }
+ if (myrank < avail_cb_nodes) {
+ /* It's a IO client and it will receive data from others */
+ /* This is a IO client and it will receive data from others */
+ off = min_st_offset;
+ /* calcaulte other_req[i].count */
+ for (i = 0; i < nprocs; i++) {
......@@ -518,7 +506,7 @@ diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c
+}
diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
--- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800
+++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800
+++ ad_lustre/ad_lustre.c 2008-10-17 17:03:42.000000000 +0800
@@ -1,9 +1,11 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
......@@ -533,73 +521,24 @@ diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c
*/
#include "ad_lustre.h"
@@ -13,13 +15,13 @@
@@ -13,12 +15,12 @@
ADIOI_LUSTRE_ReadContig, /* ReadContig */
ADIOI_LUSTRE_WriteContig, /* WriteContig */
ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */
- ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */
+ ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */
ADIOI_GEN_SeekIndividual, /* SeekIndividual */
- ADIOI_GEN_Fcntl, /* Fcntl */
+ ADIOI_LUSTRE_Fcntl, /* Fcntl */
ADIOI_GEN_Fcntl, /* Fcntl */
ADIOI_LUSTRE_SetInfo, /* SetInfo */
ADIOI_GEN_ReadStrided, /* ReadStrided */
- ADIOI_GEN_WriteStrided, /* WriteStrided */
- ADIOI_GEN_Close, /* Close */
+ ADIOI_LUSTRE_WriteStrided, /* WriteStrided */
+ ADIOI_LUSTRE_Close, /* Close */
ADIOI_GEN_Close, /* Close */
#if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE)
ADIOI_GEN_IreadContig, /* IreadContig */
ADIOI_GEN_IwriteContig, /* IwriteContig */
diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c
--- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800
+++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800
@@ -0,0 +1,42 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ *
+ * Copyright (C) 1997 University of Chicago.
+ * See COPYRIGHT notice in top-level directory.
+ *
+ * Copyright (C) 2007 Oak Ridge National Laboratory
+ *
+ * Copyright (C) 2008 Sun Microsystems, Lustre group
+ */
+
+#include "ad_lustre.h"
+
+#ifdef PROFILE
+#include "mpe.h"
+#endif
+
+void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code)
+{
+ int err, derr = 0;
+ static char myname[] = "ADIOI_LUSTRE_CLOSE";
+
+#ifdef PROFILE
+ MPE_Log_event(9, 0, "start close");
+#endif
+
+ err = close(fd->fd_sys);
+
+#ifdef PROFILE
+ MPE_Log_event(10, 0, "end close");
+#endif
+
+ fd->fd_sys = -1;
+
+ if (err == -1 || derr == -1) {
+ *error_code =
+ MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname,
+ __LINE__, MPI_ERR_IO, "**io", "**io %s",
+ strerror(errno));
+ } else
+ *error_code = MPI_SUCCESS;
+}
diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
--- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800
+++ ad_lustre/ad_lustre.h 2008-10-15 21:22:52.000000000 +0800
+++ ad_lustre/ad_lustre.h 2008-10-17 17:11:11.000000000 +0800
@@ -1,9 +1,11 @@
/* -*- Mode: C; c-basic-offset:4 ; -*- */
-/*
......@@ -647,7 +586,7 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
#include "adio.h"
/*#include "adioi.h"*/
@@ -41,24 +68,56 @@
@@ -41,24 +68,31 @@
void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code);
void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code);
......@@ -693,31 +632,6 @@ diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h
int *error_code);
void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code);
-
+void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr,
+ int mode, int nprocs,
+ ADIO_Offset *st_offsets,
+ ADIO_Offset *end_offsets,
+ ADIO_Offset *min_st_offset);
+int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off,
+ ADIO_Offset *len, int *striping_info);
+void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list,
+ int *len_list, int contig_access_count,
+ int *striping_info, int nprocs,
+ int *count_my_req_procs_ptr,
+ int **count_my_req_per_proc_ptr,
+ ADIOI_Access ** my_req_ptr,
+ int **buf_idx_ptr);
+int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count,
+ int *len_list, int nprocs);
+void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs,
+ int *count_my_req_per_proc,
+ ADIOI_Access * my_req,
+ int nprocs, int myrank,
+ ADIO_Offset req_len,
+ ADIO_Offset min_st_offset,
+ int *striping_info,
+ int *count_others_req_procs_ptr,
+ ADIOI_Access ** others_req_ptr);
#endif /* End of AD_UNIX_INCLUDE */
diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c
--- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800
......@@ -1269,8 +1183,8 @@ diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c
#define _XOPEN_SOURCE 600
diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
--- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800
+++ ad_lustre/ad_lustre_wrcoll.c 2008-10-15 22:02:53.000000000 +0800
@@ -0,0 +1,883 @@
+++ ad_lustre/ad_lustre_wrcoll.c 2008-10-17 16:34:36.000000000 +0800
@@ -0,0 +1,880 @@
+/* -*- Mode: C; c-basic-offset:4 ; -*- */
+/*
+ * Copyright (C) 1997 University of Chicago.
......@@ -1346,7 +1260,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
+ int i, filetype_is_contig, nprocs, myrank, do_collect = 0;
+ int contig_access_count = 0, buftype_is_contig, interleave_count = 0;
+ int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs;
+ ADIO_Offset orig_fp, start_offset, end_offset, off, min_st_offset;
+ ADIO_Offset orig_fp, start_offset, end_offset, off;
+ ADIO_Offset *offset_list = NULL, *st_offsets = NULL, *end_offsets = NULL;
+ int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL;
+ int old_error, tmp_error;
......@@ -1427,9 +1341,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
+ }
+
+ /* Get Lustre hints information */
+ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs,
+ st_offsets, end_offsets,
+ &min_st_offset);
+ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1);
+ /* calculate what portions of the access requests of this process are
+ * located in which process
+ */
......@@ -1440,8 +1352,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c
+ ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs,
+ count_my_req_per_proc,
+ my_req, nprocs, myrank,
+ end_offset - start_offset + 1,
+ min_st_offset, striping_info,
+ start_offset, end_offset, striping_info,
+ &count_others_req_procs, &others_req);
+ ADIOI_Free(count_my_req_per_proc);
+
......@@ -2632,7 +2543,7 @@ diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c
+}
diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
--- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800
+++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800
+++ ad_lustre/Makefile.in 2008-10-17 17:03:06.000000000 +0800
@@ -16,7 +16,9 @@
@VPATH@
......@@ -2646,8 +2557,8 @@ diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in
@if [ "@ENABLE_SHLIB@" != "none" ] ; then \
diff -ruN ad_lustre_orig/README ad_lustre/README
--- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800
+++ ad_lustre/README 2008-10-15 22:43:07.000000000 +0800
@@ -5,6 +5,25 @@
+++ ad_lustre/README 2008-10-17 16:50:15.000000000 +0800
@@ -5,6 +5,23 @@
o To post the code for ParColl (Partitioned collective IO)
-----------------------------------------------------
......@@ -2664,10 +2575,8 @@ diff -ruN ad_lustre_orig/README ad_lustre/README
+ same_io_size to remove unnecessary MPI_Alltoall()
+ o Control read-modify-write in data sieving in collective IO
+ by hint ds_in_coll.
+ o Optimize the IO pattern.
+ - If the whole access size <= stripe size, we suggest all the
+ IO data will be performed by the same client, to avoid the
+ extent lock revoking and reassignment.
+ o Reduce extent lock conflicts by make each OST accessed by one or
+ more constant clients.
+
+-----------------------------------------------------
V04:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment