diff --git a/lustre/contrib/adio_driver_mpich2-1.0.7.patch b/lustre/contrib/adio_driver_mpich2-1.0.7.patch new file mode 100644 index 0000000000000000000000000000000000000000..723dd2f3ee15940af675e30a672395804bbfebaa --- /dev/null +++ b/lustre/contrib/adio_driver_mpich2-1.0.7.patch @@ -0,0 +1,2922 @@ +diff -ruN ad_lustre_orig/ad_lustre_aggregate.c ad_lustre/ad_lustre_aggregate.c +--- ad_lustre_orig/ad_lustre_aggregate.c 1970-01-01 08:00:00.000000000 +0800 ++++ ad_lustre/ad_lustre_aggregate.c 2008-09-17 18:20:35.000000000 +0800 +@@ -0,0 +1,676 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ * ++ * Copyright (C) 2007 Oak Ridge National Laboratory ++ * ++ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ */ ++ ++#include "ad_lustre.h" ++#include "adio_extern.h" ++ ++void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, ++ int mode, int nprocs) ++{ ++ int *striping_info = NULL; ++ /* get striping information: ++ * striping_info[0] = stripe_size; ++ * striping_info[1] = stripe_count; ++ * striping_info[2] = CO; ++ */ ++ /* for easy understanding, we name some variables */ ++ int stripe_size, stripe_count, CO = 1, CO_max = 1, lflag; ++ char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); ++ /* stripe size */ ++ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag); ++ if (lflag) ++ stripe_size = atoi(value); ++ /* stripe count */ ++ MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, &lflag); ++ if (lflag) ++ stripe_count = atoi(value); ++ /* stripe_size and stripe_count have been validated in ADIOI_LUSTRE_Open() */ ++ ++ /* calculate CO */ ++ if (!mode) { ++ /* for collective read, ++ * if "CO" clients access the same OST simultaneously, ++ * the OST disk seek time would be large. So, to avoid this, ++ * it might be better if 1 client only accesses 1 OST. ++ * So, we set CO = 1 to meet the above requirement. ++ */ ++ CO = 1; ++ /*XXX: maybe there are other better way for collective read */ ++ } else { ++ /* CO_max: the largest number of IO clients for each ost group */ ++ CO_max = (nprocs - 1)/ stripe_count + 1; ++ /* CO also has been validated in ADIOI_LUSTRE_Open(), >0 */ ++ MPI_Info_get(fd->info, "CO", MPI_MAX_INFO_VAL, value, &lflag); ++ if (lflag) ++ CO = atoi(value); ++ CO = ADIOI_MIN(CO_max, CO); ++ } ++ ADIOI_Free(value); ++ /* although there are known "N" hints so far, we still malloc space here ++ * instead of declaring an array[3] outside, ++ * because on one hand in the future we probably need more hints, and ++ * on the other hand this function can be called by ++ * both collective read and write conveniently. ++ */ ++ *striping_info_ptr = (int *) ADIOI_Malloc(3 * sizeof(int)); ++ striping_info = *striping_info_ptr; ++ striping_info[0] = stripe_size; ++ striping_info[1] = stripe_count; ++ striping_info[2] = CO; ++} ++ ++int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, ++ ADIO_Offset *len, int nprocs, ++ int *striping_info) ++{ ++ /* please refer the comments in above function for the detailed algorithm */ ++ int rank_index; ++ ADIO_Offset avail_bytes; ++ ++ int stripe_size = striping_info[0]; ++ int stripe_count = striping_info[1]; ++ int CO = striping_info[2]; ++ int avail_nprocs = ADIOI_MIN(stripe_count * CO, nprocs); ++ ++ /* calculate the rank by offset directly */ ++ rank_index = (int)((off / stripe_size) % avail_nprocs); ++ /* XXX: the above method is so simple that the processes in top ranks are always ++ * chosen to be I/O clients. we hope they are different each time. ++ */ ++ ++ avail_bytes = (off / (ADIO_Offset)stripe_size + 1) * ++ (ADIO_Offset)stripe_size - off; ++ if (avail_bytes < *len) { ++ /* this proc only has part of the requested contig. region */ ++ *len = avail_bytes; ++ } ++ ++ return rank_index; ++} ++ ++void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ++ int *len_list, int contig_access_count, ++ int *striping_info, int nprocs, ++ int *count_my_req_procs_ptr, ++ int **count_my_req_per_proc_ptr, ++ ADIOI_Access ** my_req_ptr, ++ int **buf_idx_ptr) ++{ ++ int *count_my_req_per_proc, count_my_req_procs, *buf_idx; ++ int i, l, proc; ++ ADIO_Offset avail_len, rem_len, curr_idx, off; ++ ADIOI_Access *my_req; ++ ++ *count_my_req_per_proc_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); ++ count_my_req_per_proc = *count_my_req_per_proc_ptr; ++ ++ /* buf_idx is relevant only if buftype_is_contig. ++ * buf_idx[i] gives the index into user_buf where data received ++ * from proc. i should be placed. This allows receives to be done ++ * without extra buffer. This can't be done if buftype is not contig. ++ */ ++ buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ /* initialize buf_idx to -1 */ ++ for (i = 0; i < nprocs; i++) ++ buf_idx[i] = -1; ++ ++ /* one pass just to calculate how much space to allocate for my_req; ++ * contig_access_count was calculated way back in ADIOI_Calc_my_off_len() ++ */ ++ for (i = 0; i < contig_access_count; i++) { ++ /* short circuit offset/len processing if len == 0 ++ * (zero-byte read/write ++ */ ++ if (len_list[i] == 0) ++ continue; ++ off = offset_list[i]; ++ avail_len = len_list[i]; ++ /* we set avail_len to be the total size of the access. ++ * then ADIOI_LUSTRE_Calc_aggregator() will modify the value to return ++ * the amount that was available. ++ */ ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, ++ striping_info); ++ count_my_req_per_proc[proc]++; ++ /* figure out how many data is remaining in the access ++ * we'll take care of this data (if there is any) ++ * in the while loop below. ++ */ ++ rem_len = len_list[i] - avail_len; ++ ++ while (rem_len != 0) { ++ off += avail_len; /* point to first remaining byte */ ++ avail_len = rem_len; /* save remaining size, pass to calc */ ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, ++ striping_info); ++ count_my_req_per_proc[proc]++; ++ rem_len -= avail_len; /* reduce remaining length by amount from fd */ ++ } ++ } ++ ++ *my_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * sizeof(ADIOI_Access)); ++ my_req = *my_req_ptr; ++ ++ count_my_req_procs = 0; ++ for (i = 0; i < nprocs; i++) { ++ if (count_my_req_per_proc[i]) { ++ my_req[i].offsets = (ADIO_Offset *) ++ ADIOI_Malloc(count_my_req_per_proc[i] * ++ sizeof(ADIO_Offset)); ++ my_req[i].lens = (int *) ADIOI_Malloc(count_my_req_per_proc[i] * ++ sizeof(int)); ++ count_my_req_procs++; ++ } ++ my_req[i].count = 0; /* will be incremented where needed later */ ++ } ++ ++ /* now fill in my_req */ ++ curr_idx = 0; ++ for (i = 0; i < contig_access_count; i++) { ++ if (len_list[i] == 0) ++ continue; ++ off = offset_list[i]; ++ avail_len = len_list[i]; ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, ++ striping_info); ++ ++ /* for each separate contiguous access from this process */ ++ if (buf_idx[proc] == -1) ++ buf_idx[proc] = (int) curr_idx; ++ ++ l = my_req[proc].count; ++ curr_idx += (int) avail_len; /* NOTE: Why is curr_idx an int? Fix? */ ++ ++ rem_len = len_list[i] - avail_len; ++ ++ /* store the proc, offset, and len information in an array ++ * of structures, my_req. Each structure contains the ++ * offsets and lengths located in that process's FD, ++ * and the associated count. ++ */ ++ my_req[proc].offsets[l] = off; ++ my_req[proc].lens[l] = (int) avail_len; ++ my_req[proc].count++; ++ ++ while (rem_len != 0) { ++ off += avail_len; ++ avail_len = rem_len; ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, nprocs, ++ striping_info); ++ if (buf_idx[proc] == -1) ++ buf_idx[proc] = (int) curr_idx; ++ ++ l = my_req[proc].count; ++ curr_idx += avail_len; ++ rem_len -= avail_len; ++ ++ my_req[proc].offsets[l] = off; ++ my_req[proc].lens[l] = (int) avail_len; ++ my_req[proc].count++; ++ } ++ } ++ ++#ifdef AGG_DEBUG ++ for (i = 0; i < nprocs; i++) { ++ if (count_my_req_per_proc[i] > 0) { ++ FPRINTF(stdout, "data needed from %d (count = %d):\n", ++ i, my_req[i].count); ++ for (l = 0; l < my_req[i].count; l++) { ++ FPRINTF(stdout, " off[%d] = %lld, len[%d] = %d\n", ++ l, my_req[i].offsets[l], l, my_req[i].lens[l]); ++ } ++ } ++ } ++#endif ++#if 0 ++ for (i = 0; i < nprocs; i++) { ++ FPRINTF(stdout, "buf_idx[%d] = 0x%x\n", i, buf_idx[i]); ++ } ++#endif ++ ++ *count_my_req_procs_ptr = count_my_req_procs; ++ *buf_idx_ptr = buf_idx; ++} ++ ++int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count, ++ int *len_list, int nprocs) ++{ ++ /* Algorithm: ++ * So far, only one case is suitable for collective I/O ++ * (1) request size <= big_req_size ++ * ++ * if (avg_req_size > big_req_size) { ++ * docollect = 0; ++ * } ++ */ ++ ++ int i, docollect = 1, lflag, big_req_size = 0; ++ ADIO_Offset req_size = 0, total_req_size; ++ int avg_req_size, total_access_count; ++ char *value = NULL; ++ ++ /* calculate total_req_size and total_access_count */ ++ for (i = 0; i < contig_access_count; i++) ++ req_size += len_list[i]; ++ MPI_Allreduce(&req_size, &total_req_size, 1, MPI_LONG_LONG_INT, MPI_SUM, ++ fd->comm); ++ MPI_Allreduce(&contig_access_count, &total_access_count, 1, MPI_INT, MPI_SUM, ++ fd->comm); ++ /* estimate average req_size */ ++ avg_req_size = (int)(total_req_size / total_access_count); ++ ++ /* get hint of hole_ratio */ ++ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); ++ MPI_Info_get(fd->info, "big_req_size", MPI_MAX_INFO_VAL, value, &lflag); ++ if (lflag) ++ big_req_size = atoi(value); ++ ++ if ((big_req_size > 0) && (avg_req_size > big_req_size)) ++ docollect = 0; ++ ++ ADIOI_Free(value); ++ ++ return docollect; ++} ++ ++void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ++ ADIO_Offset **offset_list_ptr, ++ int **len_list_ptr, ++ ADIO_Offset *start_offset_ptr, ++ ADIO_Offset *end_offset_ptr, ++ int *contig_access_count_ptr) ++{ ++ int filetype_size, buftype_size, etype_size; ++ int i, j, k, frd_size = 0, old_frd_size = 0, st_index = 0; ++ int n_filetypes, etype_in_filetype; ++ ADIO_Offset abs_off_in_filetype = 0; ++ int bufsize, sum, n_etypes_in_filetype, size_in_filetype; ++ int contig_access_count, *len_list, flag, filetype_is_contig; ++ MPI_Aint filetype_extent, filetype_lb; ++ ADIOI_Flatlist_node *flat_file; ++ ADIO_Offset *offset_list, off, end_offset = 0, disp; ++ ++ /* For this process's request, calculate the list of offsets and ++ lengths in the file and determine the start and end offsets. */ ++ ++ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); ++ ++ MPI_Type_size(fd->filetype, &filetype_size); ++ MPI_Type_extent(fd->filetype, &filetype_extent); ++ MPI_Type_lb(fd->filetype, &filetype_lb); ++ MPI_Type_size(datatype, &buftype_size); ++ etype_size = fd->etype_size; ++ ++ if (!filetype_size) { ++ *contig_access_count_ptr = 0; ++ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset)); ++ *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int)); ++ /* 2 is for consistency. everywhere I malloc one more than needed */ ++ ++ offset_list = *offset_list_ptr; ++ len_list = *len_list_ptr; ++ offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : ++ fd->disp + etype_size * offset; ++ len_list[0] = 0; ++ *start_offset_ptr = offset_list[0]; ++ *end_offset_ptr = offset_list[0] + len_list[0] - 1; ++ return; ++ } ++ ++ if (filetype_is_contig) { ++ *contig_access_count_ptr = 1; ++ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc(2*sizeof(ADIO_Offset)); ++ *len_list_ptr = (int *) ADIOI_Malloc(2 * sizeof(int)); ++ /* 2 is for consistency. everywhere I malloc one more than needed */ ++ ++ offset_list = *offset_list_ptr; ++ len_list = *len_list_ptr; ++ offset_list[0] = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : ++ fd->disp + etype_size * offset; ++ len_list[0] = bufcount * buftype_size; ++ *start_offset_ptr = offset_list[0]; ++ *end_offset_ptr = offset_list[0] + len_list[0] - 1; ++ ++ /* update file pointer */ ++ if (file_ptr_type == ADIO_INDIVIDUAL) ++ fd->fp_ind = *end_offset_ptr + 1; ++ } else { ++ /* First calculate what size of offset_list and len_list to allocate */ ++ /* filetype already flattened in ADIO_Open or ADIO_Fcntl */ ++ flat_file = ADIOI_Flatlist; ++ while (flat_file->type != fd->filetype) ++ flat_file = flat_file->next; ++ disp = fd->disp; ++ ++ if (file_ptr_type == ADIO_INDIVIDUAL) { ++ offset = fd->fp_ind; /* in bytes */ ++ n_filetypes = -1; ++ flag = 0; ++ while (!flag) { ++ n_filetypes++; ++ for (i = 0; i < flat_file->count; i++) { ++ if (disp + flat_file->indices[i] + ++ (ADIO_Offset) n_filetypes * filetype_extent + ++ flat_file->blocklens[i] >= offset) { ++ st_index = i; ++ frd_size = (int) (disp + flat_file->indices[i] + ++ (ADIO_Offset) n_filetypes * ++ filetype_extent + ++ flat_file->blocklens[i] - ++ offset); ++ flag = 1; ++ break; ++ } ++ } ++ } ++ } else { ++ n_etypes_in_filetype = filetype_size / etype_size; ++ n_filetypes = (int) (offset / n_etypes_in_filetype); ++ etype_in_filetype = (int) (offset % n_etypes_in_filetype); ++ size_in_filetype = etype_in_filetype * etype_size; ++ ++ sum = 0; ++ for (i = 0; i < flat_file->count; i++) { ++ sum += flat_file->blocklens[i]; ++ if (sum > size_in_filetype) { ++ st_index = i; ++ frd_size = sum - size_in_filetype; ++ abs_off_in_filetype = flat_file->indices[i] + ++ size_in_filetype - ++ (sum - flat_file->blocklens[i]); ++ break; ++ } ++ } ++ ++ /* abs. offset in bytes in the file */ ++ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + ++ abs_off_in_filetype; ++ } ++ ++ /* calculate how much space to allocate for offset_list, len_list */ ++ ++ old_frd_size = frd_size; ++ contig_access_count = i = 0; ++ j = st_index; ++ bufsize = buftype_size * bufcount; ++ frd_size = ADIOI_MIN(frd_size, bufsize); ++ while (i < bufsize) { ++ if (frd_size) ++ contig_access_count++; ++ i += frd_size; ++ j = (j + 1) % flat_file->count; ++ frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i); ++ } ++ ++ /* allocate space for offset_list and len_list */ ++ ++ *offset_list_ptr = (ADIO_Offset *) ADIOI_Malloc((contig_access_count+1) * ++ sizeof(ADIO_Offset)); ++ *len_list_ptr = (int *) ADIOI_Malloc((contig_access_count + 1) * ++ sizeof(int)); ++ /* +1 to avoid a 0-size malloc */ ++ ++ offset_list = *offset_list_ptr; ++ len_list = *len_list_ptr; ++ ++ /* find start offset, end offset, and fill in offset_list and len_list */ ++ ++ *start_offset_ptr = offset; /* calculated above */ ++ ++ i = k = 0; ++ j = st_index; ++ off = offset; ++ frd_size = ADIOI_MIN(old_frd_size, bufsize); ++ while (i < bufsize) { ++ if (frd_size) { ++ offset_list[k] = off; ++ len_list[k] = frd_size; ++ k++; ++ } ++ i += frd_size; ++ end_offset = off + frd_size - 1; ++ ++ /* Note: end_offset points to the last byte-offset that will be accessed. ++ e.g., if start_offset=0 and 100 bytes to be read, end_offset=99 */ ++ ++ if (off + frd_size < disp + flat_file->indices[j] + ++ flat_file->blocklens[j] + ++ (ADIO_Offset) n_filetypes * filetype_extent) { ++ off += frd_size; ++ /* did not reach end of contiguous block in filetype. ++ * no more I/O needed. off is incremented by frd_size. ++ */ ++ } else { ++ if (j < (flat_file->count - 1)) ++ j++; ++ else { ++ /* hit end of flattened filetype; ++ * start at beginning again ++ */ ++ j = 0; ++ n_filetypes++; ++ } ++ off = disp + flat_file->indices[j] + (ADIO_Offset) n_filetypes * ++ filetype_extent; ++ frd_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i); ++ } ++ } ++ ++ /* update file pointer */ ++ if (file_ptr_type == ADIO_INDIVIDUAL) ++ fd->fp_ind = off; ++ ++ *contig_access_count_ptr = contig_access_count; ++ *end_offset_ptr = end_offset; ++ } ++} ++ ++void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs, ++ int *count_my_req_per_proc, ++ ADIOI_Access * my_req, ++ int nprocs, int myrank, ++ ADIO_Offset start_offset, ++ ADIO_Offset end_offset, ++ int *striping_info, ++ int *count_others_req_procs_ptr, ++ ADIOI_Access ** others_req_ptr) ++{ ++ /* what requests of other processes will be written by this process */ ++ ++ int *count_others_req_per_proc, count_others_req_procs; ++ int i, j, lflag, samesize = 0, contiguous = 0; ++ MPI_Request *send_requests, *recv_requests; ++ MPI_Status *statuses; ++ ADIOI_Access *others_req; ++ char *value = NULL; ++ int proc, avail_nprocs, stripe_count, CO; ++ ADIO_Offset min_st_offset, off, req_len, avail_len, rem_len, *all_lens; ++ ++ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); ++ /* same io size */ ++ MPI_Info_get(fd->info, "same_io_size", MPI_MAX_INFO_VAL, value, &lflag); ++ if (lflag && !strcmp(value, "yes")) ++ samesize = 1; ++ /* contiguous data */ ++ MPI_Info_get(fd->info, "contiguous_data", MPI_MAX_INFO_VAL, value, &lflag); ++ if (lflag && !strcmp(value, "yes")) ++ contiguous = 1; ++ ++ *others_req_ptr = (ADIOI_Access *) ADIOI_Malloc(nprocs * ++ sizeof(ADIOI_Access)); ++ others_req = *others_req_ptr; ++ ++ /* if the data are contiguous, we don't need to do MPI_Alltoall */ ++ if (contiguous) { ++ stripe_count = striping_info[1]; ++ CO = striping_info[2]; ++ ++ for (i = 0; i < nprocs; i++) { ++ others_req[i].count = 0; ++ } ++ req_len = end_offset - start_offset + 1; ++ all_lens = (ADIO_Offset *) ADIOI_Malloc(nprocs * sizeof(ADIO_Offset)); ++ ++ if (samesize == 0) {/* different request size */ ++ /* calculate the min_st_offset */ ++ MPI_Allreduce(&start_offset, &min_st_offset, 1, MPI_LONG_LONG, ++ MPI_MIN, fd->comm); ++ /* exchange request length */ ++ MPI_Allgather(&req_len, 1, ADIO_OFFSET, all_lens, 1, ADIO_OFFSET, ++ fd->comm); ++ } else { /* same request size */ ++ /* calculate the min_st_offset */ ++ min_st_offset = start_offset - myrank * req_len; ++ /* assign request length to all_lens[] */ ++ for (i = 0; i < nprocs; i ++) ++ all_lens[i] = req_len; ++ } ++ avail_nprocs = ADIOI_MIN(nprocs, stripe_count * CO); ++ if (myrank < avail_nprocs) { ++ off = min_st_offset; ++ /* calcaulte other_req[i].count */ ++ for (i = 0; i < nprocs; i++) { ++ avail_len = all_lens[i]; ++ rem_len = avail_len; ++ while (rem_len > 0) { ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, ++ nprocs, striping_info); ++ if (proc == myrank) { ++ others_req[i].count ++; ++ } ++ off += avail_len; ++ rem_len -= avail_len; ++ avail_len = rem_len; ++ } ++ } ++ /* calculate offset and len for each request */ ++ off = min_st_offset; ++ for (i = 0; i < nprocs; i++) { ++ if (others_req[i].count) { ++ others_req[i].offsets = (ADIO_Offset *) ++ ADIOI_Malloc(others_req[i].count * ++ sizeof(ADIO_Offset)); ++ others_req[i].lens = (int *) ++ ADIOI_Malloc(others_req[i].count * ++ sizeof(int)); ++ others_req[i].mem_ptrs = (MPI_Aint *) ++ ADIOI_Malloc(others_req[i].count * ++ sizeof(MPI_Aint)); ++ } ++ j = 0; ++ avail_len = all_lens[i]; ++ rem_len = avail_len; ++ while (rem_len > 0) { ++ proc = ADIOI_LUSTRE_Calc_aggregator(fd, off, &avail_len, ++ nprocs, striping_info); ++ if (proc == myrank) { ++ others_req[i].offsets[j] = off; ++ others_req[i].lens[j] = (int)avail_len; ++ j ++; ++ } ++ off += avail_len; ++ rem_len -= avail_len; ++ avail_len = rem_len; ++ } ++ } ++ } ++ ADIOI_Free(value); ++ ADIOI_Free(all_lens); ++ } else { ++ /* multiple non-contiguous requests */ ++ /* first find out how much to send/recv and from/to whom */ ++ ++ /* ++ * count_others_req_procs: ++ * number of processes whose requests will be written by ++ * this process (including this process itself) ++ * count_others_req_per_proc[i]: ++ * how many separate contiguous requests of proc[i] will be ++ * written by this process. ++ */ ++ ++ count_others_req_per_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ ++ MPI_Alltoall(count_my_req_per_proc, 1, MPI_INT, ++ count_others_req_per_proc, 1, MPI_INT, fd->comm); ++ ++ count_others_req_procs = 0; ++ for (i = 0; i < nprocs; i++) { ++ if (count_others_req_per_proc[i]) { ++ others_req[i].count = count_others_req_per_proc[i]; ++ others_req[i].offsets = (ADIO_Offset *) ++ ADIOI_Malloc(others_req[i].count * ++ sizeof(ADIO_Offset)); ++ others_req[i].lens = (int *) ++ ADIOI_Malloc(others_req[i].count * ++ sizeof(int)); ++ others_req[i].mem_ptrs = (MPI_Aint *) ++ ADIOI_Malloc(others_req[i].count * ++ sizeof(MPI_Aint)); ++ count_others_req_procs++; ++ } else ++ others_req[i].count = 0; ++ } ++ ++ /* now send the calculated offsets and lengths to respective processes */ ++ ++ send_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_my_req_procs + 1) * ++ sizeof(MPI_Request)); ++ recv_requests = (MPI_Request *) ADIOI_Malloc(2 * (count_others_req_procs+1)* ++ sizeof(MPI_Request)); ++ /* +1 to avoid a 0-size malloc */ ++ ++ j = 0; ++ for (i = 0; i < nprocs; i++) { ++ if (others_req[i].count) { ++ MPI_Irecv(others_req[i].offsets, others_req[i].count, ++ ADIO_OFFSET, i, i + myrank, fd->comm, ++ &recv_requests[j]); ++ j++; ++ MPI_Irecv(others_req[i].lens, others_req[i].count, ++ MPI_INT, i, i + myrank + 1, fd->comm, ++ &recv_requests[j]); ++ j++; ++ } ++ } ++ ++ j = 0; ++ for (i = 0; i < nprocs; i++) { ++ if (my_req[i].count) { ++ MPI_Isend(my_req[i].offsets, my_req[i].count, ++ ADIO_OFFSET, i, i + myrank, fd->comm, ++ &send_requests[j]); ++ j++; ++ MPI_Isend(my_req[i].lens, my_req[i].count, ++ MPI_INT, i, i + myrank + 1, fd->comm, ++ &send_requests[j]); ++ j++; ++ } ++ } ++ ++ statuses = (MPI_Status *) ++ ADIOI_Malloc((1 + 2 * ADIOI_MAX(count_my_req_procs, ++ count_others_req_procs)) * ++ sizeof(MPI_Status)); ++ /* +1 to avoid a 0-size malloc */ ++ ++ MPI_Waitall(2 * count_my_req_procs, send_requests, statuses); ++ MPI_Waitall(2 * count_others_req_procs, recv_requests, statuses); ++ ++ ADIOI_Free(send_requests); ++ ADIOI_Free(recv_requests); ++ ADIOI_Free(statuses); ++ ADIOI_Free(count_others_req_per_proc); ++ ++ *count_others_req_procs_ptr = count_others_req_procs; ++ } ++} +diff -ruN ad_lustre_orig/ad_lustre.c ad_lustre/ad_lustre.c +--- ad_lustre_orig/ad_lustre.c 2008-09-17 14:36:57.000000000 +0800 ++++ ad_lustre/ad_lustre.c 2008-09-17 18:20:35.000000000 +0800 +@@ -1,9 +1,11 @@ + /* -*- Mode: C; c-basic-offset:4 ; -*- */ +-/* +- * Copyright (C) 2001 University of Chicago. ++/* ++ * Copyright (C) 2001 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + * Copyright (C) 2007 Oak Ridge National Laboratory ++ * ++ * Copyright (C) 2008 Sun Microsystems, Lustre group + */ + + #include "ad_lustre.h" +@@ -13,13 +15,13 @@ + ADIOI_LUSTRE_ReadContig, /* ReadContig */ + ADIOI_LUSTRE_WriteContig, /* WriteContig */ + ADIOI_GEN_ReadStridedColl, /* ReadStridedColl */ +- ADIOI_GEN_WriteStridedColl, /* WriteStridedColl */ ++ ADIOI_LUSTRE_WriteStridedColl, /* WriteStridedColl */ + ADIOI_GEN_SeekIndividual, /* SeekIndividual */ +- ADIOI_GEN_Fcntl, /* Fcntl */ ++ ADIOI_LUSTRE_Fcntl, /* Fcntl */ + ADIOI_LUSTRE_SetInfo, /* SetInfo */ + ADIOI_GEN_ReadStrided, /* ReadStrided */ +- ADIOI_GEN_WriteStrided, /* WriteStrided */ +- ADIOI_GEN_Close, /* Close */ ++ ADIOI_LUSTRE_WriteStrided, /* WriteStrided */ ++ ADIOI_LUSTRE_Close, /* Close */ + #if defined(ROMIO_HAVE_WORKING_AIO) && !defined(CRAY_XT_LUSTRE) + ADIOI_GEN_IreadContig, /* IreadContig */ + ADIOI_GEN_IwriteContig, /* IwriteContig */ +diff -ruN ad_lustre_orig/ad_lustre_close.c ad_lustre/ad_lustre_close.c +--- ad_lustre_orig/ad_lustre_close.c 1970-01-01 08:00:00.000000000 +0800 ++++ ad_lustre/ad_lustre_close.c 2008-09-17 18:20:35.000000000 +0800 +@@ -0,0 +1,42 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ * ++ * Copyright (C) 2007 Oak Ridge National Laboratory ++ * ++ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ */ ++ ++#include "ad_lustre.h" ++ ++#ifdef PROFILE ++#include "mpe.h" ++#endif ++ ++void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code) ++{ ++ int err, derr = 0; ++ static char myname[] = "ADIOI_LUSTRE_CLOSE"; ++ ++#ifdef PROFILE ++ MPE_Log_event(9, 0, "start close"); ++#endif ++ ++ err = close(fd->fd_sys); ++ ++#ifdef PROFILE ++ MPE_Log_event(10, 0, "end close"); ++#endif ++ ++ fd->fd_sys = -1; ++ ++ if (err == -1 || derr == -1) { ++ *error_code = ++ MPIO_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE, myname, ++ __LINE__, MPI_ERR_IO, "**io", "**io %s", ++ strerror(errno)); ++ } else ++ *error_code = MPI_SUCCESS; ++} +diff -ruN ad_lustre_orig/ad_lustre.h ad_lustre/ad_lustre.h +--- ad_lustre_orig/ad_lustre.h 2008-09-17 14:36:57.000000000 +0800 ++++ ad_lustre/ad_lustre.h 2008-09-17 18:20:35.000000000 +0800 +@@ -1,9 +1,11 @@ + /* -*- Mode: C; c-basic-offset:4 ; -*- */ +-/* +- * Copyright (C) 1997 University of Chicago. ++/* ++ * Copyright (C) 1997 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + * Copyright (C) 2007 Oak Ridge National Laboratory ++ * ++ * Copyright (C) 2008 Sun Microsystems, Lustre group + */ + + #ifndef AD_UNIX_INCLUDE +@@ -41,24 +43,62 @@ + + void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code); + void ADIOI_LUSTRE_Close(ADIO_File fd, int *error_code); +-void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, +- MPI_Datatype datatype, int file_ptr_type, +- ADIO_Offset offset, ADIO_Status *status, int +- *error_code); +-void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, +- MPI_Datatype datatype, int file_ptr_type, +- ADIO_Offset offset, ADIO_Status *status, int +- *error_code); ++void ADIOI_LUSTRE_ReadContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, ++ int *error_code); ++void ADIOI_LUSTRE_WriteContig(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, ++ int *error_code); ++void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, ++ int *error_code); + void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, +- MPI_Datatype datatype, int file_ptr_type, +- ADIO_Offset offset, ADIO_Status *status, int +- *error_code); ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, ++ int *error_code); + void ADIOI_LUSTRE_ReadStridedColl(ADIO_File fd, void *buf, int count, +- MPI_Datatype datatype, int file_ptr_type, +- ADIO_Offset offset, ADIO_Status *status, int +- *error_code); ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, ++ int *error_code); ++void ADIOI_LUSTRE_ReadStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status *status, ++ int *error_code); + void ADIOI_LUSTRE_Fcntl(ADIO_File fd, int flag, ADIO_Fcntl_t *fcntl_struct, + int *error_code); + void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code); +- ++void ADIOI_LUSTRE_Get_striping_info(ADIO_File fd, int ** striping_info_ptr, ++ int mode, int nprocs); ++int ADIOI_LUSTRE_Calc_aggregator(ADIO_File fd, ADIO_Offset off, ++ ADIO_Offset *len, int nprocs, ++ int *striping_info); ++void ADIOI_LUSTRE_Calc_my_req(ADIO_File fd, ADIO_Offset *offset_list, ++ int *len_list, int contig_access_count, ++ int *striping_info, int nprocs, ++ int *count_my_req_procs_ptr, ++ int **count_my_req_per_proc_ptr, ++ ADIOI_Access ** my_req_ptr, ++ int **buf_idx_ptr); ++int ADIOI_LUSTRE_Docollect(ADIO_File fd, int contig_access_count, ++ int *len_list, int nprocs); ++void ADIOI_LUSTRE_Calc_my_off_len(ADIO_File fd, int bufcount, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ++ ADIO_Offset **offset_list_ptr, ++ int **len_list_ptr, ++ ADIO_Offset *start_offset_ptr, ++ ADIO_Offset *end_offset_ptr, ++ int *contig_access_count_ptr); ++void ADIOI_LUSTRE_Calc_others_req(ADIO_File fd, int count_my_req_procs, ++ int *count_my_req_per_proc, ++ ADIOI_Access * my_req, ++ int nprocs, int myrank, ++ ADIO_Offset start_offset, ++ ADIO_Offset end_offset, ++ int *striping_info, ++ int *count_others_req_procs_ptr, ++ ADIOI_Access ** others_req_ptr); + #endif /* End of AD_UNIX_INCLUDE */ +diff -ruN ad_lustre_orig/ad_lustre_hints.c ad_lustre/ad_lustre_hints.c +--- ad_lustre_orig/ad_lustre_hints.c 2008-09-17 14:36:57.000000000 +0800 ++++ ad_lustre/ad_lustre_hints.c 2008-09-17 18:20:35.000000000 +0800 +@@ -1,9 +1,11 @@ + /* -*- Mode: C; c-basic-offset:4 ; -*- */ +-/* +- * Copyright (C) 1997 University of Chicago. ++/* ++ * Copyright (C) 1997 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + * Copyright (C) 2007 Oak Ridge National Laboratory ++ * ++ * Copyright (C) 2008 Sun Microsystems, Lustre group + */ + + #include "ad_lustre.h" +@@ -11,130 +13,162 @@ + + void ADIOI_LUSTRE_SetInfo(ADIO_File fd, MPI_Info users_info, int *error_code) + { +- char *value, *value_in_fd; +- int flag, tmp_val[3], str_factor=-1, str_unit=0, start_iodev=-1; +- struct lov_user_md lum = { 0 }; +- int err, myrank, fd_sys, perm, amode, old_mask; ++ char *value = NULL; ++ int flag, tmp_val, int_val, str_factor, str_unit, start_iodev; ++ static char myname[] = "ADIOI_LUSTRE_SETINFO"; + +- value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); + if ( (fd->info) == MPI_INFO_NULL) { +- /* This must be part of the open call. can set striping parameters +- if necessary. */ ++ /* This must be part of the open call. can set striping parameters ++ if necessary. */ + MPI_Info_create(&(fd->info)); + + MPI_Info_set(fd->info, "direct_read", "false"); + MPI_Info_set(fd->info, "direct_write", "false"); + fd->direct_read = fd->direct_write = 0; +- +- /* has user specified striping or server buffering parameters ++ ++ /* has user specified striping or server buffering parameters + and do they have the same value on all processes? */ + if (users_info != MPI_INFO_NULL) { +- MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, +- value, &flag); +- if (flag) +- str_unit=atoi(value); +- +- MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, +- value, &flag); +- if (flag) +- str_factor=atoi(value); ++ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); + +- MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, ++ /* direct read and write */ ++ MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, + value, &flag); +- if (flag) +- start_iodev=atoi(value); +- +- MPI_Info_get(users_info, "direct_read", MPI_MAX_INFO_VAL, +- value, &flag); + if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) { + MPI_Info_set(fd->info, "direct_read", "true"); + fd->direct_read = 1; + } + +- MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, ++ MPI_Info_get(users_info, "direct_write", MPI_MAX_INFO_VAL, + value, &flag); + if (flag && (!strcmp(value, "true") || !strcmp(value, "TRUE"))) { + MPI_Info_set(fd->info, "direct_write", "true"); + fd->direct_write = 1; + } +- } + +- MPI_Comm_rank(fd->comm, &myrank); +- if (myrank == 0) { +- tmp_val[0] = str_factor; +- tmp_val[1] = str_unit; +- tmp_val[2] = start_iodev; +- } +- MPI_Bcast(tmp_val, 3, MPI_INT, 0, fd->comm); +- +- if (tmp_val[0] != str_factor +- || tmp_val[1] != str_unit +- || tmp_val[2] != start_iodev) { +- FPRINTF(stderr, "ADIOI_LUSTRE_SetInfo: All keys" +- "-striping_factor:striping_unit:start_iodevice " +- "need to be identical across all processes\n"); +- MPI_Abort(MPI_COMM_WORLD, 1); +- } else if ((str_factor > 0) || (str_unit > 0) || (start_iodev >= 0)) { +- /* if user has specified striping info, process 0 tries to set it */ +- if (!myrank) { +- if (fd->perm == ADIO_PERM_NULL) { +- old_mask = umask(022); +- umask(old_mask); +- perm = old_mask ^ 0666; +- } +- else perm = fd->perm; +- +- amode = 0; +- if (fd->access_mode & ADIO_CREATE) +- amode = amode | O_CREAT; +- if (fd->access_mode & ADIO_RDONLY) +- amode = amode | O_RDONLY; +- if (fd->access_mode & ADIO_WRONLY) +- amode = amode | O_WRONLY; +- if (fd->access_mode & ADIO_RDWR) +- amode = amode | O_RDWR; +- if (fd->access_mode & ADIO_EXCL) +- amode = amode | O_EXCL; +- +- /* we need to create file so ensure this is set */ +- amode = amode | O_LOV_DELAY_CREATE | O_CREAT; +- +- fd_sys = open(fd->filename, amode, perm); +- if (fd_sys == -1) { +- if (errno != EEXIST) +- fprintf(stderr, +- "Failure to open file %s %d %d\n",strerror(errno), amode, perm); +- } else { +- lum.lmm_magic = LOV_USER_MAGIC; +- lum.lmm_pattern = 0; +- lum.lmm_stripe_size = str_unit; +- lum.lmm_stripe_count = str_factor; +- lum.lmm_stripe_offset = start_iodev; +- +- err = ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum); +- if (err == -1 && errno != EEXIST) { +- fprintf(stderr, "Failure to set stripe info %s \n", strerror(errno)); +- } +- close(fd_sys); +- } +- } /* End of striping parameters validation */ ++ /* stripe size */ ++ MPI_Info_get(users_info, "striping_unit", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (str_unit = atoi(value))) { ++ tmp_val = str_unit; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != str_unit) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "striping_unit", ++ error_code); ++ return; ++ } ++ MPI_Info_set(fd->info, "striping_unit", value); ++ } ++ /* stripe count */ ++ MPI_Info_get(users_info, "striping_factor", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (str_factor = atoi(value))) { ++ tmp_val = str_factor; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != str_factor) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "striping_factor", ++ error_code); ++ return; ++ } ++ MPI_Info_set(fd->info, "striping_factor", value); ++ } ++ /* stripe offset */ ++ MPI_Info_get(users_info, "start_iodevice", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && ((start_iodev = atoi(value)) >= 0)) { ++ tmp_val = start_iodev; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != start_iodev) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "start_iodevice", ++ error_code); ++ return; ++ } ++ MPI_Info_set(fd->info, "start_iodevice", value); ++ } ++ /* CO */ ++ MPI_Info_get(users_info, "CO", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag && (int_val = atoi(value)) > 0) { ++ tmp_val = int_val; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "CO", ++ error_code); ++ return; ++ } ++ MPI_Info_set(fd->info, "CO", value); ++ } ++ /* big_req_size */ ++ MPI_Info_get(users_info, "big_req_size", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag && (int_val = atoi(value)) > 0) { ++ tmp_val = int_val; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "big_req_size", ++ error_code); ++ return; ++ } ++ MPI_Info_set(fd->info, "big_req_size", value); ++ } ++ /* hint for disabling data sieving when do collective IO */ ++ MPI_Info_get(users_info, "ds_in_coll", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (!strcmp(value, "enable") || ++ !strcmp(value, "ENABLE"))) { ++ tmp_val = int_val = 1; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "ds_in_coll", ++ error_code); ++ return; ++ } ++ MPI_Info_set(fd->info, "ds_in_coll", "enable"); ++ } ++ /* same io size */ ++ MPI_Info_get(users_info, "same_io_size", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (!strcmp(value, "yes") || ++ !strcmp(value, "YES"))) { ++ tmp_val = int_val = 1; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "same_io_size", ++ error_code); ++ return; ++ } ++ MPI_Info_set(fd->info, "same_io_size", "yes"); ++ } ++ /* contiguous data */ ++ MPI_Info_get(users_info, "contiguous_data", MPI_MAX_INFO_VAL, ++ value, &flag); ++ if (flag && (!strcmp(value, "yes") || ++ !strcmp(value, "YES"))) { ++ tmp_val = int_val = 1; ++ MPI_Bcast(&tmp_val, 1, MPI_INT, 0, fd->comm); ++ if (tmp_val != int_val) { ++ MPIO_ERR_CREATE_CODE_INFO_NOT_SAME(myname, ++ "contiguous_data", ++ error_code); ++ return; ++ } ++ MPI_Info_set(fd->info, "contiguous_data", "yes"); ++ } ++ ADIOI_Free(value); + } +- +- MPI_Barrier(fd->comm); +- /* set the values for collective I/O and data sieving parameters */ +- ADIOI_GEN_SetInfo(fd, users_info, error_code); +- } else { +- /* The file has been opened previously and fd->fd_sys is a valid +- file descriptor. cannot set striping parameters now. */ +- +- /* set the values for collective I/O and data sieving parameters */ +- ADIOI_GEN_SetInfo(fd, users_info, error_code); + } +- ++ /* set the values for collective I/O and data sieving parameters */ ++ ADIOI_GEN_SetInfo(fd, users_info, error_code); ++ + if (ADIOI_Direct_read) fd->direct_read = 1; + if (ADIOI_Direct_write) fd->direct_write = 1; + +- ADIOI_Free(value); +- + *error_code = MPI_SUCCESS; + } +diff -ruN ad_lustre_orig/ad_lustre_open.c ad_lustre/ad_lustre_open.c +--- ad_lustre_orig/ad_lustre_open.c 2008-09-17 14:36:57.000000000 +0800 ++++ ad_lustre/ad_lustre_open.c 2008-09-17 18:55:50.000000000 +0800 +@@ -1,18 +1,21 @@ + /* -*- Mode: C; c-basic-offset:4 ; -*- */ +-/* +- * Copyright (C) 1997 University of Chicago. ++/* ++ * Copyright (C) 1997 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + * Copyright (C) 2007 Oak Ridge National Laboratory ++ * ++ * Copyright (C) 2008 Sun Microsystems, Lustre group + */ + + #include "ad_lustre.h" + + void ADIOI_LUSTRE_Open(ADIO_File fd, int *error_code) + { +- int perm, old_mask, amode, amode_direct; ++ int perm, old_mask, amode = 0, amode_direct = 0, flag = 0, err, myrank; ++ int stripe_size = 0, stripe_count = 0, stripe_offset = -1; + struct lov_user_md lum = { 0 }; +- char *value; ++ char *value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); + + #if defined(MPICH2) || !defined(PRINT_ERR_MSG) + static char myname[] = "ADIOI_LUSTRE_OPEN"; +@@ -22,12 +25,57 @@ + old_mask = umask(022); + umask(old_mask); + perm = old_mask ^ 0666; +- } +- else perm = fd->perm; ++ } else ++ perm = fd->perm; + +- amode = 0; +- if (fd->access_mode & ADIO_CREATE) ++ if (fd->access_mode & ADIO_CREATE) { + amode = amode | O_CREAT; ++ /* Check striping info ++ * if already set by SetInfo(), set them to lum; otherwise, set by lum ++ */ ++ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag) ++ stripe_size = atoi(value); ++ ++ MPI_Info_get(fd->info, "striping_factor", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag) ++ stripe_count = atoi(value); ++ ++ MPI_Info_get(fd->info, "start_iodevice", MPI_MAX_INFO_VAL, value, ++ &flag); ++ if (flag) ++ stripe_offset = atoi(value); ++ ++ /* if user has specified striping info, ++ * process 0 will try to check and set it. ++ */ ++ if ((stripe_size > 0) || (stripe_count > 0) || (stripe_offset >= 0)) { ++ MPI_Comm_rank(fd->comm, &myrank); ++ if (myrank == 0) { ++ int fd_sys = open(fd->filename, amode, perm); ++ if (fd_sys == -1) { ++ if (errno != EEXIST) ++ FPRINTF(stderr, "Failure to open file %s %d %d\n", ++ strerror(errno), amode, perm); ++ } else { ++ lum.lmm_magic = LOV_USER_MAGIC; ++ lum.lmm_pattern = 1; ++ lum.lmm_stripe_size = stripe_size; ++ lum.lmm_stripe_count = stripe_count; ++ lum.lmm_stripe_offset = stripe_offset; ++ ++ if (ioctl(fd_sys, LL_IOC_LOV_SETSTRIPE, &lum)) ++ FPRINTF(stderr, ++ "Failure to set striping info to Lustre!\n"); ++ close(fd_sys); ++ } ++ } ++ MPI_Barrier(fd->comm); ++ } ++ } ++ + if (fd->access_mode & ADIO_RDONLY) + amode = amode | O_RDONLY; + if (fd->access_mode & ADIO_WRONLY) +@@ -42,32 +90,36 @@ + fd->fd_sys = open(fd->filename, amode|O_CREAT, perm); + + if (fd->fd_sys != -1) { +- int err; +- +- value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL+1)*sizeof(char)); +- + /* get file striping information and set it in info */ +- lum.lmm_magic = LOV_USER_MAGIC; +- err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); +- +- if (!err) { +- sprintf(value, "%d", lum.lmm_stripe_size); +- MPI_Info_set(fd->info, "striping_unit", value); +- +- sprintf(value, "%d", lum.lmm_stripe_count); +- MPI_Info_set(fd->info, "striping_factor", value); +- +- sprintf(value, "%d", lum.lmm_stripe_offset); +- MPI_Info_set(fd->info, "start_iodevice", value); +- } +- ADIOI_Free(value); ++ lum.lmm_magic = LOV_USER_MAGIC; ++ err = ioctl(fd->fd_sys, LL_IOC_LOV_GETSTRIPE, (void *) &lum); + ++ if (!err) { ++ if (lum.lmm_stripe_size && lum.lmm_stripe_count && ++ (lum.lmm_stripe_offset >= 0)) { ++ sprintf(value, "%d", lum.lmm_stripe_size); ++ MPI_Info_set(fd->info, "striping_unit", value); ++ ++ sprintf(value, "%d", lum.lmm_stripe_count); ++ MPI_Info_set(fd->info, "striping_factor", value); ++ ++ sprintf(value, "%d", lum.lmm_stripe_offset); ++ MPI_Info_set(fd->info, "start_iodevice", value); ++ } else { ++ FPRINTF(stderr, "Striping info is invalid!\n"); ++ ADIOI_Free(value); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } ++ } else { ++ FPRINTF(stderr, "Failed to get striping info from Lustre!\n"); ++ ADIOI_Free(value); ++ MPI_Abort(MPI_COMM_WORLD, 1); ++ } + if (fd->access_mode & ADIO_APPEND) + fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); +- } +- ++ } + if ((fd->fd_sys != -1) && (fd->access_mode & ADIO_APPEND)) +- fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); ++ fd->fp_ind = fd->fp_sys_posn = lseek(fd->fd_sys, 0, SEEK_END); + + fd->fd_direct = -1; + if (fd->direct_write || fd->direct_read) { +@@ -81,20 +133,22 @@ + } + + /* --BEGIN ERROR HANDLING-- */ +- if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && +- (fd->direct_write || fd->direct_read))) { ++ if (fd->fd_sys == -1 || ((fd->fd_direct == -1) && ++ (fd->direct_write || fd->direct_read))) { + if (errno == ENAMETOOLONG) + *error_code = MPIO_Err_create_code(MPI_SUCCESS, +- MPIR_ERR_RECOVERABLE, myname, +- __LINE__, MPI_ERR_BAD_FILE, ++ MPIR_ERR_RECOVERABLE, ++ myname, __LINE__, ++ MPI_ERR_BAD_FILE, + "**filenamelong", + "**filenamelong %s %d", + fd->filename, + strlen(fd->filename)); + else if (errno == ENOENT) + *error_code = MPIO_Err_create_code(MPI_SUCCESS, +- MPIR_ERR_RECOVERABLE, myname, +- __LINE__, MPI_ERR_NO_SUCH_FILE, ++ MPIR_ERR_RECOVERABLE, ++ myname, __LINE__, ++ MPI_ERR_NO_SUCH_FILE, + "**filenoexist", + "**filenoexist %s", + fd->filename); +@@ -108,27 +162,30 @@ + fd->filename); + else if (errno == EACCES) { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, +- MPIR_ERR_RECOVERABLE, myname, +- __LINE__, MPI_ERR_ACCESS, ++ MPIR_ERR_RECOVERABLE, ++ myname, __LINE__, ++ MPI_ERR_ACCESS, + "**fileaccess", +- "**fileaccess %s", +- fd->filename ); +- } +- else if (errno == EROFS) { ++ "**fileaccess %s", ++ fd->filename); ++ } else if (errno == EROFS) { + /* Read only file or file system and write access requested */ + *error_code = MPIO_Err_create_code(MPI_SUCCESS, +- MPIR_ERR_RECOVERABLE, myname, +- __LINE__, MPI_ERR_READ_ONLY, +- "**ioneedrd", 0 ); +- } +- else { ++ MPIR_ERR_RECOVERABLE, ++ myname, __LINE__, ++ MPI_ERR_READ_ONLY, ++ "**ioneedrd", 0); ++ } else { + *error_code = MPIO_Err_create_code(MPI_SUCCESS, +- MPIR_ERR_RECOVERABLE, myname, +- __LINE__, MPI_ERR_IO, "**io", ++ MPIR_ERR_RECOVERABLE, ++ myname, __LINE__, ++ MPI_ERR_IO, "**io", + "**io %s", strerror(errno)); + } +- } ++ } else { + /* --END ERROR HANDLING-- */ +- else *error_code = MPI_SUCCESS; ++ *error_code = MPI_SUCCESS; ++ } + ++ ADIOI_Free(value); + } +diff -ruN ad_lustre_orig/ad_lustre_rwcontig.c ad_lustre/ad_lustre_rwcontig.c +--- ad_lustre_orig/ad_lustre_rwcontig.c 2008-09-17 14:36:57.000000000 +0800 ++++ ad_lustre/ad_lustre_rwcontig.c 2008-09-17 18:52:01.000000000 +0800 +@@ -1,9 +1,11 @@ + /* -*- Mode: C; c-basic-offset:4 ; -*- */ +-/* +- * Copyright (C) 1997 University of Chicago. ++/* ++ * Copyright (C) 1997 University of Chicago. + * See COPYRIGHT notice in top-level directory. + * + * Copyright (C) 2007 Oak Ridge National Laboratory ++ * ++ * Copyright (C) 2008 Sun Microsystems, Lustre group + */ + + #define _XOPEN_SOURCE 600 +@@ -138,7 +140,7 @@ + + if (io_mode) + err = write(fd->fd_sys, buf, len); +- else ++ else + err = read(fd->fd_sys, buf, len); + } else { + err = ADIOI_LUSTRE_Directio(fd, buf, len, offset, io_mode); +diff -ruN ad_lustre_orig/ad_lustre_wrcoll.c ad_lustre/ad_lustre_wrcoll.c +--- ad_lustre_orig/ad_lustre_wrcoll.c 1970-01-01 08:00:00.000000000 +0800 ++++ ad_lustre/ad_lustre_wrcoll.c 2008-09-17 18:20:35.000000000 +0800 +@@ -0,0 +1,973 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ * ++ * Copyright (C) 2007 Oak Ridge National Laboratory ++ * ++ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ */ ++ ++#include "ad_lustre.h" ++#include "adio_extern.h" ++ ++/* prototypes of functions used for collective writes only. */ ++static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf, ++ MPI_Datatype datatype, int nprocs, ++ int myrank, ++ ADIOI_Access *others_req, ++ ADIOI_Access *my_req, ++ ADIO_Offset *offset_list, ++ int *len_list, ++ int contig_access_count, ++ int * striping_info, ++ int *buf_idx, int *error_code); ++static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf, ++ ADIOI_Flatlist_node * flat_buf, ++ char **send_buf, ++ ADIO_Offset * offset_list, ++ int *len_list, int *send_size, ++ MPI_Request * requests, ++ int *sent_to_proc, int nprocs, ++ int myrank, int contig_access_count, ++ int * striping_info, ++ int *send_buf_idx, ++ int *curr_to_proc, ++ int *done_to_proc, int iter, ++ MPI_Aint buftype_extent); ++static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf, ++ char *write_buf, ++ ADIOI_Flatlist_node * flat_buf, ++ ADIO_Offset * offset_list, ++ int *len_list, int *send_size, ++ int *recv_size, ADIO_Offset off, ++ int size, int *count, ++ int *start_pos, int *partial_recv, ++ int *sent_to_proc, int nprocs, ++ int myrank, int buftype_is_contig, ++ int contig_access_count, ++ int * striping_info, ++ ADIOI_Access * others_req, ++ int *send_buf_idx, ++ int *curr_to_proc, ++ int *done_to_proc, int *hole, ++ int iter, MPI_Aint buftype_extent, ++ int *buf_idx, int *error_code); ++static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count, ++ ADIO_Offset * srt_off, int *srt_len, ++ int *start_pos, int nprocs, int nprocs_recv, ++ int total_elements); ++ ++void ADIOI_LUSTRE_WriteStridedColl(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, ++ int file_ptr_type, ADIO_Offset offset, ++ ADIO_Status * status, int *error_code) ++{ ++ ADIOI_Access *my_req; ++ /* array of nprocs access structures, one for each other process has ++ this process's request */ ++ ++ ADIOI_Access *others_req; ++ /* array of nprocs access structures, one for each other process ++ whose request is written by this process. */ ++ ++ int i, filetype_is_contig, nprocs, nprocs_for_coll, myrank, do_collect = 0; ++ int contig_access_count = 0, buftype_is_contig; ++ int *count_my_req_per_proc, count_my_req_procs, count_others_req_procs; ++ ADIO_Offset orig_fp, start_offset, end_offset, off, *offset_list = NULL; ++ int *buf_idx = NULL, *len_list = NULL, *striping_info = NULL; ++ int old_error, tmp_error; ++ ++ MPI_Comm_size(fd->comm, &nprocs); ++ MPI_Comm_rank(fd->comm, &myrank); ++ ++ nprocs_for_coll = fd->hints->cb_nodes; ++ orig_fp = fd->fp_ind; ++ ++ /* IO patten identification if cb_write isn't disabled */ ++ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { ++ /* For this process's request, calculate the list of offsets and ++ lengths in the file and determine the start and end offsets. */ ++ ADIOI_LUSTRE_Calc_my_off_len(fd, count, datatype, file_ptr_type, offset, ++ &offset_list, &len_list, &start_offset, ++ &end_offset, &contig_access_count); ++ /* Get striping information */ ++ ADIOI_LUSTRE_Get_striping_info(fd, &striping_info, 1, nprocs); ++ /* check if the access pattern can benefit from collective write */ ++ do_collect = ADIOI_LUSTRE_Docollect(fd, contig_access_count, ++ len_list, nprocs); ++ } ++ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ++ ++ /* Decide if collective I/O should be done */ ++ if ((!do_collect && fd->hints->cb_write == ADIOI_HINT_AUTO) || ++ fd->hints->cb_write == ADIOI_HINT_DISABLE) { ++ ++ int filerange_is_contig = 0; ++ ++ /* use independent accesses */ ++ if (fd->hints->cb_write != ADIOI_HINT_DISABLE) { ++ ADIOI_Free(offset_list); ++ ADIOI_Free(len_list); ++ } ++ ++ fd->fp_ind = orig_fp; ++ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); ++ if (buftype_is_contig && filetype_is_contig) { ++ if (file_ptr_type == ADIO_EXPLICIT_OFFSET) { ++ off = fd->disp + (fd->etype_size) * offset; ++ ADIO_WriteContig(fd, buf, count, datatype, ++ ADIO_EXPLICIT_OFFSET, ++ off, status, error_code); ++ } else ++ ADIO_WriteContig(fd, buf, count, datatype, ADIO_INDIVIDUAL, ++ 0, status, error_code); ++ } else { ++ ADIO_WriteStrided(fd, buf, count, datatype, file_ptr_type, ++ offset, status, error_code); ++ } ++ return; ++ } ++ ++ /* calculate what portions of the access requests of this process are ++ * located in which process ++ */ ++ ADIOI_LUSTRE_Calc_my_req(fd, offset_list, len_list, contig_access_count, ++ striping_info, nprocs, &count_my_req_procs, ++ &count_my_req_per_proc, &my_req, &buf_idx); ++ /* calculate what process's requests will be written by this process */ ++ ADIOI_LUSTRE_Calc_others_req(fd, count_my_req_procs, ++ count_my_req_per_proc, ++ my_req, nprocs, myrank, ++ start_offset, end_offset, striping_info, ++ &count_others_req_procs, &others_req); ++ ADIOI_Free(count_my_req_per_proc); ++ ++ /* exchange data and write in sizes of no more than stripe_size. */ ++ ADIOI_LUSTRE_Exch_and_write(fd, buf, datatype, nprocs, myrank, ++ others_req, my_req, ++ offset_list, len_list, contig_access_count, ++ striping_info, buf_idx, error_code); ++ ++ old_error = *error_code; ++ if (*error_code != MPI_SUCCESS) ++ *error_code = MPI_ERR_IO; ++ ++ /* optimization: if only one process performing i/o, we can perform ++ * a less-expensive Bcast */ ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_postwrite_a, 0, NULL); ++#endif ++ if (fd->hints->cb_nodes == 1) ++ MPI_Bcast(error_code, 1, MPI_INT, ++ fd->hints->ranklist[0], fd->comm); ++ else { ++ tmp_error = *error_code; ++ MPI_Allreduce(&tmp_error, error_code, 1, MPI_INT, ++ MPI_MAX, fd->comm); ++ } ++#ifdef ADIOI_MPE_LOGGING ++ MPE_Log_event(ADIOI_MPE_postwrite_b, 0, NULL); ++#endif ++ ++ if ((old_error != MPI_SUCCESS) && (old_error != MPI_ERR_IO)) ++ *error_code = old_error; ++ ++ ++ if (!buftype_is_contig) ++ ADIOI_Delete_flattened(datatype); ++ ++ /* free all memory allocated for collective I/O */ ++ /* free others_req */ ++ for (i = 0; i < nprocs; i++) { ++ if (others_req[i].count) { ++ ADIOI_Free(others_req[i].offsets); ++ ADIOI_Free(others_req[i].lens); ++ ADIOI_Free(others_req[i].mem_ptrs); ++ } ++ } ++ ADIOI_Free(others_req); ++ /* free my_req here */ ++ for (i = 0; i < nprocs; i++) { ++ if (my_req[i].count) { ++ ADIOI_Free(my_req[i].offsets); ++ ADIOI_Free(my_req[i].lens); ++ } ++ } ++ ADIOI_Free(my_req); ++ ADIOI_Free(buf_idx); ++ ADIOI_Free(offset_list); ++ ADIOI_Free(len_list); ++ ADIOI_Free(striping_info); ++ ++#ifdef HAVE_STATUS_SET_BYTES ++ if (status) { ++ int bufsize, size; ++ /* Don't set status if it isn't needed */ ++ MPI_Type_size(datatype, &size); ++ bufsize = size * count; ++ MPIR_Status_set_bytes(status, datatype, bufsize); ++ } ++ /* This is a temporary way of filling in status. The right way is to ++ * keep track of how much data was actually written during collective I/O. ++ */ ++#endif ++ ++ fd->fp_sys_posn = -1; /* set it to null. */ ++} ++ ++static void ADIOI_LUSTRE_Exch_and_write(ADIO_File fd, void *buf, ++ MPI_Datatype datatype, int nprocs, ++ int myrank, ADIOI_Access *others_req, ++ ADIOI_Access *my_req, ++ ADIO_Offset *offset_list, ++ int *len_list, int contig_access_count, ++ int *striping_info, int *buf_idx, ++ int *error_code) ++{ ++ int hole, i, j, m, flag, ntimes = 1 , max_ntimes, buftype_is_contig; ++ ADIO_Offset st_loc = -1, end_loc = -1, min_st_loc, max_end_loc; ++ ADIO_Offset off, req_off, send_off, iter_st_off, *off_list; ++ ADIO_Offset max_size, step_size = 0; ++ int real_size, req_len, send_len; ++ int *recv_curr_offlen_ptr, *recv_count, *recv_size; ++ int *send_curr_offlen_ptr, *send_size; ++ int *partial_recv, *sent_to_proc, *recv_start_pos; ++ int *send_buf_idx, *curr_to_proc, *done_to_proc; ++ char *write_buf = NULL, *value; ++ MPI_Status status; ++ ADIOI_Flatlist_node *flat_buf = NULL; ++ MPI_Aint buftype_extent; ++ int stripe_size = striping_info[0], lflag, data_sieving = 0; ++ int stripe_count = striping_info[1], CO = striping_info[2]; ++ /* IO step size in each communication */ ++ static char myname[] = "ADIOI_EXCH_AND_WRITE"; ++ ++ *error_code = MPI_SUCCESS; /* changed below if error */ ++ ++ /* calculate the number of writes of stripe size ++ * to be done by each process and the max among all processes. ++ * That gives the no. of communication phases as well. ++ */ ++ ++ for (i = 0; i < nprocs; i++) { ++ if (others_req[i].count) { ++ st_loc = others_req[i].offsets[0]; ++ end_loc = others_req[i].offsets[0]; ++ break; ++ } ++ } ++ ++ for (i = 0; i < nprocs; i++) { ++ for (j = 0; j < others_req[i].count; j++) { ++ st_loc = ADIOI_MIN(st_loc, others_req[i].offsets[j]); ++ end_loc = ADIOI_MAX(end_loc, (others_req[i].offsets[j] + ++ others_req[i].lens[j] - 1)); ++ } ++ } ++ /* this process does no writing. */ ++ if ((st_loc == -1) && (end_loc == -1)) ++ ntimes = 0; ++ MPI_Allreduce(&end_loc, &max_end_loc, 1, MPI_LONG_LONG_INT, MPI_MAX, fd->comm); ++ /* avoid min_st_loc be -1 */ ++ if (st_loc == -1) ++ st_loc = max_end_loc; ++ MPI_Allreduce(&st_loc, &min_st_loc, 1, MPI_LONG_LONG_INT, MPI_MIN, fd->comm); ++ /* align downward */ ++ min_st_loc -= min_st_loc % (ADIO_Offset)stripe_size; ++ /* when nprocs < stripe_count, there will be trouble, because some client ++ * would access more than one OST in one whole communication. ++ */ ++ step_size = (ADIO_Offset)ADIOI_MIN(nprocs, stripe_count * CO) * stripe_size; ++ max_ntimes = (int)((max_end_loc - min_st_loc) / step_size + 1); ++ if (ntimes) ++ write_buf = (char *) ADIOI_Malloc(stripe_size); ++ ++ /* calculate the start offset for each iteration */ ++ off_list = (ADIO_Offset *) ADIOI_Malloc(max_ntimes * sizeof(ADIO_Offset)); ++ for (m = 0; m < max_ntimes; m ++) ++ off_list[m] = max_end_loc; ++ for (i = 0; i < nprocs; i++) { ++ for (j = 0; j < others_req[i].count; j ++) { ++ req_off = others_req[i].offsets[j]; ++ //m = (req_off - min_st_loc) / (stripe_size * stripe_count * CO); ++ m = (int)((req_off - min_st_loc) / step_size); ++ off_list[m] = ADIOI_MIN(off_list[m], req_off); ++ } ++ } ++ ++ recv_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); ++ send_curr_offlen_ptr = (int *) ADIOI_Calloc(nprocs, sizeof(int)); ++ /* their use is explained below. calloc initializes to 0. */ ++ ++ recv_count = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ /* to store count of how many off-len pairs per proc are satisfied ++ in an iteration. */ ++ ++ send_size = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ /* total size of data to be sent to each proc. in an iteration. ++ Of size nprocs so that I can use MPI_Alltoall later. */ ++ ++ recv_size = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ /* total size of data to be recd. from each proc. in an iteration. */ ++ ++ sent_to_proc = (int *) ADIOI_Calloc(nprocs, sizeof(int)); ++ /* amount of data sent to each proc so far. Used in ++ ADIOI_Fill_send_buffer. initialized to 0 here. */ ++ ++ send_buf_idx = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ curr_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ done_to_proc = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ /* Above three are used in ADIOI_Fill_send_buffer */ ++ ++ recv_start_pos = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ /* used to store the starting value of recv_curr_offlen_ptr[i] in ++ this iteration */ ++ ++ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ++ if (!buftype_is_contig) { ++ ADIOI_Flatten_datatype(datatype); ++ flat_buf = ADIOI_Flatlist; ++ while (flat_buf->type != datatype) ++ flat_buf = flat_buf->next; ++ } ++ MPI_Type_extent(datatype, &buftype_extent); ++ ++ iter_st_off = min_st_loc; ++ ++ /* Although we have recognized the data according to OST index, ++ * a read-modify-write will be done if there is a hole between the data. ++ * For example: if blocksize=60, transfersize=30 and stripe_size=100, ++ * then process0 will collect data [0, 30] and [60, 90] then write. There ++ * is a hole [30, 60], which will cause a read-modify-write in [0, 90]. ++ * It will degrade collective performance. ++ * So we disable data sieving by default unless the hint "ds_in_coll" ++ * is set to "enable". ++ */ ++ /* check the hint for data sieving */ ++ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); ++ MPI_Info_get(fd->info, "ds_in_coll", MPI_MAX_INFO_VAL, value, &lflag); ++ if (lflag && !strcmp(value, "enable")) ++ data_sieving = 1; ++ ADIOI_Free(value); ++ ++ for (m = 0; m < max_ntimes; m++) { ++ /* go through all others_req and my_req to check which will be received ++ * and sent in this iteration. ++ */ ++ ++ /* Note that MPI guarantees that displacements in filetypes are in ++ monotonically nondecreasing order and that, for writes, the ++ filetypes cannot specify overlapping regions in the file. This ++ simplifies implementation a bit compared to reads. */ ++ ++ /* ++ off = start offset in the file for the data to be written in ++ this iteration ++ iter_st_off = start offset of this iteration ++ real_size = size of data written (bytes) corresponding to off ++ max_size = possible maximum size of data written in this iteration ++ req_off = offset in the file for a particular contiguous request minus ++ what was satisfied in previous iteration ++ send_off = offset the request needed by other processes in this iteration ++ req_len = size corresponding to req_off ++ send_len = size corresponding to send_off ++ */ ++ ++ /* first calculate what should be communicated */ ++ for (i = 0; i < nprocs; i++) ++ recv_count[i] = recv_size[i] = send_size[i] = 0; ++ ++ off = off_list[m]; ++ max_size = ADIOI_MIN(step_size, max_end_loc - iter_st_off + 1); ++ real_size = (int) ADIOI_MIN((off / stripe_size + 1) * stripe_size - off, ++ end_loc - off + 1); ++ ++ for (i = 0; i < nprocs; i++) { ++ if (my_req[i].count) { ++ for (j = send_curr_offlen_ptr[i]; j < my_req[i].count; j++) { ++ send_off = my_req[i].offsets[j]; ++ send_len = my_req[i].lens[j]; ++ if (send_off < iter_st_off + max_size) { ++ send_size[i] += send_len; ++ } else { ++ break; ++ } ++ } ++ send_curr_offlen_ptr[i] = j; ++ } ++ if (others_req[i].count) { ++ recv_start_pos[i] = recv_curr_offlen_ptr[i]; ++ for (j = recv_curr_offlen_ptr[i]; j < others_req[i].count; j++) { ++ req_off = others_req[i].offsets[j]; ++ req_len = others_req[i].lens[j]; ++ if (req_off < iter_st_off + max_size) { ++ recv_count[i]++; ++ MPI_Address(write_buf + req_off - off, ++ &(others_req[i].mem_ptrs[j])); ++ recv_size[i] += req_len; ++ } else { ++ break; ++ } ++ } ++ recv_curr_offlen_ptr[i] = j; ++ } ++ } ++ /* use hole to pass data_sieving flag into W_Exchange_data */ ++ hole = data_sieving; ++ ADIOI_LUSTRE_W_Exchange_data(fd, buf, write_buf, flat_buf, offset_list, ++ len_list, send_size, recv_size, off, real_size, ++ recv_count, recv_start_pos, partial_recv, ++ sent_to_proc, nprocs, myrank, ++ buftype_is_contig, contig_access_count, ++ striping_info, others_req, send_buf_idx, ++ curr_to_proc, done_to_proc, &hole, m, ++ buftype_extent, buf_idx, error_code); ++ if (*error_code != MPI_SUCCESS) ++ return; ++ ++ flag = 0; ++ for (i = 0; i < nprocs; i++) ++ if (recv_count[i]) { ++ flag = 1; ++ break; ++ } ++ if (flag) { ++ /* check whether to do data sieving */ ++ if(data_sieving) { ++ ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE, ++ ADIO_EXPLICIT_OFFSET, off, &status, ++ error_code); ++ } else { ++ /* if there is no hole, write in one time; ++ * otherwise, write data separately */ ++ if (!hole) { ++ ADIO_WriteContig(fd, write_buf, real_size, MPI_BYTE, ++ ADIO_EXPLICIT_OFFSET, off, &status, ++ error_code); ++ } else { ++ for (i = 0; i < nprocs; i++) { ++ if (others_req[i].count) { ++ for (j = 0; j < others_req[i].count; j++) { ++ if (others_req[i].offsets[j] < off + real_size && ++ others_req[i].offsets[j] >= off) { ++ ADIO_WriteContig(fd, ++ write_buf + others_req[i].offsets[j] - off, ++ others_req[i].lens[j], ++ MPI_BYTE, ADIO_EXPLICIT_OFFSET, ++ others_req[i].offsets[j], &status, ++ error_code); ++ } ++ } ++ } ++ } ++ } ++ } ++ if (*error_code != MPI_SUCCESS) ++ return; ++ } ++ ++ iter_st_off += max_size; ++ } ++ if (*error_code != MPI_SUCCESS) ++ return; ++ ++ if (ntimes) ++ ADIOI_Free(write_buf); ++ ADIOI_Free(recv_curr_offlen_ptr); ++ ADIOI_Free(send_curr_offlen_ptr); ++ ADIOI_Free(recv_count); ++ ADIOI_Free(send_size); ++ ADIOI_Free(recv_size); ++ ADIOI_Free(sent_to_proc); ++ ADIOI_Free(recv_start_pos); ++ ADIOI_Free(send_buf_idx); ++ ADIOI_Free(curr_to_proc); ++ ADIOI_Free(done_to_proc); ++ ADIOI_Free(off_list); ++} ++ ++static void ADIOI_LUSTRE_W_Exchange_data(ADIO_File fd, void *buf, ++ char *write_buf, ++ ADIOI_Flatlist_node * flat_buf, ++ ADIO_Offset * offset_list, ++ int *len_list, int *send_size, ++ int *recv_size, ADIO_Offset off, ++ int size, int *count, ++ int *start_pos, int *partial_recv, ++ int *sent_to_proc, int nprocs, ++ int myrank, int buftype_is_contig, ++ int contig_access_count, ++ int * striping_info, ++ ADIOI_Access * others_req, ++ int *send_buf_idx, ++ int *curr_to_proc, int *done_to_proc, ++ int *hole, int iter, ++ MPI_Aint buftype_extent, ++ int *buf_idx, int *error_code) ++{ ++ int i, j, *tmp_len, nprocs_recv, nprocs_send, err; ++ char **send_buf = NULL; ++ MPI_Request *requests, *send_req; ++ MPI_Datatype *recv_types; ++ MPI_Status *statuses, status; ++ int *srt_len, sum, sum_recv; ++ ADIO_Offset *srt_off; ++ int data_sieving = *hole; ++ static char myname[] = "ADIOI_W_EXCHANGE_DATA"; ++ ++ /* create derived datatypes for recv */ ++ nprocs_recv = 0; ++ for (i = 0; i < nprocs; i++) ++ if (recv_size[i]) ++ nprocs_recv++; ++ ++ recv_types = (MPI_Datatype *) ADIOI_Malloc((nprocs_recv + 1) * ++ sizeof(MPI_Datatype)); ++ /* +1 to avoid a 0-size malloc */ ++ ++ tmp_len = (int *) ADIOI_Malloc(nprocs * sizeof(int)); ++ j = 0; ++ for (i = 0; i < nprocs; i++) { ++ if (recv_size[i]) { ++ MPI_Type_hindexed(count[i], ++ &(others_req[i].lens[start_pos[i]]), ++ &(others_req[i].mem_ptrs[start_pos[i]]), ++ MPI_BYTE, recv_types + j); ++ /* absolute displacements; use MPI_BOTTOM in recv */ ++ MPI_Type_commit(recv_types + j); ++ j++; ++ } ++ } ++ ++ /* To avoid a read-modify-write, ++ * check if there are holes in the data to be written. ++ * For this, merge the (sorted) offset lists others_req using a heap-merge. ++ */ ++ ++ sum = 0; ++ for (i = 0; i < nprocs; i++) ++ sum += count[i]; ++ srt_off = (ADIO_Offset *) ADIOI_Malloc((sum + 1) * sizeof(ADIO_Offset)); ++ srt_len = (int *) ADIOI_Malloc((sum + 1) * sizeof(int)); ++ /* +1 to avoid a 0-size malloc */ ++ ++ ADIOI_Heap_merge(others_req, count, srt_off, srt_len, start_pos, ++ nprocs, nprocs_recv, sum); ++ ++ ADIOI_Free(tmp_len); ++ ++ /* check if there are any holes */ ++ *hole = 0; ++ for (i = 0; i < sum - 1; i++) { ++ if (srt_off[i] + srt_len[i] < srt_off[i + 1]) { ++ *hole = 1; ++ break; ++ } ++ } ++ /* In some cases (see John Bent ROMIO REQ # 835), an odd interaction ++ * between aggregation, nominally contiguous regions, and cb_buffer_size ++ * should be handled with a read-modify-write (otherwise we will write out ++ * more data than we receive from everyone else (inclusive), so override ++ * hole detection ++ */ ++ if (*hole == 0) { ++ sum_recv = 0; ++ for (i = 0; i < nprocs; i++) ++ sum_recv += recv_size[i]; ++ if (size > sum_recv) ++ *hole = 1; ++ } ++ /* check the hint for data sieving */ ++ if (data_sieving && nprocs_recv && *hole) { ++ ADIO_ReadContig(fd, write_buf, size, MPI_BYTE, ++ ADIO_EXPLICIT_OFFSET, off, &status, &err); ++ // --BEGIN ERROR HANDLING-- ++ if (err != MPI_SUCCESS) { ++ *error_code = MPIO_Err_create_code(err, ++ MPIR_ERR_RECOVERABLE, ++ myname, __LINE__, ++ MPI_ERR_IO, ++ "**ioRMWrdwr", 0); ++ return; ++ } ++ // --END ERROR HANDLING-- ++ } ++ ADIOI_Free(srt_off); ++ ADIOI_Free(srt_len); ++ ++ nprocs_send = 0; ++ for (i = 0; i < nprocs; i++) ++ if (send_size[i]) ++ nprocs_send++; ++ ++ if (fd->atomicity) { ++ /* bug fix from Wei-keng Liao and Kenin Coloma */ ++ requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + 1) * ++ sizeof(MPI_Request)); ++ send_req = requests; ++ } else { ++ requests = (MPI_Request *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1)* ++ sizeof(MPI_Request)); ++ /* +1 to avoid a 0-size malloc */ ++ ++ /* post receives */ ++ j = 0; ++ for (i = 0; i < nprocs; i++) { ++ if (recv_size[i]) { ++ MPI_Irecv(MPI_BOTTOM, 1, recv_types[j], i, ++ myrank + i + 100 * iter, fd->comm, requests + j); ++ j++; ++ } ++ } ++ send_req = requests + nprocs_recv; ++ } ++ ++ /* post sends. ++ * if buftype_is_contig, data can be directly sent from ++ * user buf at location given by buf_idx. else use send_buf. ++ */ ++ if (buftype_is_contig) { ++ j = 0; ++ for (i = 0; i < nprocs; i++) ++ if (send_size[i]) { ++ MPI_Isend(((char *) buf) + buf_idx[i], send_size[i], ++ MPI_BYTE, i, myrank + i + 100 * iter, fd->comm, ++ send_req + j); ++ j++; ++ buf_idx[i] += send_size[i]; ++ } ++ } else if (nprocs_send) { ++ /* buftype is not contig */ ++ send_buf = (char **) ADIOI_Malloc(nprocs * sizeof(char *)); ++ for (i = 0; i < nprocs; i++) ++ if (send_size[i]) ++ send_buf[i] = (char *) ADIOI_Malloc(send_size[i]); ++ ++ ADIOI_LUSTRE_Fill_send_buffer(fd, buf, flat_buf, send_buf, offset_list, ++ len_list, send_size, send_req, ++ sent_to_proc, nprocs, myrank, ++ contig_access_count, striping_info, ++ send_buf_idx, curr_to_proc, done_to_proc, ++ iter, buftype_extent); ++ /* the send is done in ADIOI_Fill_send_buffer */ ++ } ++ ++ /* bug fix from Wei-keng Liao and Kenin Coloma */ ++ if (fd->atomicity) { ++ j = 0; ++ for (i = 0; i < nprocs; i++) { ++ MPI_Status wkl_status; ++ if (recv_size[i]) { ++ MPI_Recv(MPI_BOTTOM, 1, recv_types[j], i, ++ myrank + i + 100 * iter, fd->comm, &wkl_status); ++ j++; ++ } ++ } ++ } ++ ++ for (i = 0; i < nprocs_recv; i++) ++ MPI_Type_free(recv_types + i); ++ ADIOI_Free(recv_types); ++ ++ /* bug fix from Wei-keng Liao and Kenin Coloma */ ++ /* +1 to avoid a 0-size malloc */ ++ if (fd->atomicity) { ++ statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + 1) * ++ sizeof(MPI_Status)); ++ } else { ++ statuses = (MPI_Status *) ADIOI_Malloc((nprocs_send + nprocs_recv + 1) * ++ sizeof(MPI_Status)); ++ } ++ ++#ifdef NEEDS_MPI_TEST ++ i = 0; ++ if (fd->atomicity) { ++ /* bug fix from Wei-keng Liao and Kenin Coloma */ ++ while (!i) ++ MPI_Testall(nprocs_send, send_req, &i, statuses); ++ } else { ++ while (!i) ++ MPI_Testall(nprocs_send + nprocs_recv, requests, &i, statuses); ++ } ++#else ++ /* bug fix from Wei-keng Liao and Kenin Coloma */ ++ if (fd->atomicity) ++ MPI_Waitall(nprocs_send, send_req, statuses); ++ else ++ MPI_Waitall(nprocs_send + nprocs_recv, requests, statuses); ++#endif ++ ADIOI_Free(statuses); ++ ADIOI_Free(requests); ++ if (!buftype_is_contig && nprocs_send) { ++ for (i = 0; i < nprocs; i++) ++ if (send_size[i]) ++ ADIOI_Free(send_buf[i]); ++ ADIOI_Free(send_buf); ++ } ++} ++ ++#define ADIOI_BUF_INCR \ ++{ \ ++ while (buf_incr) { \ ++ size_in_buf = ADIOI_MIN(buf_incr, flat_buf_sz); \ ++ user_buf_idx += size_in_buf; \ ++ flat_buf_sz -= size_in_buf; \ ++ if (!flat_buf_sz) { \ ++ if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \ ++ else { \ ++ flat_buf_idx = 0; \ ++ n_buftypes++; \ ++ } \ ++ user_buf_idx = flat_buf->indices[flat_buf_idx] + \ ++ n_buftypes*buftype_extent; \ ++ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ ++ } \ ++ buf_incr -= size_in_buf; \ ++ } \ ++} ++ ++ ++#define ADIOI_BUF_COPY \ ++{ \ ++ while (size) { \ ++ size_in_buf = ADIOI_MIN(size, flat_buf_sz); \ ++ memcpy(&(send_buf[p][send_buf_idx[p]]), \ ++ ((char *) buf) + user_buf_idx, size_in_buf); \ ++ send_buf_idx[p] += size_in_buf; \ ++ user_buf_idx += size_in_buf; \ ++ flat_buf_sz -= size_in_buf; \ ++ if (!flat_buf_sz) { \ ++ if (flat_buf_idx < (flat_buf->count - 1)) flat_buf_idx++; \ ++ else { \ ++ flat_buf_idx = 0; \ ++ n_buftypes++; \ ++ } \ ++ user_buf_idx = flat_buf->indices[flat_buf_idx] + \ ++ n_buftypes*buftype_extent; \ ++ flat_buf_sz = flat_buf->blocklens[flat_buf_idx]; \ ++ } \ ++ size -= size_in_buf; \ ++ buf_incr -= size_in_buf; \ ++ } \ ++ ADIOI_BUF_INCR \ ++} ++ ++static void ADIOI_LUSTRE_Fill_send_buffer(ADIO_File fd, void *buf, ++ ADIOI_Flatlist_node * flat_buf, ++ char **send_buf, ++ ADIO_Offset * offset_list, ++ int *len_list, int *send_size, ++ MPI_Request * requests, ++ int *sent_to_proc, int nprocs, ++ int myrank, ++ int contig_access_count, ++ int * striping_info, ++ int *send_buf_idx, ++ int *curr_to_proc, ++ int *done_to_proc, int iter, ++ MPI_Aint buftype_extent) ++{ ++ /* this function is only called if buftype is not contig */ ++ int i, p, flat_buf_idx, size; ++ int flat_buf_sz, buf_incr, size_in_buf, jj, n_buftypes; ++ ADIO_Offset off, len, rem_len, user_buf_idx; ++ ++ /* curr_to_proc[p] = amount of data sent to proc. p that has already ++ * been accounted for so far ++ * done_to_proc[p] = amount of data already sent to proc. p in ++ * previous iterations ++ * user_buf_idx = current location in user buffer ++ * send_buf_idx[p] = current location in send_buf of proc. p ++ */ ++ ++ for (i = 0; i < nprocs; i++) { ++ send_buf_idx[i] = curr_to_proc[i] = 0; ++ done_to_proc[i] = sent_to_proc[i]; ++ } ++ jj = 0; ++ ++ user_buf_idx = flat_buf->indices[0]; ++ flat_buf_idx = 0; ++ n_buftypes = 0; ++ flat_buf_sz = flat_buf->blocklens[0]; ++ ++ /* flat_buf_idx = current index into flattened buftype ++ * flat_buf_sz = size of current contiguous component in flattened buf ++ */ ++ for (i = 0; i < contig_access_count; i++) { ++ off = offset_list[i]; ++ rem_len = (ADIO_Offset) len_list[i]; ++ ++ /*this request may span to more than one process */ ++ while (rem_len != 0) { ++ len = rem_len; ++ /* NOTE: len value is modified by ADIOI_Calc_aggregator() to be no ++ * longer than the single region that processor "p" is responsible ++ * for. ++ */ ++ p = ADIOI_LUSTRE_Calc_aggregator(fd, off, &len, nprocs, striping_info); ++ ++ if (send_buf_idx[p] < send_size[p]) { ++ if (curr_to_proc[p] + len > done_to_proc[p]) { ++ if (done_to_proc[p] > curr_to_proc[p]) { ++ size = (int) ADIOI_MIN(curr_to_proc[p] + len - ++ done_to_proc[p], ++ send_size[p] - ++ send_buf_idx[p]); ++ buf_incr = done_to_proc[p] - curr_to_proc[p]; ++ ADIOI_BUF_INCR ++ buf_incr = (int) (curr_to_proc[p] + len - ++ done_to_proc[p]); ++ curr_to_proc[p] = done_to_proc[p] + size; ++ ADIOI_BUF_COPY ++ } else { ++ size = (int) ADIOI_MIN(len, send_size[p] - ++ send_buf_idx[p]); ++ buf_incr = (int) len; ++ curr_to_proc[p] += size; ++ ADIOI_BUF_COPY ++ } ++ if (send_buf_idx[p] == send_size[p]) { ++ MPI_Isend(send_buf[p], send_size[p], MPI_BYTE, p, ++ myrank + p + 100 * iter, fd->comm, ++ requests + jj); ++ jj++; ++ } ++ } else { ++ curr_to_proc[p] += (int) len; ++ buf_incr = (int) len; ++ ADIOI_BUF_INCR ++ } ++ } else { ++ buf_incr = (int) len; ++ ADIOI_BUF_INCR ++ } ++ off += len; ++ rem_len -= len; ++ } ++ } ++ for (i = 0; i < nprocs; i++) ++ if (send_size[i]) ++ sent_to_proc[i] = curr_to_proc[i]; ++} ++ ++static void ADIOI_Heap_merge(ADIOI_Access * others_req, int *count, ++ ADIO_Offset * srt_off, int *srt_len, ++ int *start_pos, int nprocs, int nprocs_recv, ++ int total_elements) ++{ ++ typedef struct { ++ ADIO_Offset *off_list; ++ int *len_list; ++ int nelem; ++ } heap_struct; ++ ++ heap_struct *a, tmp; ++ int i, j, heapsize, l, r, k, smallest; ++ ++ a = (heap_struct *) ADIOI_Malloc((nprocs_recv + 1) * ++ sizeof(heap_struct)); ++ ++ j = 0; ++ for (i = 0; i < nprocs; i++) ++ if (count[i]) { ++ a[j].off_list = &(others_req[i].offsets[start_pos[i]]); ++ a[j].len_list = &(others_req[i].lens[start_pos[i]]); ++ a[j].nelem = count[i]; ++ j++; ++ } ++ ++ /* build a heap out of the first element from each list, with ++ the smallest element of the heap at the root */ ++ ++ heapsize = nprocs_recv; ++ for (i = heapsize / 2 - 1; i >= 0; i--) { ++ /* Heapify(a, i, heapsize); Algorithm from Cormen et al. pg. 143 ++ modified for a heap with smallest element at root. I have ++ removed the recursion so that there are no function calls. ++ Function calls are too expensive. */ ++ k = i; ++ for (;;) { ++ l = 2 * (k + 1) - 1; ++ r = 2 * (k + 1); ++ ++ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) ++ smallest = l; ++ else ++ smallest = k; ++ ++ if ((r < heapsize) && ++ (*(a[r].off_list) < *(a[smallest].off_list))) ++ smallest = r; ++ ++ if (smallest != k) { ++ tmp.off_list = a[k].off_list; ++ tmp.len_list = a[k].len_list; ++ tmp.nelem = a[k].nelem; ++ ++ a[k].off_list = a[smallest].off_list; ++ a[k].len_list = a[smallest].len_list; ++ a[k].nelem = a[smallest].nelem; ++ ++ a[smallest].off_list = tmp.off_list; ++ a[smallest].len_list = tmp.len_list; ++ a[smallest].nelem = tmp.nelem; ++ ++ k = smallest; ++ } else ++ break; ++ } ++ } ++ ++ for (i = 0; i < total_elements; i++) { ++ /* extract smallest element from heap, i.e. the root */ ++ srt_off[i] = *(a[0].off_list); ++ srt_len[i] = *(a[0].len_list); ++ (a[0].nelem)--; ++ ++ if (!a[0].nelem) { ++ a[0].off_list = a[heapsize - 1].off_list; ++ a[0].len_list = a[heapsize - 1].len_list; ++ a[0].nelem = a[heapsize - 1].nelem; ++ heapsize--; ++ } else { ++ (a[0].off_list)++; ++ (a[0].len_list)++; ++ } ++ ++ /* Heapify(a, 0, heapsize); */ ++ k = 0; ++ for (;;) { ++ l = 2 * (k + 1) - 1; ++ r = 2 * (k + 1); ++ ++ if ((l < heapsize) && (*(a[l].off_list) < *(a[k].off_list))) ++ smallest = l; ++ else ++ smallest = k; ++ ++ if ((r < heapsize) && ++ (*(a[r].off_list) < *(a[smallest].off_list))) ++ smallest = r; ++ ++ if (smallest != k) { ++ tmp.off_list = a[k].off_list; ++ tmp.len_list = a[k].len_list; ++ tmp.nelem = a[k].nelem; ++ ++ a[k].off_list = a[smallest].off_list; ++ a[k].len_list = a[smallest].len_list; ++ a[k].nelem = a[smallest].nelem; ++ ++ a[smallest].off_list = tmp.off_list; ++ a[smallest].len_list = tmp.len_list; ++ a[smallest].nelem = tmp.nelem; ++ ++ k = smallest; ++ } else ++ break; ++ } ++ } ++ ADIOI_Free(a); ++} +diff -ruN ad_lustre_orig/ad_lustre_wrstr.c ad_lustre/ad_lustre_wrstr.c +--- ad_lustre_orig/ad_lustre_wrstr.c 1970-01-01 08:00:00.000000000 +0800 ++++ ad_lustre/ad_lustre_wrstr.c 2008-09-17 18:20:35.000000000 +0800 +@@ -0,0 +1,463 @@ ++/* -*- Mode: C; c-basic-offset:4 ; -*- */ ++/* ++ * Copyright (C) 1997 University of Chicago. ++ * See COPYRIGHT notice in top-level directory. ++ * ++ * Copyright (C) 2007 Oak Ridge National Laboratory ++ * ++ * Copyright (C) 2008 Sun Microsystems, Lustre group ++ */ ++ ++#include "ad_lustre.h" ++#include "adio_extern.h" ++ ++#define ADIOI_BUFFERED_WRITE \ ++{ \ ++ if (req_off >= writebuf_off + writebuf_len) { \ ++ if (writebuf_len) { \ ++ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ++ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \ ++ if (!(fd->atomicity)) \ ++ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ ++ if (*error_code != MPI_SUCCESS) { \ ++ *error_code = MPIO_Err_create_code(*error_code, \ ++ MPIR_ERR_RECOVERABLE, myname, \ ++ __LINE__, MPI_ERR_IO, \ ++ "**iowswc", 0); \ ++ return; \ ++ } \ ++ } \ ++ writebuf_off = req_off; \ ++ /* stripe_size alignment */ \ ++ writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \ ++ (writebuf_off / stripe_size + 1) * \ ++ stripe_size - writebuf_off);\ ++ if (!(fd->atomicity)) \ ++ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ ++ ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\ ++ writebuf_off, &status1, error_code); \ ++ if (*error_code != MPI_SUCCESS) { \ ++ *error_code = MPIO_Err_create_code(*error_code, \ ++ MPIR_ERR_RECOVERABLE, myname, \ ++ __LINE__, MPI_ERR_IO, \ ++ "**iowsrc", 0); \ ++ return; \ ++ } \ ++ } \ ++ write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \ ++ memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\ ++ while (write_sz != req_len) {\ ++ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ++ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \ ++ if (!(fd->atomicity)) \ ++ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ ++ if (*error_code != MPI_SUCCESS) { \ ++ *error_code = MPIO_Err_create_code(*error_code, \ ++ MPIR_ERR_RECOVERABLE, myname, \ ++ __LINE__, MPI_ERR_IO, \ ++ "**iowswc", 0); \ ++ return; \ ++ } \ ++ req_len -= write_sz; \ ++ userbuf_off += write_sz; \ ++ writebuf_off += writebuf_len; \ ++ /* stripe_size alignment */ \ ++ writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \ ++ (writebuf_off / stripe_size + 1) * \ ++ stripe_size - writebuf_off);\ ++ if (!(fd->atomicity)) \ ++ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ ++ ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET,\ ++ writebuf_off, &status1, error_code); \ ++ if (*error_code != MPI_SUCCESS) { \ ++ *error_code = MPIO_Err_create_code(*error_code, \ ++ MPIR_ERR_RECOVERABLE, myname, \ ++ __LINE__, MPI_ERR_IO, \ ++ "**iowsrc", 0); \ ++ return; \ ++ } \ ++ write_sz = ADIOI_MIN(req_len, writebuf_len); \ ++ memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\ ++ } \ ++} ++ ++ ++/* this macro is used when filetype is contig and buftype is not contig. ++ it does not do a read-modify-write and does not lock*/ ++#define ADIOI_BUFFERED_WRITE_WITHOUT_READ \ ++{ \ ++ if (req_off >= writebuf_off + writebuf_len) { \ ++ writebuf_off = req_off; \ ++ /* stripe_size alignment */ \ ++ writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \ ++ (writebuf_off / stripe_size + 1) * \ ++ stripe_size - writebuf_off);\ ++ } \ ++ write_sz = (int) ADIOI_MIN(req_len, writebuf_off + writebuf_len - req_off); \ ++ memcpy(writebuf + req_off - writebuf_off, (char *)buf + userbuf_off, write_sz);\ ++ while (req_len) { \ ++ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ++ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \ ++ if (*error_code != MPI_SUCCESS) { \ ++ *error_code = MPIO_Err_create_code(*error_code, \ ++ MPIR_ERR_RECOVERABLE, myname, \ ++ __LINE__, MPI_ERR_IO, \ ++ "**iowswc", 0); \ ++ return; \ ++ } \ ++ req_len -= write_sz; \ ++ userbuf_off += write_sz; \ ++ writebuf_off += writebuf_len; \ ++ /* stripe_size alignment */ \ ++ writebuf_len = (int) ADIOI_MIN(end_offset - writebuf_off + 1, \ ++ (writebuf_off / stripe_size + 1) * \ ++ stripe_size - writebuf_off);\ ++ write_sz = ADIOI_MIN(req_len, writebuf_len); \ ++ memcpy(writebuf, (char *)buf + userbuf_off, write_sz);\ ++ } \ ++} ++ ++void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, void *buf, int count, ++ MPI_Datatype datatype, int file_ptr_type, ++ ADIO_Offset offset, ADIO_Status * status, ++ int *error_code) ++{ ++ /* offset is in units of etype relative to the filetype. */ ++ ADIOI_Flatlist_node *flat_buf, *flat_file; ++ int i, j, k, bwr_size, fwr_size = 0, st_index = 0; ++ int bufsize, num, size, sum, n_etypes_in_filetype, size_in_filetype; ++ int n_filetypes, etype_in_filetype; ++ ADIO_Offset abs_off_in_filetype = 0; ++ int filetype_size, etype_size, buftype_size, req_len; ++ MPI_Aint filetype_extent, buftype_extent; ++ int buf_count, buftype_is_contig, filetype_is_contig; ++ ADIO_Offset userbuf_off; ++ ADIO_Offset off, req_off, disp, end_offset = 0, writebuf_off, start_off; ++ char *writebuf; ++ int flag, st_fwr_size, st_n_filetypes, writebuf_len, write_sz; ++ ADIO_Status status1; ++ int new_bwr_size, new_fwr_size; ++ char * value; ++ int stripe_size, lflag = 0; ++ static char myname[] = "ADIOI_LUSTRE_WriteStrided"; ++ int myrank; ++ MPI_Comm_rank(fd->comm, &myrank); ++ ++ if (fd->hints->ds_write == ADIOI_HINT_DISABLE) { ++ /* if user has disabled data sieving on writes, use naive ++ * approach instead. ++ */ ++ ADIOI_GEN_WriteStrided_naive(fd, ++ buf, ++ count, ++ datatype, ++ file_ptr_type, ++ offset, status, error_code); ++ return; ++ } ++ ++ *error_code = MPI_SUCCESS; /* changed below if error */ ++ ++ ADIOI_Datatype_iscontig(datatype, &buftype_is_contig); ++ ADIOI_Datatype_iscontig(fd->filetype, &filetype_is_contig); ++ ++ MPI_Type_size(fd->filetype, &filetype_size); ++ if (!filetype_size) { ++ *error_code = MPI_SUCCESS; ++ return; ++ } ++ ++ MPI_Type_extent(fd->filetype, &filetype_extent); ++ MPI_Type_size(datatype, &buftype_size); ++ MPI_Type_extent(datatype, &buftype_extent); ++ etype_size = fd->etype_size; ++ ++ bufsize = buftype_size * count; ++ ++ /* get striping info */ ++ value = (char *) ADIOI_Malloc((MPI_MAX_INFO_VAL + 1) * sizeof(char)); ++ MPI_Info_get(fd->info, "striping_unit", MPI_MAX_INFO_VAL, value, &lflag); ++ if (lflag) ++ stripe_size = atoi(value); ++ ADIOI_Free(value); ++ ++ /* Different buftype to different filetype */ ++ if (!buftype_is_contig && filetype_is_contig) { ++ /* noncontiguous in memory, contiguous in file. */ ++ ADIOI_Flatten_datatype(datatype); ++ flat_buf = ADIOI_Flatlist; ++ while (flat_buf->type != datatype) ++ flat_buf = flat_buf->next; ++ ++ off = (file_ptr_type == ADIO_INDIVIDUAL) ? fd->fp_ind : ++ fd->disp + etype_size * offset; ++ ++ start_off = off; ++ end_offset = start_off + bufsize - 1; ++ writebuf_off = start_off; ++ /* write stripe size buffer each time */ ++ writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size)); ++ writebuf_len = (int) ADIOI_MIN(bufsize, ++ (writebuf_off / stripe_size + 1) * ++ stripe_size - writebuf_off); ++ ++ /* if atomicity is true, lock the region to be accessed */ ++ if (fd->atomicity) ++ ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize); ++ ++ for (j = 0; j < count; j++) { ++ for (i = 0; i < flat_buf->count; i++) { ++ userbuf_off = j * buftype_extent + flat_buf->indices[i]; ++ req_off = off; ++ req_len = flat_buf->blocklens[i]; ++ ADIOI_BUFFERED_WRITE_WITHOUT_READ ++ off += flat_buf->blocklens[i]; ++ } ++ } ++ ++ /* write the buffer out finally */ ++ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ++ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, ++ error_code); ++ ++ if (fd->atomicity) ++ ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); ++ if (*error_code != MPI_SUCCESS) ++ return; ++ ADIOI_Free(writebuf); ++ if (file_ptr_type == ADIO_INDIVIDUAL) ++ fd->fp_ind = off; ++ } else { ++ /* noncontiguous in file */ ++ /* filetype already flattened in ADIO_Open */ ++ flat_file = ADIOI_Flatlist; ++ while (flat_file->type != fd->filetype) ++ flat_file = flat_file->next; ++ disp = fd->disp; ++ ++ if (file_ptr_type == ADIO_INDIVIDUAL) { ++ offset = fd->fp_ind; /* in bytes */ ++ n_filetypes = -1; ++ flag = 0; ++ while (!flag) { ++ n_filetypes++; ++ for (i = 0; i < flat_file->count; i++) { ++ if (disp + flat_file->indices[i] + ++ (ADIO_Offset) n_filetypes * filetype_extent + ++ flat_file->blocklens[i] >= offset) { ++ st_index = i; ++ fwr_size = (int) (disp + flat_file->indices[i] + ++ (ADIO_Offset) n_filetypes * ++ filetype_extent + ++ flat_file->blocklens[i] - ++ offset); ++ flag = 1; ++ break; ++ } ++ } ++ } ++ } else { ++ n_etypes_in_filetype = filetype_size / etype_size; ++ n_filetypes = (int) (offset / n_etypes_in_filetype); ++ etype_in_filetype = (int) (offset % n_etypes_in_filetype); ++ size_in_filetype = etype_in_filetype * etype_size; ++ ++ sum = 0; ++ for (i = 0; i < flat_file->count; i++) { ++ sum += flat_file->blocklens[i]; ++ if (sum > size_in_filetype) { ++ st_index = i; ++ fwr_size = sum - size_in_filetype; ++ abs_off_in_filetype = flat_file->indices[i] + ++ size_in_filetype - (sum - flat_file->blocklens[i]); ++ break; ++ } ++ } ++ ++ /* abs. offset in bytes in the file */ ++ offset = disp + (ADIO_Offset) n_filetypes *filetype_extent + ++ abs_off_in_filetype; ++ } ++ ++ start_off = offset; ++ ++ /* If the file bytes is actually contiguous, we do not need data sieve at all */ ++ if (bufsize <= fwr_size) { ++ req_off = start_off; ++ req_len = bufsize; ++ end_offset = start_off + bufsize - 1; ++ writebuf = (char *) ADIOI_Malloc(ADIOI_MIN(bufsize, stripe_size)); ++ memset(writebuf, -1, ADIOI_MIN(bufsize, stripe_size)); ++ writebuf_off = 0; ++ writebuf_len = 0; ++ userbuf_off = 0; ++ ADIOI_BUFFERED_WRITE_WITHOUT_READ ++ } else { ++ /* Calculate end_offset, the last byte-offset that will be accessed. ++ e.g., if start_offset=0 and 100 bytes to be write, end_offset=99 */ ++ st_fwr_size = fwr_size; ++ st_n_filetypes = n_filetypes; ++ i = 0; ++ j = st_index; ++ off = offset; ++ fwr_size = ADIOI_MIN(st_fwr_size, bufsize); ++ while (i < bufsize) { ++ i += fwr_size; ++ end_offset = off + fwr_size - 1; ++ ++ if (j < (flat_file->count - 1)) ++ j++; ++ else { ++ j = 0; ++ n_filetypes++; ++ } ++ ++ off = disp + flat_file->indices[j] + ++ (ADIO_Offset) n_filetypes * filetype_extent; ++ fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize - i); ++ } ++ ++ writebuf_off = 0; ++ writebuf_len = 0; ++ writebuf = (char *) ADIOI_Malloc(stripe_size); ++ memset(writebuf, -1, stripe_size); ++ /* if atomicity is true, lock the region to be accessed */ ++ if (fd->atomicity) ++ ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize); ++ ++ if (buftype_is_contig && !filetype_is_contig) { ++ /* contiguous in memory, noncontiguous in file. should be the most ++ common case. */ ++ i = 0; ++ j = st_index; ++ off = offset; ++ n_filetypes = st_n_filetypes; ++ fwr_size = ADIOI_MIN(st_fwr_size, bufsize); ++ while (i < bufsize) { ++ if (fwr_size) { ++ /* TYPE_UB and TYPE_LB can result in ++ fwr_size = 0. save system call in such cases */ ++ /* ++ lseek(fd->fd_sys, off, SEEK_SET); ++ err = write(fd->fd_sys, ((char *) buf) + i, fwr_size); ++ */ ++ req_off = off; ++ req_len = fwr_size; ++ userbuf_off = i; ++ ADIOI_BUFFERED_WRITE ++ } ++ i += fwr_size; ++ ++ if (off + fwr_size < disp + flat_file->indices[j] + ++ flat_file->blocklens[j] + ++ (ADIO_Offset) n_filetypes * filetype_extent) ++ off += fwr_size; ++ /* did not reach end of contiguous block in filetype. ++ no more I/O needed. off is incremented by fwr_size. */ ++ else { ++ if (j < (flat_file->count - 1)) ++ j++; ++ else { ++ j = 0; ++ n_filetypes++; ++ } ++ off = disp + flat_file->indices[j] + ++ (ADIO_Offset) n_filetypes * filetype_extent; ++ fwr_size = ADIOI_MIN(flat_file->blocklens[j], ++ bufsize - i); ++ } ++ } ++ } else { ++ /* noncontiguous in memory as well as in file */ ++ ADIOI_Flatten_datatype(datatype); ++ flat_buf = ADIOI_Flatlist; ++ while (flat_buf->type != datatype) ++ flat_buf = flat_buf->next; ++ ++ k = num = buf_count = 0; ++ i = (int) (flat_buf->indices[0]); ++ j = st_index; ++ off = offset; ++ n_filetypes = st_n_filetypes; ++ fwr_size = st_fwr_size; ++ bwr_size = flat_buf->blocklens[0]; ++ ++ while (num < bufsize) { ++ size = ADIOI_MIN(fwr_size, bwr_size); ++ if (size) { ++ /* ++ lseek(fd->fd_sys, off, SEEK_SET); ++ err = write(fd->fd_sys, ((char *) buf) + i, size); ++ */ ++ req_off = off; ++ req_len = size; ++ userbuf_off = i; ++ ADIOI_BUFFERED_WRITE ++ } ++ ++ new_fwr_size = fwr_size; ++ new_bwr_size = bwr_size; ++ ++ if (size == fwr_size) { ++ /* reached end of contiguous block in file */ ++ if (j < (flat_file->count - 1)) { ++ j++; ++ } else { ++ j = 0; ++ n_filetypes++; ++ } ++ off = disp + flat_file->indices[j] + ++ (ADIO_Offset) n_filetypes * filetype_extent; ++ new_fwr_size = flat_file->blocklens[j]; ++ if (size != bwr_size) { ++ i += size; ++ new_bwr_size -= size; ++ } ++ } ++ if (size == bwr_size) { ++ /* reached end of contiguous block in memory */ ++ k = (k + 1) % flat_buf->count; ++ buf_count++; ++ i = (int) (buftype_extent * ++ (buf_count / flat_buf->count) + ++ flat_buf->indices[k]); ++ new_bwr_size = flat_buf->blocklens[k]; ++ if (size != fwr_size) { ++ off += size; ++ new_fwr_size -= size; ++ } ++ } ++ num += size; ++ fwr_size = new_fwr_size; ++ bwr_size = new_bwr_size; ++ } ++ } ++ ++ /* write the buffer out finally */ ++ if (writebuf_len) { ++ ADIO_WriteContig(fd, writebuf, writebuf_len, ++ MPI_BYTE, ADIO_EXPLICIT_OFFSET, ++ writebuf_off, &status1, error_code); ++ if (!(fd->atomicity)) ++ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); ++ if (*error_code != MPI_SUCCESS) ++ return; ++ } ++ if (fd->atomicity) ++ ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); ++ } ++ ADIOI_Free(writebuf); ++ if (file_ptr_type == ADIO_INDIVIDUAL) ++ fd->fp_ind = off; ++ } ++ fd->fp_sys_posn = -1; /* set it to null. */ ++ ++#ifdef HAVE_STATUS_SET_BYTES ++ MPIR_Status_set_bytes(status, datatype, bufsize); ++ /* This is a temporary way of filling in status. The right way is to ++ keep track of how much data was actually written by ADIOI_BUFFERED_WRITE. */ ++#endif ++ ++ if (!buftype_is_contig) ++ ADIOI_Delete_flattened(datatype); ++} +diff -ruN ad_lustre_orig/Makefile ad_lustre/Makefile +--- ad_lustre_orig/Makefile 1970-01-01 08:00:00.000000000 +0800 ++++ ad_lustre/Makefile 2008-09-17 18:20:35.000000000 +0800 +@@ -0,0 +1,50 @@ ++CC = gcc ++AR = ar cr ++RANLIB = ranlib ++LIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich.a ++srcdir = /work/download/mpich2-1.0.7-dev/src/mpi/romio/adio/ad_lustre ++CC_SHL = true ++SHLIBNAME = /work/download/mpich2-1.0.7-dev/lib/libmpich ++ ++INCLUDE_DIR = -I. -I${srcdir}/../include -I../include -I../../include -I${srcdir}/../../../../include -I../../../../include ++CFLAGS = -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/common/datatype -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/ch3/channels/sock/include -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -I/work/download/mpich2-1.0.7-dev/src/mpid/common/sock/poll -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR) ++ ++top_builddir = /work/download/mpich2-1.0.7-dev ++LIBTOOL = ++C_COMPILE_SHL = $(CC_SHL) -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -O2 -DFORTRANDOUBLEUNDERSCORE -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -DHAVE_ROMIOCONF_H -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 $(INCLUDE_DIR) ++ ++VPATH = .:${srcdir} ++ ++AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \ ++ ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \ ++ ad_lustre_fcntl.o ad_lustre_hints.o ad_lustre_close.o \ ++ ad_lustre_aggregate.o ++ ++ ++default: $(LIBNAME) ++ @if [ "none" != "none" ] ; then \ ++ $(MAKE) $(SHLIBNAME).la ;\ ++ fi ++ ++.SUFFIXES: $(SUFFIXES) .p .lo ++ ++.c.o: ++ $(CC) $(CFLAGS) -c $< ++.c.lo: ++ $(C_COMPILE_SHL) -c $< -o _s$*.o ++ @mv -f _s$*.o $*.lo ++ ++$(LIBNAME): $(AD_LUSTRE_OBJECTS) ++ $(AR) $(LIBNAME) $(AD_LUSTRE_OBJECTS) ++ $(RANLIB) $(LIBNAME) ++ ++AD_LUSTRE_LOOBJECTS=$(AD_LUSTRE_OBJECTS:.o=.lo) ++$(SHLIBNAME).la: $(AD_LUSTRE_LOOBJECTS) ++ $(AR) $(SHLIBNAME).la $(AD_LUSTRE_LOOBJECTS) ++ ++coverage: ++ -@for file in ${AD_LUSTRE_OBJECTS:.o=.c} ; do \ ++ gcov -b -f $$file ; done ++ ++clean: ++ @rm -f *.o *.lo +diff -ruN ad_lustre_orig/Makefile.in ad_lustre/Makefile.in +--- ad_lustre_orig/Makefile.in 2008-09-17 14:36:57.000000000 +0800 ++++ ad_lustre/Makefile.in 2008-09-17 18:20:35.000000000 +0800 +@@ -16,7 +16,9 @@ + @VPATH@ + + AD_LUSTRE_OBJECTS = ad_lustre.o ad_lustre_open.o \ +- ad_lustre_rwcontig.o ad_lustre_hints.o ++ ad_lustre_rwcontig.o ad_lustre_wrcoll.o ad_lustre_wrstr.o \ ++ ad_lustre_hints.o ad_lustre_aggregate.o ++ + + default: $(LIBNAME) + @if [ "@ENABLE_SHLIB@" != "none" ] ; then \ +diff -ruN ad_lustre_orig/README ad_lustre/README +--- ad_lustre_orig/README 2008-09-17 14:36:57.000000000 +0800 ++++ ad_lustre/README 2008-09-17 18:20:35.000000000 +0800 +@@ -5,6 +5,22 @@ + o To post the code for ParColl (Partitioned collective IO) + + ----------------------------------------------------- ++V05: ++----------------------------------------------------- ++ o Improved data redistribution ++ - add I/O pattern identification. If request I/O size is big, ++ collective I/O won't be done. The hint big_req_size can be ++ used to define this. ++ - provide hint CO for load balancing to control the number ++ of IO clients for each OST ++ - divide the IO clients into the different OST groups to ++ produce stripe-contiguous I/O pattern ++ - reduce the collective overhead by hints contiguous_data and ++ same_io_size to remove unnecessary MPI_Alltoall() ++ o Control read-modify-write in data sieving in collective IO ++ by hint ds_in_coll. ++ ++----------------------------------------------------- + V04: + ----------------------------------------------------- + o Direct IO and Lockless IO support