Merge pull request #1523 in HDFFV/hdf5 from ~JHENDERSON/hdf5:develop to develop

* commit '8f447bd3564a88738b9a26b1b0ac3bba785efd39':
  Add RELEASE.txt note for collective metadata read fixes
  Fix some collective metadata read issues
Jordan Henderson 2019-02-11 22:24:06 -06:00
commit 8f42566f7a
11 changed files with 381 additions and 87 deletions

View File

@ -264,6 +264,22 @@ Bug Fixes since HDF5-1.10.3 release
Library
-------
- Fix hangs with collective metadata reads during chunked dataset I/O
In the parallel library, it was discovered that when a sequence of
operations following the pattern
"write to chunked dataset" -> "flush file" -> "read from dataset"
occurred with collective metadata reads enabled, hangs could be
observed because certain MPI ranks did not participate in the
collective metadata reads.
To fix the issue, collective metadata reads are now disabled during
chunked dataset raw data I/O.
(JTH - 2019/02/11, HDFFV-10563, HDFFV-10688)
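
For context, a minimal sketch of the access pattern the note describes (illustrative file and dataset names; not part of the PR): collective metadata reads are enabled on the file access property list, then a chunked dataset is written, the file is flushed, and the dataset is read back collectively.

#include <mpi.h>
#include "hdf5.h"

int main(int argc, char **argv)
{
    hsize_t dims[1] = {100}, chunk[1] = {10};
    int     buf[100] = {0};
    hid_t   fapl, file, space, dcpl, dset, dxpl;

    MPI_Init(&argc, &argv);

    fapl = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_fapl_mpio(fapl, MPI_COMM_WORLD, MPI_INFO_NULL);
    H5Pset_all_coll_metadata_ops(fapl, 1);        /* enable collective metadata reads */

    file  = H5Fcreate("cmdr_pattern.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl);
    space = H5Screate_simple(1, dims, NULL);
    dcpl  = H5Pcreate(H5P_DATASET_CREATE);
    H5Pset_chunk(dcpl, 1, chunk);                 /* chunked layout */
    dset  = H5Dcreate2(file, "dset", H5T_NATIVE_INT, space, H5P_DEFAULT, dcpl, H5P_DEFAULT);

    dxpl = H5Pcreate(H5P_DATASET_XFER);
    H5Pset_dxpl_mpio(dxpl, H5FD_MPIO_COLLECTIVE); /* collective raw data I/O */

    H5Dwrite(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, dxpl, buf); /* write to chunked dataset */
    H5Fflush(file, H5F_SCOPE_GLOBAL);                            /* flush file */
    H5Dread(dset, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, dxpl, buf);  /* read from dataset */

    H5Pclose(dxpl); H5Dclose(dset); H5Pclose(dcpl);
    H5Sclose(space); H5Fclose(file); H5Pclose(fapl);
    MPI_Finalize();
    return 0;
}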
- Performance issue when closing an object
The slowdown is due to the search of the "tag_list" to find

View File

@ -1483,31 +1483,17 @@ H5C_insert_entry(H5F_t * f,
H5C__UPDATE_STATS_FOR_INSERTION(cache_ptr, entry_ptr)
#ifdef H5_HAVE_PARALLEL
if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
coll_access = (H5P_USER_TRUE == f->coll_md_read ? TRUE : FALSE);
/* If not explicitly disabled, get the cmdr setting from the API context */
if(!coll_access && H5P_FORCE_FALSE != f->coll_md_read)
coll_access = H5CX_get_coll_metadata_read();
} /* end if */
if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
coll_access = H5CX_get_coll_metadata_read();
entry_ptr->coll_access = coll_access;
if(coll_access) {
H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL)
/* Make sure the size of the collective entries in the cache remain in check */
if(H5P_USER_TRUE == f->coll_md_read) {
if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) {
if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
} /* end if */
} /* end if */
else {
if(cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) {
if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
} /* end if */
} /* end else */
if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
} /* end if */
#endif
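
The simplified branch above applies a single trimming threshold: the collective entry list is cleared once it exceeds 80% of the maximum cache size, regardless of how collective metadata reads were requested (the same check reappears in H5C_protect below). A small sketch of the arithmetic, with an illustrative helper name that is not part of the library:

#include <stdbool.h>
#include <stddef.h>

/* Illustrative helper (not from the PR): equivalent to
 * coll_list_size > 0.8 * max_cache_size, written with integer
 * arithmetic so no floating-point math is needed. */
static bool coll_list_over_threshold(size_t max_cache_size, size_t coll_list_size)
{
    return (max_cache_size * 80) < (coll_list_size * 100);
}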
@ -2243,13 +2229,8 @@ H5C_protect(H5F_t * f,
ring = H5CX_get_ring();
#ifdef H5_HAVE_PARALLEL
if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
coll_access = (H5P_USER_TRUE == f->coll_md_read ? TRUE : FALSE);
/* If not explicitly disabled, get the cmdr setting from the API context */
if(!coll_access && H5P_FORCE_FALSE != f->coll_md_read)
coll_access = H5CX_get_coll_metadata_read();
} /* end if */
if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
coll_access = H5CX_get_coll_metadata_read();
#endif /* H5_HAVE_PARALLEL */
/* first check to see if the target is in cache */
@ -2286,7 +2267,7 @@ H5C_protect(H5F_t * f,
the entry in their cache still have to participate in the
bcast. */
#ifdef H5_HAVE_PARALLEL
if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI) && coll_access) {
if(coll_access) {
if(!(entry_ptr->is_dirty) && !(entry_ptr->coll_access)) {
MPI_Comm comm; /* File MPI Communicator */
int mpi_code; /* MPI error code */
@ -2601,21 +2582,11 @@ H5C_protect(H5F_t * f,
} /* end if */
#ifdef H5_HAVE_PARALLEL
if(H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
/* Make sure the size of the collective entries in the cache remain in check */
if(coll_access) {
if(H5P_USER_TRUE == f->coll_md_read) {
if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
} /* end if */
else {
if(cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100)
if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
} /* end else */
} /* end if */
} /* end if */
/* Make sure the size of the collective entries in the cache remain in check */
if(coll_access)
if(cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
if(H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
#endif /* H5_HAVE_PARALLEL */
done:

View File

@ -815,8 +815,8 @@ H5CX_set_apl(hid_t *acspl_id, const H5P_libclass_t *libclass,
#ifdef H5_HAVE_PARALLEL
/* If this routine is not guaranteed to be collective (i.e. it doesn't
* modify the structural metadata in a file), check if we should use
* a collective metadata read.
* modify the structural metadata in a file), check if the application
* specified a collective metadata read for just this operation.
*/
if(!is_collective) {
H5P_genplist_t *plist; /* Property list pointer */
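
For reference, the per-operation request the new comment refers to is something the application can make through the access property list of an individual call; a sketch of assumed usage follows (the helper name is illustrative, not from the library or this PR):

#include "hdf5.h"

/* Sketch (assumed usage): ask for collective metadata reads only for
 * operations made through this dataset handle, independent of any
 * file-level FAPL setting. */
static hid_t open_dset_with_coll_md_reads(hid_t file_id, const char *name)
{
    hid_t dapl = H5Pcreate(H5P_DATASET_ACCESS);
    hid_t dset;

    H5Pset_all_coll_metadata_ops(dapl, 1);
    dset = H5Dopen2(file_id, name, dapl);
    H5Pclose(dapl);
    return dset;
}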

View File

@ -1065,9 +1065,9 @@ H5C__read_cache_image(H5F_t *f, H5C_t *cache_ptr)
H5C__UPDATE_STATS_FOR_CACHE_IMAGE_READ(cache_ptr)
#ifdef H5_HAVE_PARALLEL
if ( aux_ptr ) {
/* Broadcast cache image */
if ( MPI_SUCCESS !=
(mpi_result = MPI_Bcast(cache_ptr->image_buffer,
(int)cache_ptr->image_len, MPI_BYTE,

View File

@ -2914,9 +2914,6 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled,
/* Check for cached information */
if(!H5D__chunk_cinfo_cache_found(&dset->shared->cache.chunk.last, udata)) {
H5D_chk_idx_info_t idx_info; /* Chunked index info */
#ifdef H5_HAVE_PARALLEL
H5P_coll_md_read_flag_t temp_cmr; /* Temp value to hold the coll metadata read setting */
#endif /* H5_HAVE_PARALLEL */
/* Compose chunked index info struct */
idx_info.f = dset->oloc.file;
@ -2925,25 +2922,18 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled,
idx_info.storage = &dset->shared->layout.storage.u.chunk;
#ifdef H5_HAVE_PARALLEL
if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) {
/* disable collective metadata read for chunk indexes
as it is highly unlikely that users would read the
same chunks from all processes. MSC - might turn on
for root node? */
temp_cmr = H5F_COLL_MD_READ(idx_info.f);
H5F_set_coll_md_read(idx_info.f, H5P_FORCE_FALSE);
} /* end if */
/* Disable collective metadata read for chunk indexes as it is
* highly unlikely that users would read the same chunks from all
* processes.
*/
if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI))
H5CX_set_coll_metadata_read(FALSE);
#endif /* H5_HAVE_PARALLEL */
/* Go get the chunk information */
if((dset->shared->layout.storage.u.chunk.ops->get_addr)(&idx_info, udata) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "can't query chunk address")
#ifdef H5_HAVE_PARALLEL
if(H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI))
H5F_set_coll_md_read(idx_info.f, temp_cmr);
#endif /* H5_HAVE_PARALLEL */
/*
* Cache the information retrieved.
*

View File

@ -800,6 +800,10 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
HDassert(type_info);
HDassert(fm);
/* Disable collective metadata reads for chunked dataset I/O operations
* in order to prevent potential hangs */
H5CX_set_coll_metadata_read(FALSE);
/* Check the optional property list for the collective chunk IO optimization option */
if(H5CX_get_mpio_chunk_opt_mode(&chunk_opt_mode) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get chunk optimization option")
@ -2313,7 +2317,7 @@ if(H5DEBUG(D))
/* Broadcasting the MPI_IO option info. and chunk address info. */
if(MPI_SUCCESS != (mpi_code = MPI_Bcast(total_chunk_addr_array, (int)(sizeof(haddr_t) * fm->layout->u.chunk.nchunks), MPI_BYTE, (int)0, io_info->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_BCast failed", mpi_code)
} /* end if */
/* Start at first node in chunk skip list */

View File

@ -611,7 +611,7 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void *key)
/* Sanity check for collectively calling H5Zunregister, if requested */
/* (Sanity check assumes that a barrier on one file's comm
* is sufficient (i.e. that there aren't different comms for
* different files). -QAK, 2018/02/14
* different files). -QAK, 2018/02/14)
*/
if(H5_coll_api_sanity_check_g && !object->sanity_checked) {
MPI_Comm mpi_comm; /* File's communicator */

View File

@ -7225,7 +7225,7 @@ smoke_check_6(int metadata_write_strategy)
/* some error occurred in the server -- report failure */
nerrors++;
if ( verbose ) {
HDfprintf(stdout, "%d:%s: server_main() failed.\n",
HDfprintf(stdout, "%d:%s: server_main() failed.\n",
world_mpi_rank, FUNC);
}
}
@ -7241,7 +7241,7 @@ smoke_check_6(int metadata_write_strategy)
fid = -1;
cache_ptr = NULL;
if ( verbose ) {
HDfprintf(stdout, "%d:%s: setup_cache_for_test() failed.\n",
HDfprintf(stdout, "%d:%s: setup_cache_for_test() failed.\n",
world_mpi_rank, FUNC);
}
}
@ -7250,7 +7250,7 @@ smoke_check_6(int metadata_write_strategy)
virt_num_data_entries = NUM_DATA_ENTRIES;
/* insert the first half collectively */
file_ptr->coll_md_read = H5P_USER_TRUE;
H5CX_set_coll_metadata_read(TRUE);
for ( i = 0; i < virt_num_data_entries/2; i++ )
{
struct datum * entry_ptr;
@ -7271,7 +7271,7 @@ smoke_check_6(int metadata_write_strategy)
}
/* insert the other half independently */
file_ptr->coll_md_read = H5P_USER_FALSE;
H5CX_set_coll_metadata_read(FALSE);
for ( i = virt_num_data_entries/2; i < virt_num_data_entries; i++ )
{
struct datum * entry_ptr;
@ -7291,7 +7291,7 @@ smoke_check_6(int metadata_write_strategy)
HDassert(cache_ptr->max_cache_size*0.8 > cache_ptr->coll_list_size);
}
/* flush the file */
if ( H5Fflush(fid, H5F_SCOPE_GLOBAL) < 0 ) {
nerrors++;
if ( verbose ) {
@ -7301,7 +7301,7 @@ smoke_check_6(int metadata_write_strategy)
}
/* Protect the first half of the entries collectively */
file_ptr->coll_md_read = H5P_USER_TRUE;
H5CX_set_coll_metadata_read(TRUE);
for ( i = 0; i < (virt_num_data_entries / 2); i++ )
{
struct datum * entry_ptr;
@ -7322,13 +7322,13 @@ smoke_check_6(int metadata_write_strategy)
}
/* protect the other half independently */
file_ptr->coll_md_read = H5P_USER_FALSE;
H5CX_set_coll_metadata_read(FALSE);
for ( i = virt_num_data_entries/2; i < virt_num_data_entries; i++ )
{
struct datum * entry_ptr;
entry_ptr = &(data[i]);
lock_entry(file_ptr, i);
if(FALSE != entry_ptr->header.coll_access) {
nerrors++;

View File

@ -32,6 +32,14 @@
#define PARTIAL_NO_SELECTION_Y_DIM_SCALE 5
#define PARTIAL_NO_SELECTION_X_DIM_SCALE 5
#define MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS 2
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_NO_SEL_PROCESS (mpi_rank == mpi_size - 1)
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE 20000
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE 1
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1
/*
* A test for issue HDFFV-10501. A parallel hang was reported which occurred
* in linked-chunk I/O when collective metadata reads are enabled and some ranks
@ -57,13 +65,13 @@ void test_partial_no_selection_coll_md_read(void)
hsize_t stride[PARTIAL_NO_SELECTION_DATASET_NDIMS];
hsize_t count[PARTIAL_NO_SELECTION_DATASET_NDIMS];
hsize_t block[PARTIAL_NO_SELECTION_DATASET_NDIMS];
hid_t file_id = -1;
hid_t fapl_id = -1;
hid_t dset_id = -1;
hid_t dcpl_id = -1;
hid_t dxpl_id = -1;
hid_t fspace_id = -1;
hid_t mspace_id = -1;
hid_t file_id = H5I_INVALID_HID;
hid_t fapl_id = H5I_INVALID_HID;
hid_t dset_id = H5I_INVALID_HID;
hid_t dcpl_id = H5I_INVALID_HID;
hid_t dxpl_id = H5I_INVALID_HID;
hid_t fspace_id = H5I_INVALID_HID;
hid_t mspace_id = H5I_INVALID_HID;
int mpi_rank, mpi_size;
void *data = NULL;
void *read_buf = NULL;
@ -86,7 +94,7 @@ void test_partial_no_selection_coll_md_read(void)
file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
VRFY((file_id >= 0), "H5Fcreate succeeded");
dataset_dims = malloc(PARTIAL_NO_SELECTION_DATASET_NDIMS * sizeof(*dataset_dims));
dataset_dims = HDmalloc(PARTIAL_NO_SELECTION_DATASET_NDIMS * sizeof(*dataset_dims));
VRFY((dataset_dims != NULL), "malloc succeeded");
dataset_dims[0] = PARTIAL_NO_SELECTION_Y_DIM_SCALE * mpi_size;
@ -129,7 +137,7 @@ void test_partial_no_selection_coll_md_read(void)
mspace_id = H5Screate_simple(1, sel_dims, NULL);
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
data = calloc(1, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int));
data = HDcalloc(1, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int));
VRFY((data != NULL), "calloc succeeded");
dxpl_id = H5Pcreate(H5P_DATASET_XFER);
@ -151,7 +159,7 @@ void test_partial_no_selection_coll_md_read(void)
*/
VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded");
read_buf = malloc(count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int));
read_buf = HDmalloc(count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int));
VRFY((read_buf != NULL), "malloc succeeded");
/*
@ -171,21 +179,321 @@ void test_partial_no_selection_coll_md_read(void)
* Check data integrity just to be sure.
*/
if (!PARTIAL_NO_SELECTION_NO_SEL_PROCESS) {
VRFY((!memcmp(data, read_buf, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int))), "memcmp succeeded");
VRFY((!HDmemcmp(data, read_buf, count[1] * (PARTIAL_NO_SELECTION_Y_DIM_SCALE * PARTIAL_NO_SELECTION_X_DIM_SCALE) * sizeof(int))), "memcmp succeeded");
}
if (dataset_dims) {
free(dataset_dims);
HDfree(dataset_dims);
dataset_dims = NULL;
}
if (data) {
free(data);
HDfree(data);
data = NULL;
}
if (read_buf) {
free(read_buf);
HDfree(read_buf);
read_buf = NULL;
}
VRFY((H5Sclose(fspace_id) >= 0), "H5Sclose succeeded");
VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");
VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded");
VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded");
VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded");
VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
}
/*
* A test for HDFFV-10562 which attempts to verify that using multi-chunk
* I/O with collective metadata reads enabled doesn't cause issues due to
* collective metadata reads being made only by process 0 in H5D__chunk_addrmap().
*
* Failure in this test may either cause a hang or, due to how the MPI calls
* pertaining to this issue can mistakenly match up, produce an MPI error
* message similar to:
*
* #008: H5Dmpio.c line 2546 in H5D__obtain_mpio_mode(): MPI_BCast failed
* major: Internal error (too specific to document in detail)
* minor: Some MPI function failed
* #009: H5Dmpio.c line 2546 in H5D__obtain_mpio_mode(): Message truncated, error stack:
*PMPI_Bcast(1600)..................: MPI_Bcast(buf=0x1df98e0, count=18, MPI_BYTE, root=0, comm=0x84000006) failed
*MPIR_Bcast_impl(1452).............:
*MPIR_Bcast(1476)..................:
*MPIR_Bcast_intra(1249)............:
*MPIR_SMP_Bcast(1088)..............:
*MPIR_Bcast_binomial(239)..........:
*MPIDI_CH3U_Receive_data_found(131): Message from rank 0 and tag 2 truncated; 2616 bytes received but buffer size is 18
* major: Internal error (too specific to document in detail)
* minor: MPI Error String
*
*/
void test_multi_chunk_io_addrmap_issue(void)
{
const char *filename;
hsize_t start[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS];
hsize_t stride[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS];
hsize_t count[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS];
hsize_t block[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS];
hsize_t dims[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS] = {10, 5};
hsize_t chunk_dims[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS] = {5, 5};
hsize_t max_dims[MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS] = {H5S_UNLIMITED, H5S_UNLIMITED};
hid_t file_id = H5I_INVALID_HID;
hid_t fapl_id = H5I_INVALID_HID;
hid_t dset_id = H5I_INVALID_HID;
hid_t dcpl_id = H5I_INVALID_HID;
hid_t dxpl_id = H5I_INVALID_HID;
hid_t space_id = H5I_INVALID_HID;
void *read_buf = NULL;
int mpi_rank;
int data[5][5] = { {0, 1, 2, 3, 4},
{0, 1, 2, 3, 4},
{0, 1, 2, 3, 4},
{0, 1, 2, 3, 4},
{0, 1, 2, 3, 4} };
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
filename = GetTestParameters();
fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type);
VRFY((fapl_id >= 0), "create_faccess_plist succeeded");
/*
* Even though the testphdf5 framework currently sets collective metadata reads
* on the FAPL, we call it here just to be sure this is future-proof, since
* demonstrating this issue relies upon it.
*/
VRFY((H5Pset_all_coll_metadata_ops(fapl_id, true) >= 0), "Set collective metadata reads succeeded");
file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
VRFY((file_id >= 0), "H5Fcreate succeeded");
space_id = H5Screate_simple(MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS, dims, max_dims);
VRFY((space_id >= 0), "H5Screate_simple succeeded");
dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
VRFY((H5Pset_chunk(dcpl_id, MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded");
dset_id = H5Dcreate2(file_id, "dset", H5T_NATIVE_INT, space_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
VRFY((dset_id >= 0), "H5Dcreate2 succeeded");
dxpl_id = H5Pcreate(H5P_DATASET_XFER);
VRFY((dxpl_id >= 0), "H5Pcreate succeeded");
VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded");
VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_MULTI_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded");
start[1] = 0;
stride[0] = stride[1] = 1;
count[0] = count[1] = 5;
block[0] = block[1] = 1;
if (mpi_rank == 0)
start[0] = 0;
else
start[0] = 5;
VRFY((H5Sselect_hyperslab(space_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded");
if (mpi_rank != 0)
VRFY((H5Sselect_none(space_id) >= 0), "H5Sselect_none succeeded");
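/* Only rank 0 keeps a selection for the write; the remaining ranks still
 * participate in the collective call with an empty selection (see the
 * issue description above). */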
VRFY((H5Dwrite(dset_id, H5T_NATIVE_INT, H5S_ALL, space_id, dxpl_id, data) >= 0), "H5Dwrite succeeded");
VRFY((H5Fflush(file_id, H5F_SCOPE_GLOBAL) >= 0), "H5Fflush succeeded");
read_buf = HDmalloc(50 * sizeof(int));
VRFY((read_buf != NULL), "malloc succeeded");
VRFY((H5Dread(dset_id, H5T_NATIVE_INT, H5S_ALL, H5S_ALL, dxpl_id, read_buf) >= 0), "H5Dread succeeded");
if (read_buf) {
HDfree(read_buf);
read_buf = NULL;
}
VRFY((H5Sclose(space_id) >= 0), "H5Sclose succeeded");
VRFY((H5Pclose(dcpl_id) >= 0), "H5Pclose succeeded");
VRFY((H5Pclose(dxpl_id) >= 0), "H5Pclose succeeded");
VRFY((H5Dclose(dset_id) >= 0), "H5Dclose succeeded");
VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
}
/*
* A test for HDFFV-10562 which attempts to verify that using linked-chunk
* I/O with collective metadata reads enabled doesn't cause issues due to
* collective metadata reads being made only by process 0 in H5D__sort_chunk().
*
* NOTE: Due to the way that the threshold value which pertains to this test
* is currently calculated within HDF5, there are several conditions that this
* test must maintain. Refer to the function H5D__sort_chunk in H5Dmpio.c for
* a better idea of why.
*
* Condition 1: We need to make sure that the test always selects every single
* chunk in the dataset. It is fine if the selection is split up among multiple
* ranks, but their combined selection must cover the whole dataset.
*
* Condition 2: The number of chunks in the dataset divided by the number of MPI
* ranks must exceed or equal 10000. In other words, each MPI rank must be
* responsible for 10000 or more unique chunks.
*
* Condition 3: This test will currently only be reliably reproducible for 2 or 3
* MPI ranks. The threshold value calculated reduces to a constant 100 / mpi_size,
* and is compared against a default value of 30%.
*
* Failure in this test may either cause a hang or, due to how the MPI calls
* pertaining to this issue can mistakenly match up, produce an MPI error
* message similar to:
*
* #008: H5Dmpio.c line 2338 in H5D__sort_chunk(): MPI_BCast failed
* major: Internal error (too specific to document in detail)
* minor: Some MPI function failed
* #009: H5Dmpio.c line 2338 in H5D__sort_chunk(): Other MPI error, error stack:
*PMPI_Bcast(1600)........: MPI_Bcast(buf=0x7eae610, count=320000, MPI_BYTE, root=0, comm=0x84000006) failed
*MPIR_Bcast_impl(1452)...:
*MPIR_Bcast(1476)........:
*MPIR_Bcast_intra(1249)..:
*MPIR_SMP_Bcast(1088)....:
*MPIR_Bcast_binomial(250): message sizes do not match across processes in the collective routine: Received 2096 but expected 320000
* major: Internal error (too specific to document in detail)
* minor: MPI Error String
*/
void test_link_chunk_io_sort_chunk_issue(void)
{
const char *filename;
hsize_t *dataset_dims = NULL;
hsize_t max_dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t sel_dims[1];
hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS] = { LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS };
hsize_t start[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t stride[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t count[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t block[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hid_t file_id = H5I_INVALID_HID;
hid_t fapl_id = H5I_INVALID_HID;
hid_t dset_id = H5I_INVALID_HID;
hid_t dcpl_id = H5I_INVALID_HID;
hid_t dxpl_id = H5I_INVALID_HID;
hid_t fspace_id = H5I_INVALID_HID;
hid_t mspace_id = H5I_INVALID_HID;
int mpi_rank, mpi_size;
void *data = NULL;
void *read_buf = NULL;
MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);
filename = GetTestParameters();
fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type);
VRFY((fapl_id >= 0), "create_faccess_plist succeeded");
/*
* Even though the testphdf5 framework currently sets collective metadata reads
* on the FAPL, we call it here just to be sure this is future-proof, since
* demonstrating this issue relies upon it.
*/
VRFY((H5Pset_all_coll_metadata_ops(fapl_id, true) >= 0), "Set collective metadata reads succeeded");
file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
VRFY((file_id >= 0), "H5Fcreate succeeded");
dataset_dims = HDmalloc(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS * sizeof(*dataset_dims));
VRFY((dataset_dims != NULL), "malloc succeeded");
dataset_dims[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE * mpi_size * LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE;
max_dataset_dims[0] = H5S_UNLIMITED;
fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, max_dataset_dims);
VRFY((fspace_id >= 0), "H5Screate_simple succeeded");
/*
* Set up chunking on the dataset in order to reproduce the problem.
*/
dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
VRFY((H5Pset_chunk(dcpl_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, chunk_dims) >= 0), "H5Pset_chunk succeeded");
dset_id = H5Dcreate2(file_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME, H5T_NATIVE_INT, fspace_id, H5P_DEFAULT, dcpl_id, H5P_DEFAULT);
VRFY((dset_id >= 0), "H5Dcreate2 succeeded");
/*
* Setup hyperslab selection to split the dataset among the ranks.
*
* The ranks will write rows across the dataset.
*/
stride[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
count[0] = (dataset_dims[0] / LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) / mpi_size;
start[0] = count[0] * mpi_rank;
block[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
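/* Worked example (illustrative, assuming mpi_size == 2): dataset_dims[0]
 * is 1 * 2 * 20000 = 40000 elements, i.e. 40000 chunks of size 1, so
 * count[0] = (40000 / 1) / 2 = 20000 chunks per rank, which satisfies the
 * ">= 10000 chunks per rank" condition noted above. */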
VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded");
sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
mspace_id = H5Screate_simple(1, sel_dims, NULL);
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
data = HDcalloc(1, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
VRFY((data != NULL), "calloc succeeded");
dxpl_id = H5Pcreate(H5P_DATASET_XFER);
VRFY((dxpl_id >= 0), "H5Pcreate succeeded");
/*
* Enable collective access for the data transfer.
*/
VRFY((H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE) >= 0), "H5Pset_dxpl_mpio succeeded");
VRFY((H5Dwrite(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, data) >= 0), "H5Dwrite succeeded");
VRFY((H5Fflush(file_id, H5F_SCOPE_GLOBAL) >= 0), "H5Fflush succeeded");
/*
* Ensure that linked-chunk I/O is performed since this is
* the particular code path where the issue lies and we don't
* want the library doing multi-chunk I/O behind our backs.
*/
VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0), "H5Pset_dxpl_mpio_chunk_opt succeeded");
read_buf = HDmalloc(count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
VRFY((read_buf != NULL), "malloc succeeded");
VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0), "H5Sselect_hyperslab succeeded");
sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");
mspace_id = H5Screate_simple(1, sel_dims, NULL);
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
read_buf = HDrealloc(read_buf, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
VRFY((read_buf != NULL), "realloc succeeded");
/*
* Finally have each rank read their section of data back from the dataset.
*/
VRFY((H5Dread(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, read_buf) >= 0), "H5Dread succeeded");
if (dataset_dims) {
HDfree(dataset_dims);
dataset_dims = NULL;
}
if (data) {
HDfree(data);
data = NULL;
}
if (read_buf) {
HDfree(read_buf);
read_buf = NULL;
}

View File

@ -549,7 +549,10 @@ int main(int argc, char **argv)
AddTest("noselcollmdread", test_partial_no_selection_coll_md_read, NULL,
"Collective Metadata read with some ranks having no selection", PARATESTFILE);
AddTest("MC coll MD read", test_multi_chunk_io_addrmap_issue, NULL,
"Collective MD read with multi chunk I/O (H5D__chunk_addrmap)", PARATESTFILE);
AddTest("LC coll MD read", test_link_chunk_io_sort_chunk_issue, NULL,
"Collective MD read with link chunk I/O (H5D__sort_chunk)", PARATESTFILE);
/* Display testing information */
TestInfo(argv[0]);

View File

@ -295,6 +295,8 @@ void compress_readAll(void);
#endif /* H5_HAVE_FILTER_DEFLATE */
void test_dense_attr(void);
void test_partial_no_selection_coll_md_read(void);
void test_multi_chunk_io_addrmap_issue(void);
void test_link_chunk_io_sort_chunk_issue(void);
/* commonly used prototypes */
hid_t create_faccess_plist(MPI_Comm comm, MPI_Info info, int l_facc_type);