Unify handling of collective metadata reads status (#1206)

jhendersonHDF 2022-01-29 14:22:30 -06:00 committed by GitHub
parent bcf9565535
commit 86ef00fd67
11 changed files with 357 additions and 136 deletions


@@ -1083,6 +1083,27 @@ Bug Fixes since HDF5-1.12.0 release
(DER - 2021/11/23, HDFFV-11286)
- Unified handling of collective metadata reads to correctly fix old bugs
Due to MPI-related issues caused by mismanagement of the collective
metadata reads status within HDF5, collective metadata reads were
forcibly disabled during chunked dataset raw data I/O in the HDF5
1.10.5 release. This generally did not affect application performance
because HDF5 already disables collective metadata reads during chunk
lookup, since it is unlikely that the same chunks will be read by all
MPI ranks in an I/O operation. However, this was only a partial
solution that was not granular enough.
This change unifies the handling of the file-global flag and the API
context-level flag for collective metadata reads so that their true
status can be queried from a single source. Collective metadata reads
are therefore once again enabled for chunked dataset raw data I/O,
and are instead disabled manually around the specific places where
processing occurs on MPI rank 0 only, since those code paths would
cause issues while collective metadata reads are enabled.
(JTH - 2021/11/16, HDFFV-10501/HDFFV-10562)
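As an editorial illustration (not part of this commit), the sketch below
shows the user-facing side of this behavior: a parallel application opts
in to collective metadata reads, and the library now decides internally,
per operation, whether a given metadata read is actually performed
collectively. The file name and the omission of error checking are
assumptions for brevity.

    #include <mpi.h>
    #include "hdf5.h"

    int main(int argc, char **argv)
    {
        hid_t fapl_id, file_id;

        MPI_Init(&argc, &argv);

        fapl_id = H5Pcreate(H5P_FILE_ACCESS);
        H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);

        /* Request collective metadata reads on this file; internally this
         * corresponds to the file-global flag value H5P_USER_TRUE */
        H5Pset_all_coll_metadata_ops(fapl_id, 1);

        file_id = H5Fcreate("example.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);

        /* ... chunked dataset I/O; the library disables collective
         * metadata reads internally only around rank-0-only work ... */

        H5Fclose(file_id);
        H5Pclose(fapl_id);
        MPI_Finalize();
        return 0;
    }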
- Fixed several potential MPI deadlocks in library failure conditions
In the parallel library, there were several places where MPI rank 0


@@ -1518,17 +1518,26 @@ H5C_insert_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *thing, u
#ifdef H5_HAVE_PARALLEL
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
coll_access = H5CX_get_coll_metadata_read();
coll_access = H5F_get_coll_metadata_reads(f);
entry_ptr->coll_access = coll_access;
if (coll_access) {
H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL)
/* Make sure the size of the collective entries in the cache remain in check */
if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
} /* end if */
if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) {
if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) {
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
} /* end if */
} /* end if */
else {
if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) {
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
} /* end if */
} /* end else */
} /* end if */
#endif
done:
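The two thresholds above reduce to one rule: entries on the collective list
may occupy at most 80% of the cache when the user explicitly enabled
collective metadata reads on the file (H5P_USER_TRUE), and at most 40%
otherwise. A hypothetical helper (the name is illustrative; this function is
not part of the commit) expressing that predicate:

    /* TRUE when the collective-entry list exceeds its allowed share of the
     * cache; `max * pct < size * 100` is the integer-arithmetic form of
     * `size > pct% of max`, avoiding floating point */
    static hbool_t
    H5C__coll_list_over_threshold(const H5C_t *cache_ptr, hbool_t user_enabled)
    {
        unsigned pct = user_enabled ? 80 : 40;

        return (cache_ptr->max_cache_size * pct < cache_ptr->coll_list_size * 100);
    }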
@@ -2248,7 +2257,7 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
#ifdef H5_HAVE_PARALLEL
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
coll_access = H5CX_get_coll_metadata_read();
coll_access = H5F_get_coll_metadata_reads(f);
#endif /* H5_HAVE_PARALLEL */
/* first check to see if the target is in cache */
@@ -2600,11 +2609,19 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
#ifdef H5_HAVE_PARALLEL
/* Make sure the size of the collective entries in the cache remain in check */
if (coll_access)
if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
#endif /* H5_HAVE_PARALLEL */
if (coll_access) {
if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) {
if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
} /* end if */
else {
if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100)
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
} /* end else */
} /* end if */
#endif /* H5_HAVE_PARALLEL */
done:
#if H5C_DO_EXTREME_SANITY_CHECKS


@@ -1423,7 +1423,7 @@ done:
* Purpose: Sanity checks and sets up collective operations.
*
* Note: Should be called for all API routines that modify file
* file metadata but don't pass in an access property list.
* metadata but don't pass in an access property list.
*
* Return: Non-negative on success / Negative on failure
*


@@ -3178,7 +3178,9 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
unsigned idx = 0; /* Index of chunk in cache, if present */
hbool_t found = FALSE; /* In cache? */
#ifdef H5_HAVE_PARALLEL
hbool_t reenable_coll_md_reads = FALSE;
H5P_coll_md_read_flag_t md_reads_file_flag;
hbool_t md_reads_context_flag;
hbool_t restore_md_reads_state = FALSE;
#endif
herr_t ret_value = SUCCEED; /* Return value */
@@ -3252,11 +3254,10 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
* processes.
*/
if (H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) {
hbool_t do_coll_md_reads = H5CX_get_coll_metadata_read();
if (do_coll_md_reads) {
H5CX_set_coll_metadata_read(FALSE);
reenable_coll_md_reads = TRUE;
}
md_reads_file_flag = H5P_FORCE_FALSE;
md_reads_context_flag = FALSE;
H5F_set_coll_metadata_reads(idx_info.f, &md_reads_file_flag, &md_reads_context_flag);
restore_md_reads_state = TRUE;
}
#endif /* H5_HAVE_PARALLEL */
@@ -3302,8 +3303,8 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
done:
#ifdef H5_HAVE_PARALLEL
/* Re-enable collective metadata reads if we disabled them */
if (reenable_coll_md_reads)
H5CX_set_coll_metadata_read(TRUE);
if (restore_md_reads_state)
H5F_set_coll_metadata_reads(dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
#endif /* H5_HAVE_PARALLEL */
FUNC_LEAVE_NOAPI(ret_value)


@@ -808,10 +808,6 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
HDassert(type_info);
HDassert(fm);
/* Disable collective metadata reads for chunked dataset I/O operations
* in order to prevent potential hangs */
H5CX_set_coll_metadata_read(FALSE);
/* Check the optional property list for the collective chunk IO optimization option */
if (H5CX_get_mpio_chunk_opt_mode(&chunk_opt_mode) < 0)
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get chunk optimization option")
@@ -2306,17 +2302,20 @@ static herr_t
H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
H5D_chunk_addr_info_t chunk_addr_info_array[], int sum_chunk)
{
H5SL_node_t * chunk_node; /* Current node in chunk skip list */
H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
haddr_t chunk_addr; /* Current chunking address of this node */
haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */
hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */
int bsearch_coll_chunk_threshold;
int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND;
int mpi_size; /* Number of MPI processes */
int mpi_code; /* MPI return code */
int i; /* Local index variable */
herr_t ret_value = SUCCEED; /* Return value */
H5SL_node_t * chunk_node; /* Current node in chunk skip list */
H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
haddr_t chunk_addr; /* Current chunking address of this node */
haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */
H5P_coll_md_read_flag_t md_reads_file_flag;
hbool_t md_reads_context_flag;
hbool_t restore_md_reads_state = FALSE;
hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */
int bsearch_coll_chunk_threshold;
int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND;
int mpi_size; /* Number of MPI processes */
int mpi_code; /* MPI return code */
int i; /* Local index variable */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_STATIC
@@ -2360,7 +2359,32 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
if (mpi_rank == 0) {
if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) {
herr_t result;
/*
* If enabled, disable collective metadata reads here.
* Since the chunk address mapping is done on rank 0
* only here, it will cause problems if collective
* metadata reads are enabled.
*/
if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) {
md_reads_file_flag = H5P_FORCE_FALSE;
md_reads_context_flag = FALSE;
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
&md_reads_context_flag);
restore_md_reads_state = TRUE;
}
result = H5D__chunk_addrmap(io_info, total_chunk_addr_array);
/* Ensure that we restore the old collective metadata reads state */
if (restore_md_reads_state) {
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
&md_reads_context_flag);
restore_md_reads_state = FALSE;
}
if (result < 0) {
size_t u;
/* Clear total chunk address array */
@@ -2424,6 +2448,10 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
} /* end if */
done:
/* Re-enable collective metadata reads if we disabled them */
if (restore_md_reads_state)
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
if (total_chunk_addr_array)
H5MM_xfree(total_chunk_addr_array);
@@ -2471,20 +2499,23 @@ static herr_t
H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assign_io_mode[],
haddr_t chunk_addr[])
{
size_t total_chunks;
unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk;
uint8_t * io_mode_info = NULL;
uint8_t * recv_io_mode_info = NULL;
uint8_t * mergebuf = NULL;
uint8_t * tempbuf;
H5SL_node_t * chunk_node;
H5D_chunk_info_t *chunk_info;
int mpi_size, mpi_rank;
MPI_Comm comm;
int root;
size_t ic;
int mpi_code;
herr_t ret_value = SUCCEED;
size_t total_chunks;
unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk;
uint8_t * io_mode_info = NULL;
uint8_t * recv_io_mode_info = NULL;
uint8_t * mergebuf = NULL;
uint8_t * tempbuf;
H5SL_node_t * chunk_node;
H5D_chunk_info_t * chunk_info;
H5P_coll_md_read_flag_t md_reads_file_flag;
hbool_t md_reads_context_flag;
hbool_t restore_md_reads_state = FALSE;
int mpi_size, mpi_rank;
MPI_Comm comm;
int root;
size_t ic;
int mpi_code;
herr_t ret_value = SUCCEED;
FUNC_ENTER_STATIC
@@ -2544,6 +2575,20 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig
size_t nproc;
unsigned *nproc_per_chunk;
/*
* If enabled, disable collective metadata reads here.
* Since the chunk address mapping is done on rank 0
* only here, it will cause problems if collective
* metadata reads are enabled.
*/
if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) {
md_reads_file_flag = H5P_FORCE_FALSE;
md_reads_context_flag = FALSE;
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
&md_reads_context_flag);
restore_md_reads_state = TRUE;
}
/* pre-computing: calculate number of processes and
regularity of the selection occupied in each chunk */
if (NULL == (nproc_per_chunk = (unsigned *)H5MM_calloc(total_chunks * sizeof(unsigned))))
@@ -2610,6 +2655,10 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig
#endif
done:
/* Re-enable collective metadata reads if we disabled them */
if (restore_md_reads_state)
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
if (io_mode_info)
H5MM_free(io_mode_info);
if (mergebuf)


@@ -31,11 +31,12 @@
/***********/
/* Headers */
/***********/
#include "H5private.h" /* Generic Functions */
#include "H5Eprivate.h" /* Error handling */
#include "H5Fpkg.h" /* File access */
#include "H5FDprivate.h" /* File drivers */
#include "H5Iprivate.h" /* IDs */
#include "H5private.h" /* Generic Functions */
#include "H5CXprivate.h" /* API Contexts */
#include "H5Eprivate.h" /* Error handling */
#include "H5Fpkg.h" /* File access */
#include "H5FDprivate.h" /* File drivers */
#include "H5Iprivate.h" /* IDs */
#include "H5VLnative_private.h" /* Native VOL connector */
@@ -402,4 +403,125 @@ H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm)
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_mpi_retrieve_comm */
/*-------------------------------------------------------------------------
* Function: H5F_get_coll_metadata_reads
*
* Purpose: Determines whether collective metadata reads should be
* performed. This routine is meant to be the single source of
* truth for the collective metadata reads status, as it
* coordinates between the file-global flag and the flag set
* for the current operation in the current API context.
*
* Return: TRUE/FALSE (can't fail)
*
*-------------------------------------------------------------------------
*/
hbool_t
H5F_get_coll_metadata_reads(const H5F_t *file)
{
H5P_coll_md_read_flag_t file_flag = H5P_USER_FALSE;
hbool_t ret_value = FALSE;
FUNC_ENTER_NOAPI_NOERR
HDassert(file && file->shared);
/* Retrieve the file-global flag */
file_flag = H5F_COLL_MD_READ(file);
/* If file flag is set to H5P_FORCE_FALSE, exit early
* with FALSE, since collective metadata reads have
* been explicitly disabled somewhere in the library.
*/
if (H5P_FORCE_FALSE == file_flag)
ret_value = FALSE;
else {
/* If file flag is set to H5P_USER_TRUE, ignore
* any settings in the API context. A file-global
* setting of H5P_USER_TRUE for collective metadata
* reads should ignore any settings on an Access
* Property List for an individual operation.
*/
if (H5P_USER_TRUE == file_flag)
ret_value = TRUE;
else {
/* Get the collective metadata reads flag from
* the current API context.
*/
ret_value = H5CX_get_coll_metadata_read();
}
}
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_get_coll_metadata_reads() */
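In table form, the precedence this routine implements (a summary of the code
above, not an addition to it):

    file-global flag    API-context flag    result
    ----------------    ----------------    ------
    H5P_FORCE_FALSE     (ignored)           FALSE
    H5P_USER_TRUE       (ignored)           TRUE
    H5P_USER_FALSE      TRUE                TRUE
    H5P_USER_FALSE      FALSE               FALSE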
/*-------------------------------------------------------------------------
* Function: H5F_set_coll_metadata_reads
*
* Purpose: Used to temporarily modify the collective metadata reads
* status. This is useful for cases where either:
*
* * Collective metadata reads are enabled, but need to be
* disabled for an operation about to occur that may trigger
* an independent metadata read (such as only rank 0 doing
* something)
*
* * Metadata reads are currently independent, but it is
* guaranteed that the application has maintained
* collectivity at the interface level (e.g., an operation
* that modifies metadata is being performed). In this case,
* it should be safe to enable collective metadata reads,
* barring any internal library issues that may occur
*
* After completion, the `file_flag` parameter will be set to
* the previous value of the file-global collective metadata
* reads flag. The `context_flag` parameter will be set to the
* previous value of the API context's collective metadata
* reads flag. Another call to this routine should be made to
* restore these values (see below warning).
*
* !! WARNING !!
* It is dangerous to modify the collective metadata reads
* status, as this can cause crashes, hangs and corruption in
* the HDF5 file when improperly done. Therefore, the
* `file_flag` and `context_flag` parameters are both
* mandatory, and it is assumed that the caller will guarantee
* these settings are restored with another call to this
* routine once the bracketed operation is complete.
* !! WARNING !!
*
* Return: Nothing
*
*-------------------------------------------------------------------------
*/
void
H5F_set_coll_metadata_reads(H5F_t *file, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag)
{
H5P_coll_md_read_flag_t prev_file_flag = H5P_USER_FALSE;
hbool_t prev_context_flag = FALSE;
FUNC_ENTER_NOAPI_NOERR
HDassert(file && file->shared);
HDassert(file_flag);
HDassert(context_flag);
/* Save old state */
prev_file_flag = H5F_COLL_MD_READ(file);
prev_context_flag = H5CX_get_coll_metadata_read();
/* Set new desired state */
if (prev_file_flag != *file_flag) {
file->shared->coll_md_read = *file_flag;
*file_flag = prev_file_flag;
}
if (prev_context_flag != *context_flag) {
H5CX_set_coll_metadata_read(*context_flag);
*context_flag = prev_context_flag;
}
FUNC_LEAVE_NOAPI_VOID
} /* end H5F_set_coll_metadata_reads() */
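A condensed sketch of the bracketing pattern this routine is meant for, as
used in H5D__chunk_lookup above (assumes a valid H5F_t *f; illustrative
only):

    H5P_coll_md_read_flag_t file_flag    = H5P_FORCE_FALSE; /* desired file-global state */
    hbool_t                 context_flag = FALSE;           /* desired API-context state */

    /* First call swaps in the desired state; the same variables now
     * hold the previous state */
    H5F_set_coll_metadata_reads(f, &file_flag, &context_flag);

    /* ... work that must not trigger collective metadata reads, e.g.
     * processing performed on MPI rank 0 only ... */

    /* Second call restores the saved state */
    H5F_set_coll_metadata_reads(f, &file_flag, &context_flag);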
#endif /* H5_HAVE_PARALLEL */


@@ -962,6 +962,8 @@ H5_DLL MPI_Comm H5F_mpi_get_comm(const H5F_t *f);
H5_DLL int H5F_shared_mpi_get_size(const H5F_shared_t *f_sh);
H5_DLL int H5F_mpi_get_size(const H5F_t *f);
H5_DLL herr_t H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm);
H5_DLL hbool_t H5F_get_coll_metadata_reads(const H5F_t *f);
H5_DLL void H5F_set_coll_metadata_reads(H5F_t *f, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag);
#endif /* H5_HAVE_PARALLEL */
/* External file cache routines */


@@ -5284,15 +5284,14 @@ H5P__decode_coll_md_read_flag_t(const void **_pp, void *_value)
* Function: H5Pset_all_coll_metadata_ops
*
* Purpose: Tell the library whether the metadata read operations will
* be done collectively (1) or not (0). Default is independent.
* With collective mode, the library will optimize access to
* metadata operations on the file.
* be done collectively (1) or not (0). Default is independent.
* With collective mode, the library will optimize access to
* metadata operations on the file.
*
* Note: This routine accepts file access property lists, link
* access property lists, attribute access property lists,
* dataset access property lists, group access property lists,
* named datatype access property lists,
* and dataset transfer property lists.
* access property lists, attribute access property lists,
* dataset access property lists, group access property lists
* and named datatype access property lists.
*
* Return: Non-negative on success/Negative on failure
*
@@ -5312,7 +5311,7 @@ H5Pset_all_coll_metadata_ops(hid_t plist_id, hbool_t is_collective)
H5TRACE2("e", "ib", plist_id, is_collective);
/* Compare the property list's class against the other class */
/* (Dataset, group, attribute, and named datype access property lists
/* (Dataset, group, attribute, and named datatype access property lists
* are sub-classes of link access property lists -QAK)
*/
if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS))
@@ -5342,10 +5341,9 @@ done:
* Purpose: Gets information about collective metadata read mode.
*
* Note: This routine accepts file access property lists, link
* access property lists, attribute access property lists,
* dataset access property lists, group access property lists,
* named datatype access property lists,
* and dataset transfer property lists.
* access property lists, attribute access property lists,
* dataset access property lists, group access property lists,
* and named datatype access property lists.
*
* Return: Non-negative on success/Negative on failure
*
@@ -5363,7 +5361,7 @@ H5Pget_all_coll_metadata_ops(hid_t plist_id, hbool_t *is_collective /*out*/)
H5TRACE2("e", "ix", plist_id, is_collective);
/* Compare the property list's class against the other class */
/* (Dataset, group, attribute, and named datype access property lists
/* (Dataset, group, attribute, and named datatype access property lists
* are sub-classes of link access property lists -QAK)
*/
if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS))
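A minimal call pattern for the pair of routines documented above (assumes
fapl_id is a valid file access property list; error checks omitted):

    hbool_t is_collective = 0;

    H5Pset_all_coll_metadata_ops(fapl_id, 1);              /* request collective metadata reads */
    H5Pget_all_coll_metadata_ops(fapl_id, &is_collective); /* is_collective is now 1 */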


@@ -594,14 +594,9 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA
/* Do a global flush if the file is opened for write */
if (H5F_ACC_RDWR & H5F_INTENT(f)) {
/* When parallel HDF5 is defined, check for collective metadata reads on this
* file and set the flag for metadata I/O in the API context. -QAK, 2018/02/14
*/
#ifdef H5_HAVE_PARALLEL
/* Check if MPIO driver is used */
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
H5P_coll_md_read_flag_t coll_md_read; /* Do all metadata reads collectively */
/* Sanity check for collectively calling H5Zunregister, if requested */
/* (Sanity check assumes that a barrier on one file's comm
* is sufficient (i.e. that there aren't different comms for
@@ -621,13 +616,8 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA
/* Set the "sanity checked" flag */
object->sanity_checked = TRUE;
} /* end if */
/* Check whether to use the collective metadata read DXPL */
coll_md_read = H5F_COLL_MD_READ(f);
if (H5P_USER_TRUE == coll_md_read)
H5CX_set_coll_metadata_read(TRUE);
} /* end if */
#endif /* H5_HAVE_PARALLEL */
} /* end if */
#endif /* H5_HAVE_PARALLEL */
/* Call the flush routine for mounted file hierarchies */
if (H5F_flush_mounts((H5F_t *)obj_ptr) < 0)


@@ -6623,13 +6623,15 @@ trace_file_check(int metadata_write_strategy)
static hbool_t
smoke_check_6(int metadata_write_strategy)
{
hbool_t success = TRUE;
int i;
int max_nerrors;
hid_t fid = -1;
H5F_t * file_ptr = NULL;
H5C_t * cache_ptr = NULL;
struct mssg_t mssg;
H5P_coll_md_read_flag_t md_reads_file_flag;
hbool_t md_reads_context_flag;
hbool_t success = TRUE;
int i;
int max_nerrors;
hid_t fid = -1;
H5F_t * file_ptr = NULL;
H5C_t * cache_ptr = NULL;
struct mssg_t mssg;
switch (metadata_write_strategy) {
@@ -6685,7 +6687,9 @@ smoke_check_6(int metadata_write_strategy)
virt_num_data_entries = NUM_DATA_ENTRIES;
/* insert the first half collectively */
H5CX_set_coll_metadata_read(TRUE);
md_reads_file_flag = H5P_USER_TRUE;
md_reads_context_flag = TRUE;
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = 0; i < virt_num_data_entries / 2; i++) {
struct datum *entry_ptr;
entry_ptr = &(data[i]);
@@ -6704,9 +6708,13 @@ smoke_check_6(int metadata_write_strategy)
H5_CHECK_OVERFLOW(cache_ptr->max_cache_size, size_t, double);
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
}
/* Restore collective metadata reads state */
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
/* insert the other half independently */
H5CX_set_coll_metadata_read(FALSE);
md_reads_file_flag = H5P_USER_FALSE;
md_reads_context_flag = FALSE;
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) {
struct datum *entry_ptr;
entry_ptr = &(data[i]);
@@ -6724,6 +6732,8 @@ smoke_check_6(int metadata_write_strategy)
/* Make sure coll entries do not cross the 80% threshold */
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
}
/* Restore collective metadata reads state */
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
/* flush the file */
if (H5Fflush(fid, H5F_SCOPE_GLOBAL) < 0) {
@@ -6734,7 +6744,9 @@ smoke_check_6(int metadata_write_strategy)
}
/* Protect the first half of the entries collectively */
H5CX_set_coll_metadata_read(TRUE);
md_reads_file_flag = H5P_USER_TRUE;
md_reads_context_flag = TRUE;
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = 0; i < (virt_num_data_entries / 2); i++) {
struct datum *entry_ptr;
entry_ptr = &(data[i]);
@@ -6752,9 +6764,13 @@ smoke_check_6(int metadata_write_strategy)
/* Make sure coll entries do not cross the 80% threshold */
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
}
/* Restore collective metadata reads state */
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
/* protect the other half independently */
H5CX_set_coll_metadata_read(FALSE);
md_reads_file_flag = H5P_USER_FALSE;
md_reads_context_flag = FALSE;
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) {
struct datum *entry_ptr;
entry_ptr = &(data[i]);
@@ -6772,6 +6788,8 @@ smoke_check_6(int metadata_write_strategy)
/* Make sure coll entries do not cross the 80% threshold */
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
}
/* Restore collective metadata reads state */
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
for (i = 0; i < (virt_num_data_entries); i++) {
unlock_entry(file_ptr, i, H5AC__NO_FLAGS_SET);


@@ -34,10 +34,9 @@
#define MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS 2
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE 20000
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE 1
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM 10000
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1
/*
* A test for issue HDFFV-10501. A parallel hang was reported which occurred
@@ -339,21 +338,34 @@ test_multi_chunk_io_addrmap_issue(void)
* collective metadata reads being made only by process 0 in H5D__sort_chunk().
*
* NOTE: Due to the way that the threshold value which pertains to this test
* is currently calculated within HDF5, there are several conditions that this
* test must maintain. Refer to the function H5D__sort_chunk in H5Dmpio.c for
* a better idea of why.
* is currently calculated within HDF5, the following two conditions must be
* true to trigger the issue:
*
* Condition 1: We need to make sure that the test always selects every single
* chunk in the dataset. It is fine if the selection is split up among multiple
* ranks, but their combined selection must cover the whole dataset.
* Condition 1: A certain threshold ratio must be met in order to have HDF5
* obtain all chunk addresses collectively inside H5D__sort_chunk(). This is
* given by the following:
*
* Condition 2: The number of chunks in the dataset divided by the number of MPI
* ranks must exceed or equal 10000. In other words, each MPI rank must be
* responsible for 10000 or more unique chunks.
* (sum_chunk * 100) / (dataset_nchunks * mpi_size) >= 30%
*
* Condition 3: This test will currently only be reliably reproducible for 2 or 3
* MPI ranks. The threshold value calculated reduces to a constant 100 / mpi_size,
* and is compared against a default value of 30%.
* where:
* * `sum_chunk` is the combined sum of the number of chunks selected in
* the dataset by all ranks (chunks selected by more than one rank count
* individually toward the sum for each rank selecting that chunk)
* * `dataset_nchunks` is the number of chunks in the dataset (selected
* or not)
* * `mpi_size` is the size of the MPI Communicator
*
* Condition 2: `sum_chunk` divided by `mpi_size` must exceed or equal a certain
* threshold (as of this writing, 10000).
*
* To satisfy both these conditions, we #define a macro,
* LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM, which corresponds to the
* value of the H5D_ALL_CHUNK_ADDR_THRES_COL_NUM macro in H5Dmpio.c (the
* 10000 threshold from condition 2). We then create a dataset of that many
* chunks and have each MPI rank write to and read from a piece of every single
* chunk in the dataset. This ensures chunk utilization is the max possible
* and exceeds our 30% target ratio, while always exactly matching the numeric
* chunk threshold value of condition 2.
*
* Failure in this test may either cause a hang, or, due to how the MPI calls
* pertaining to this issue might mistakenly match up, may cause an MPI error
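* For concreteness, a worked instance of the two conditions (values
* assumed for illustration): with mpi_size = 4 and dataset_nchunks =
* 10000, where every rank selects a piece of every single chunk,
* sum_chunk = 4 * 10000 = 40000. Condition 1 gives
* (40000 * 100) / (10000 * 4) = 100% >= 30%, and condition 2 gives
* sum_chunk / mpi_size = 10000, exactly meeting the threshold.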
@@ -375,10 +387,9 @@ void
test_link_chunk_io_sort_chunk_issue(void)
{
const char *filename;
hsize_t * dataset_dims = NULL;
hsize_t max_dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t sel_dims[1];
hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS] = {LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS};
hsize_t dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t sel_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t start[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t stride[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
hsize_t count[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
@@ -412,14 +423,13 @@
file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
VRFY((file_id >= 0), "H5Fcreate succeeded");
dataset_dims = HDmalloc(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS * sizeof(*dataset_dims));
VRFY((dataset_dims != NULL), "malloc succeeded");
/*
* Create a one-dimensional dataset of exactly LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM
* chunks, where every rank writes to a piece of every single chunk to keep utilization high.
*/
dataset_dims[0] = (hsize_t)mpi_size * (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM;
dataset_dims[0] = (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE * (hsize_t)mpi_size *
(hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE;
max_dataset_dims[0] = H5S_UNLIMITED;
fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, max_dataset_dims);
fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, NULL);
VRFY((fspace_id >= 0), "H5Screate_simple succeeded");
/*
@@ -428,6 +438,9 @@ test_link_chunk_io_sort_chunk_issue(void)
dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
/* Chunk size is equal to MPI size since each rank writes to a piece of every chunk */
chunk_dims[0] = (hsize_t)mpi_size;
VRFY((H5Pset_chunk(dcpl_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, chunk_dims) >= 0),
"H5Pset_chunk succeeded");
@@ -437,23 +450,21 @@
/*
* Setup hyperslab selection to split the dataset among the ranks.
*
* The ranks will write rows across the dataset.
*/
stride[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
count[0] = (dataset_dims[0] / LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) / (hsize_t)mpi_size;
start[0] = count[0] * (hsize_t)mpi_rank;
block[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
start[0] = (hsize_t)mpi_rank;
stride[0] = (hsize_t)mpi_size;
count[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM;
block[0] = 1;
VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0),
"H5Sselect_hyperslab succeeded");
sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
sel_dims[0] = count[0];
mspace_id = H5Screate_simple(1, sel_dims, NULL);
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
data = HDcalloc(1, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
data = HDcalloc(1, count[0] * sizeof(int));
VRFY((data != NULL), "calloc succeeded");
dxpl_id = H5Pcreate(H5P_DATASET_XFER);
@@ -476,33 +487,25 @@ test_link_chunk_io_sort_chunk_issue(void)
VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0),
"H5Pset_dxpl_mpio_chunk_opt succeeded");
read_buf = HDmalloc(count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
read_buf = HDmalloc(count[0] * sizeof(int));
VRFY((read_buf != NULL), "malloc succeeded");
VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0),
"H5Sselect_hyperslab succeeded");
sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
sel_dims[0] = count[0];
VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");
mspace_id = H5Screate_simple(1, sel_dims, NULL);
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
read_buf = HDrealloc(read_buf, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
VRFY((read_buf != NULL), "realloc succeeded");
/*
* Finally have each rank read their section of data back from the dataset.
*/
VRFY((H5Dread(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, read_buf) >= 0),
"H5Dread succeeded");
if (dataset_dims) {
HDfree(dataset_dims);
dataset_dims = NULL;
}
if (data) {
HDfree(data);
data = NULL;