mirror of
https://github.com/HDFGroup/hdf5.git
synced 2025-02-17 16:10:24 +08:00
Unify handling of collective metadata reads status (#1206)
This commit is contained in:
parent
bcf9565535
commit
86ef00fd67
@ -1083,6 +1083,27 @@ Bug Fixes since HDF5-1.12.0 release
|
||||
|
||||
(DER - 2021/11/23, HDFFV-11286)
|
||||
|
||||
- Unified handling of collective metadata reads to correctly fix old bugs
|
||||
|
||||
Due to MPI-related issues occurring in HDF5 from mismanagement of the
|
||||
status of collective metadata reads, they were forced to be disabled
|
||||
during chunked dataset raw data I/O in the HDF5 1.10.5 release. This
|
||||
wouldn't generally have affected application performance because HDF5
|
||||
already disables collective metadata reads during chunk lookup, since
|
||||
it is generally unlikely that the same chunks will be read by all MPI
|
||||
ranks in the I/O operation. However, this was only a partial solution
|
||||
that wasn't granular enough.
|
||||
|
||||
This change now unifies the handling of the file-global flag and the
|
||||
API context-level flag for collective metadata reads in order to
|
||||
simplify querying of the true status of collective metadata reads. Thus,
|
||||
collective metadata reads are once again enabled for chunked dataset
|
||||
raw data I/O, but manually controlled at places where some processing
|
||||
occurs on MPI rank 0 only and would cause issues when collective
|
||||
metadata reads are enabled.
|
||||
|
||||
(JTH - 2021/11/16, HDFFV-10501/HDFFV-10562)
|
||||
|
||||
- Fixed several potential MPI deadlocks in library failure conditions
|
||||
|
||||
In the parallel library, there were several places where MPI rank 0
|
||||
|
39
src/H5C.c
39
src/H5C.c
@ -1518,17 +1518,26 @@ H5C_insert_entry(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *thing, u
|
||||
|
||||
#ifdef H5_HAVE_PARALLEL
|
||||
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
|
||||
coll_access = H5CX_get_coll_metadata_read();
|
||||
coll_access = H5F_get_coll_metadata_reads(f);
|
||||
|
||||
entry_ptr->coll_access = coll_access;
|
||||
if (coll_access) {
|
||||
H5C__INSERT_IN_COLL_LIST(cache_ptr, entry_ptr, FAIL)
|
||||
|
||||
/* Make sure the size of the collective entries in the cache remains in check */
|
||||
if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
|
||||
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
|
||||
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
|
||||
} /* end if */
|
||||
if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) {
|
||||
if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100) {
|
||||
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
|
||||
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
|
||||
} /* end if */
|
||||
} /* end if */
|
||||
else {
|
||||
if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100) {
|
||||
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
|
||||
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, "can't clear collective metadata entries")
|
||||
} /* end if */
|
||||
} /* end else */
|
||||
} /* end if */
|
||||
#endif
|
||||
|
||||
done:
|
||||
@ -2248,7 +2257,7 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
|
||||
|
||||
#ifdef H5_HAVE_PARALLEL
|
||||
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI))
|
||||
coll_access = H5CX_get_coll_metadata_read();
|
||||
coll_access = H5F_get_coll_metadata_reads(f);
|
||||
#endif /* H5_HAVE_PARALLEL */
|
||||
|
||||
/* first check to see if the target is in cache */
|
||||
@ -2600,11 +2609,19 @@ H5C_protect(H5F_t *f, const H5C_class_t *type, haddr_t addr, void *udata, unsign
|
||||
|
||||
#ifdef H5_HAVE_PARALLEL
|
||||
/* Make sure the size of the collective entries in the cache remains in check */
|
||||
if (coll_access)
|
||||
if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
|
||||
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
|
||||
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
|
||||
#endif /* H5_HAVE_PARALLEL */
|
||||
if (coll_access) {
|
||||
if (H5P_USER_TRUE == H5F_COLL_MD_READ(f)) {
|
||||
if (cache_ptr->max_cache_size * 80 < cache_ptr->coll_list_size * 100)
|
||||
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
|
||||
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
|
||||
} /* end if */
|
||||
else {
|
||||
if (cache_ptr->max_cache_size * 40 < cache_ptr->coll_list_size * 100)
|
||||
if (H5C_clear_coll_entries(cache_ptr, TRUE) < 0)
|
||||
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, NULL, "can't clear collective metadata entries")
|
||||
} /* end else */
|
||||
} /* end if */
|
||||
#endif /* H5_HAVE_PARALLEL */
|
||||
|
||||
done:
|
||||
#if H5C_DO_EXTREME_SANITY_CHECKS
|
||||
|
@ -1423,7 +1423,7 @@ done:
|
||||
* Purpose: Sanity checks and sets up collective operations.
|
||||
*
|
||||
* Note: Should be called for all API routines that modify file
|
||||
* file metadata but don't pass in an access property list.
|
||||
* metadata but don't pass in an access property list.
|
||||
*
|
||||
* Return: Non-negative on success / Negative on failure
|
||||
*
|
||||
|
@ -3178,7 +3178,9 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
|
||||
unsigned idx = 0; /* Index of chunk in cache, if present */
|
||||
hbool_t found = FALSE; /* In cache? */
|
||||
#ifdef H5_HAVE_PARALLEL
|
||||
hbool_t reenable_coll_md_reads = FALSE;
|
||||
H5P_coll_md_read_flag_t md_reads_file_flag;
|
||||
hbool_t md_reads_context_flag;
|
||||
hbool_t restore_md_reads_state = FALSE;
|
||||
#endif
|
||||
herr_t ret_value = SUCCEED; /* Return value */
|
||||
|
||||
@ -3252,11 +3254,10 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
|
||||
* processes.
|
||||
*/
|
||||
if (H5F_HAS_FEATURE(idx_info.f, H5FD_FEAT_HAS_MPI)) {
|
||||
hbool_t do_coll_md_reads = H5CX_get_coll_metadata_read();
|
||||
if (do_coll_md_reads) {
|
||||
H5CX_set_coll_metadata_read(FALSE);
|
||||
reenable_coll_md_reads = TRUE;
|
||||
}
|
||||
md_reads_file_flag = H5P_FORCE_FALSE;
|
||||
md_reads_context_flag = FALSE;
|
||||
H5F_set_coll_metadata_reads(idx_info.f, &md_reads_file_flag, &md_reads_context_flag);
|
||||
restore_md_reads_state = TRUE;
|
||||
}
|
||||
#endif /* H5_HAVE_PARALLEL */
|
||||
|
||||
@ -3302,8 +3303,8 @@ H5D__chunk_lookup(const H5D_t *dset, const hsize_t *scaled, H5D_chunk_ud_t *udat
|
||||
done:
|
||||
#ifdef H5_HAVE_PARALLEL
|
||||
/* Re-enable collective metadata reads if we disabled them */
|
||||
if (reenable_coll_md_reads)
|
||||
H5CX_set_coll_metadata_read(TRUE);
|
||||
if (restore_md_reads_state)
|
||||
H5F_set_coll_metadata_reads(dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
|
||||
#endif /* H5_HAVE_PARALLEL */
|
||||
|
||||
FUNC_LEAVE_NOAPI(ret_value)
|
||||
|
109
src/H5Dmpio.c
109
src/H5Dmpio.c
@ -808,10 +808,6 @@ H5D__chunk_collective_io(H5D_io_info_t *io_info, const H5D_type_info_t *type_inf
|
||||
HDassert(type_info);
|
||||
HDassert(fm);
|
||||
|
||||
/* Disable collective metadata reads for chunked dataset I/O operations
|
||||
* in order to prevent potential hangs */
|
||||
H5CX_set_coll_metadata_read(FALSE);
|
||||
|
||||
/* Check the optional property list for the collective chunk IO optimization option */
|
||||
if (H5CX_get_mpio_chunk_opt_mode(&chunk_opt_mode) < 0)
|
||||
HGOTO_ERROR(H5E_DATASET, H5E_CANTGET, FAIL, "couldn't get chunk optimization option")
|
||||
@ -2306,17 +2302,20 @@ static herr_t
|
||||
H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
|
||||
H5D_chunk_addr_info_t chunk_addr_info_array[], int sum_chunk)
|
||||
{
|
||||
H5SL_node_t * chunk_node; /* Current node in chunk skip list */
|
||||
H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
|
||||
haddr_t chunk_addr; /* Current chunking address of this node */
|
||||
haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */
|
||||
hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */
|
||||
int bsearch_coll_chunk_threshold;
|
||||
int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND;
|
||||
int mpi_size; /* Number of MPI processes */
|
||||
int mpi_code; /* MPI return code */
|
||||
int i; /* Local index variable */
|
||||
herr_t ret_value = SUCCEED; /* Return value */
|
||||
H5SL_node_t * chunk_node; /* Current node in chunk skip list */
|
||||
H5D_chunk_info_t *chunk_info; /* Current chunking info. of this node. */
|
||||
haddr_t chunk_addr; /* Current chunking address of this node */
|
||||
haddr_t *total_chunk_addr_array = NULL; /* The array of chunk address for the total number of chunk */
|
||||
H5P_coll_md_read_flag_t md_reads_file_flag;
|
||||
hbool_t md_reads_context_flag;
|
||||
hbool_t restore_md_reads_state = FALSE;
|
||||
hbool_t do_sort = FALSE; /* Whether the addresses need to be sorted */
|
||||
int bsearch_coll_chunk_threshold;
|
||||
int many_chunk_opt = H5D_OBTAIN_ONE_CHUNK_ADDR_IND;
|
||||
int mpi_size; /* Number of MPI processes */
|
||||
int mpi_code; /* MPI return code */
|
||||
int i; /* Local index variable */
|
||||
herr_t ret_value = SUCCEED; /* Return value */
|
||||
|
||||
FUNC_ENTER_STATIC
|
||||
|
||||
@ -2360,7 +2359,32 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
|
||||
HGOTO_ERROR(H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank")
|
||||
|
||||
if (mpi_rank == 0) {
|
||||
if (H5D__chunk_addrmap(io_info, total_chunk_addr_array) < 0) {
|
||||
herr_t result;
|
||||
|
||||
/*
|
||||
* If enabled, disable collective metadata reads here.
|
||||
* Since the chunk address mapping is done on rank 0
|
||||
* only here, it will cause problems if collective
|
||||
* metadata reads are enabled.
|
||||
*/
|
||||
if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) {
|
||||
md_reads_file_flag = H5P_FORCE_FALSE;
|
||||
md_reads_context_flag = FALSE;
|
||||
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
|
||||
&md_reads_context_flag);
|
||||
restore_md_reads_state = TRUE;
|
||||
}
|
||||
|
||||
result = H5D__chunk_addrmap(io_info, total_chunk_addr_array);
|
||||
|
||||
/* Ensure that we restore the old collective metadata reads state */
|
||||
if (restore_md_reads_state) {
|
||||
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
|
||||
&md_reads_context_flag);
|
||||
restore_md_reads_state = FALSE;
|
||||
}
|
||||
|
||||
if (result < 0) {
|
||||
size_t u;
|
||||
|
||||
/* Clear total chunk address array */
|
||||
@ -2424,6 +2448,10 @@ H5D__sort_chunk(H5D_io_info_t *io_info, const H5D_chunk_map_t *fm,
|
||||
} /* end if */
|
||||
|
||||
done:
|
||||
/* Re-enable collective metadata reads if we disabled them */
|
||||
if (restore_md_reads_state)
|
||||
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
|
||||
|
||||
if (total_chunk_addr_array)
|
||||
H5MM_xfree(total_chunk_addr_array);
|
||||
|
||||
@ -2471,20 +2499,23 @@ static herr_t
|
||||
H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assign_io_mode[],
|
||||
haddr_t chunk_addr[])
|
||||
{
|
||||
size_t total_chunks;
|
||||
unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk;
|
||||
uint8_t * io_mode_info = NULL;
|
||||
uint8_t * recv_io_mode_info = NULL;
|
||||
uint8_t * mergebuf = NULL;
|
||||
uint8_t * tempbuf;
|
||||
H5SL_node_t * chunk_node;
|
||||
H5D_chunk_info_t *chunk_info;
|
||||
int mpi_size, mpi_rank;
|
||||
MPI_Comm comm;
|
||||
int root;
|
||||
size_t ic;
|
||||
int mpi_code;
|
||||
herr_t ret_value = SUCCEED;
|
||||
size_t total_chunks;
|
||||
unsigned percent_nproc_per_chunk, threshold_nproc_per_chunk;
|
||||
uint8_t * io_mode_info = NULL;
|
||||
uint8_t * recv_io_mode_info = NULL;
|
||||
uint8_t * mergebuf = NULL;
|
||||
uint8_t * tempbuf;
|
||||
H5SL_node_t * chunk_node;
|
||||
H5D_chunk_info_t * chunk_info;
|
||||
H5P_coll_md_read_flag_t md_reads_file_flag;
|
||||
hbool_t md_reads_context_flag;
|
||||
hbool_t restore_md_reads_state = FALSE;
|
||||
int mpi_size, mpi_rank;
|
||||
MPI_Comm comm;
|
||||
int root;
|
||||
size_t ic;
|
||||
int mpi_code;
|
||||
herr_t ret_value = SUCCEED;
|
||||
|
||||
FUNC_ENTER_STATIC
|
||||
|
||||
@ -2544,6 +2575,20 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig
|
||||
size_t nproc;
|
||||
unsigned *nproc_per_chunk;
|
||||
|
||||
/*
|
||||
* If enabled, disable collective metadata reads here.
|
||||
* Since the chunk address mapping is done on rank 0
|
||||
* only here, it will cause problems if collective
|
||||
* metadata reads are enabled.
|
||||
*/
|
||||
if (H5F_get_coll_metadata_reads(io_info->dset->oloc.file)) {
|
||||
md_reads_file_flag = H5P_FORCE_FALSE;
|
||||
md_reads_context_flag = FALSE;
|
||||
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag,
|
||||
&md_reads_context_flag);
|
||||
restore_md_reads_state = TRUE;
|
||||
}
|
||||
|
||||
/* pre-computing: calculate number of processes and
|
||||
regularity of the selection occupied in each chunk */
|
||||
if (NULL == (nproc_per_chunk = (unsigned *)H5MM_calloc(total_chunks * sizeof(unsigned))))
|
||||
@ -2610,6 +2655,10 @@ H5D__obtain_mpio_mode(H5D_io_info_t *io_info, H5D_chunk_map_t *fm, uint8_t assig
|
||||
#endif
|
||||
|
||||
done:
|
||||
/* Re-enable collective metadata reads if we disabled them */
|
||||
if (restore_md_reads_state)
|
||||
H5F_set_coll_metadata_reads(io_info->dset->oloc.file, &md_reads_file_flag, &md_reads_context_flag);
|
||||
|
||||
if (io_mode_info)
|
||||
H5MM_free(io_mode_info);
|
||||
if (mergebuf)
|
||||
|
132
src/H5Fmpi.c
132
src/H5Fmpi.c
@ -31,11 +31,12 @@
|
||||
/***********/
|
||||
/* Headers */
|
||||
/***********/
|
||||
#include "H5private.h" /* Generic Functions */
|
||||
#include "H5Eprivate.h" /* Error handling */
|
||||
#include "H5Fpkg.h" /* File access */
|
||||
#include "H5FDprivate.h" /* File drivers */
|
||||
#include "H5Iprivate.h" /* IDs */
|
||||
#include "H5private.h" /* Generic Functions */
|
||||
#include "H5CXprivate.h" /* API Contexts */
|
||||
#include "H5Eprivate.h" /* Error handling */
|
||||
#include "H5Fpkg.h" /* File access */
|
||||
#include "H5FDprivate.h" /* File drivers */
|
||||
#include "H5Iprivate.h" /* IDs */
|
||||
|
||||
#include "H5VLnative_private.h" /* Native VOL connector */
|
||||
|
||||
@ -402,4 +403,125 @@ H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm)
|
||||
done:
|
||||
FUNC_LEAVE_NOAPI(ret_value)
|
||||
} /* end H5F_mpi_retrieve_comm */
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
* Function: H5F_get_coll_metadata_reads
|
||||
*
|
||||
* Purpose: Determines whether collective metadata reads should be
|
||||
* performed. This routine is meant to be the single source of
|
||||
* truth for the collective metadata reads status, as it
|
||||
* coordinates between the file-global flag and the flag set
|
||||
* for the current operation in the current API context.
|
||||
*
|
||||
* Return: TRUE/FALSE (can't fail)
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
hbool_t
|
||||
H5F_get_coll_metadata_reads(const H5F_t *file)
|
||||
{
|
||||
H5P_coll_md_read_flag_t file_flag = H5P_USER_FALSE;
|
||||
hbool_t ret_value = FALSE;
|
||||
|
||||
FUNC_ENTER_NOAPI_NOERR
|
||||
|
||||
HDassert(file && file->shared);
|
||||
|
||||
/* Retrieve the file-global flag */
|
||||
file_flag = H5F_COLL_MD_READ(file);
|
||||
|
||||
/* If file flag is set to H5P_FORCE_FALSE, exit early
|
||||
* with FALSE, since collective metadata reads have
|
||||
* been explicitly disabled somewhere in the library.
|
||||
*/
|
||||
if (H5P_FORCE_FALSE == file_flag)
|
||||
ret_value = FALSE;
|
||||
else {
|
||||
/* If file flag is set to H5P_USER_TRUE, ignore
|
||||
* any settings in the API context. A file-global
|
||||
* setting of H5P_USER_TRUE for collective metadata
|
||||
* reads should ignore any settings on an Access
|
||||
* Property List for an individual operation.
|
||||
*/
|
||||
if (H5P_USER_TRUE == file_flag)
|
||||
ret_value = TRUE;
|
||||
else {
|
||||
/* Get the collective metadata reads flag from
|
||||
* the current API context.
|
||||
*/
|
||||
ret_value = H5CX_get_coll_metadata_read();
|
||||
}
|
||||
}
|
||||
|
||||
FUNC_LEAVE_NOAPI(ret_value)
|
||||
} /* end H5F_get_coll_metadata_reads() */
|
||||
|
||||
/*-------------------------------------------------------------------------
|
||||
* Function: H5F_set_coll_metadata_reads
|
||||
*
|
||||
* Purpose: Used to temporarily modify the collective metadata reads
|
||||
* status. This is useful for cases where either:
|
||||
*
|
||||
* * Collective metadata reads are enabled, but need to be
|
||||
* disabled for an operation about to occur that may trigger
|
||||
* an independent metadata read (such as only rank 0 doing
|
||||
* something)
|
||||
*
|
||||
* * Metadata reads are currently independent, but it is
|
||||
* guaranteed that the application has maintained
|
||||
* collectivity at the interface level (e.g., an operation
|
||||
* that modifies metadata is being performed). In this case,
|
||||
* it should be safe to enable collective metadata reads,
|
||||
* barring any internal library issues that may occur
|
||||
*
|
||||
* After completion, the `file_flag` parameter will be set to
|
||||
* the previous value of the file-global collective metadata
|
||||
* reads flag. The `context_flag` parameter will be set to the
|
||||
* previous value of the API context's collective metadata
|
||||
* reads flag. Another call to this routine should be made to
|
||||
* restore these values (see below warning).
|
||||
*
|
||||
* !! WARNING !!
|
||||
* It is dangerous to modify the collective metadata reads
|
||||
* status, as this can cause crashes, hangs and corruption in
|
||||
* the HDF5 file when improperly done. Therefore, the
|
||||
* `file_flag` and `context_flag` parameters are both
|
||||
* mandatory, and it is assumed that the caller will guarantee
|
||||
* these settings are restored with another call to this
|
||||
* routine once the bracketed operation is complete.
|
||||
* !! WARNING !!
|
||||
*
|
||||
* Return: Nothing
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
void
|
||||
H5F_set_coll_metadata_reads(H5F_t *file, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag)
|
||||
{
|
||||
H5P_coll_md_read_flag_t prev_file_flag = H5P_USER_FALSE;
|
||||
hbool_t prev_context_flag = FALSE;
|
||||
|
||||
FUNC_ENTER_NOAPI_NOERR
|
||||
|
||||
HDassert(file && file->shared);
|
||||
HDassert(file_flag);
|
||||
HDassert(context_flag);
|
||||
|
||||
/* Save old state */
|
||||
prev_file_flag = H5F_COLL_MD_READ(file);
|
||||
prev_context_flag = H5CX_get_coll_metadata_read();
|
||||
|
||||
/* Set new desired state */
|
||||
if (prev_file_flag != *file_flag) {
|
||||
file->shared->coll_md_read = *file_flag;
|
||||
*file_flag = prev_file_flag;
|
||||
}
|
||||
if (prev_context_flag != *context_flag) {
|
||||
H5CX_set_coll_metadata_read(*context_flag);
|
||||
*context_flag = prev_context_flag;
|
||||
}
|
||||
|
||||
FUNC_LEAVE_NOAPI_VOID
|
||||
} /* end H5F_set_coll_metadata_reads() */
|
||||
|
||||
#endif /* H5_HAVE_PARALLEL */
|
||||
|
@ -962,6 +962,8 @@ H5_DLL MPI_Comm H5F_mpi_get_comm(const H5F_t *f);
|
||||
H5_DLL int H5F_shared_mpi_get_size(const H5F_shared_t *f_sh);
|
||||
H5_DLL int H5F_mpi_get_size(const H5F_t *f);
|
||||
H5_DLL herr_t H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm);
|
||||
H5_DLL hbool_t H5F_get_coll_metadata_reads(const H5F_t *f);
|
||||
H5_DLL void H5F_set_coll_metadata_reads(H5F_t *f, H5P_coll_md_read_flag_t *file_flag, hbool_t *context_flag);
|
||||
#endif /* H5_HAVE_PARALLEL */
|
||||
|
||||
/* External file cache routines */
|
||||
|
@ -5284,15 +5284,14 @@ H5P__decode_coll_md_read_flag_t(const void **_pp, void *_value)
|
||||
* Function: H5Pset_all_coll_metadata_ops
|
||||
*
|
||||
* Purpose: Tell the library whether the metadata read operations will
|
||||
* be done collectively (1) or not (0). Default is independent.
|
||||
* With collective mode, the library will optimize access to
|
||||
* metadata operations on the file.
|
||||
* be done collectively (1) or not (0). Default is independent.
|
||||
* With collective mode, the library will optimize access to
|
||||
* metadata operations on the file.
|
||||
*
|
||||
* Note: This routine accepts file access property lists, link
|
||||
* access property lists, attribute access property lists,
|
||||
* dataset access property lists, group access property lists,
|
||||
* named datatype access property lists,
|
||||
* and dataset transfer property lists.
|
||||
* access property lists, attribute access property lists,
|
||||
* dataset access property lists, group access property lists,
|
||||
* and named datatype access property lists.
|
||||
*
|
||||
* Return: Non-negative on success/Negative on failure
|
||||
*
|
||||
@ -5312,7 +5311,7 @@ H5Pset_all_coll_metadata_ops(hid_t plist_id, hbool_t is_collective)
|
||||
H5TRACE2("e", "ib", plist_id, is_collective);
|
||||
|
||||
/* Compare the property list's class against the other class */
|
||||
/* (Dataset, group, attribute, and named datype access property lists
|
||||
/* (Dataset, group, attribute, and named datatype access property lists
|
||||
* are sub-classes of link access property lists -QAK)
|
||||
*/
|
||||
if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS))
|
||||
@ -5342,10 +5341,9 @@ done:
|
||||
* Purpose: Gets information about collective metadata read mode.
|
||||
*
|
||||
* Note: This routine accepts file access property lists, link
|
||||
* access property lists, attribute access property lists,
|
||||
* dataset access property lists, group access property lists,
|
||||
* named datatype access property lists,
|
||||
* and dataset transfer property lists.
|
||||
* access property lists, attribute access property lists,
|
||||
* dataset access property lists, group access property lists,
|
||||
* and named datatype access property lists.
|
||||
*
|
||||
* Return: Non-negative on success/Negative on failure
|
||||
*
|
||||
@ -5363,7 +5361,7 @@ H5Pget_all_coll_metadata_ops(hid_t plist_id, hbool_t *is_collective /*out*/)
|
||||
H5TRACE2("e", "ix", plist_id, is_collective);
|
||||
|
||||
/* Compare the property list's class against the other class */
|
||||
/* (Dataset, group, attribute, and named datype access property lists
|
||||
/* (Dataset, group, attribute, and named datatype access property lists
|
||||
* are sub-classes of link access property lists -QAK)
|
||||
*/
|
||||
if (TRUE != H5P_isa_class(plist_id, H5P_LINK_ACCESS) && TRUE != H5P_isa_class(plist_id, H5P_FILE_ACCESS))
|
||||
|
14
src/H5Z.c
14
src/H5Z.c
@ -594,14 +594,9 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA
|
||||
/* Do a global flush if the file is opened for write */
|
||||
if (H5F_ACC_RDWR & H5F_INTENT(f)) {
|
||||
|
||||
/* When parallel HDF5 is defined, check for collective metadata reads on this
|
||||
* file and set the flag for metadata I/O in the API context. -QAK, 2018/02/14
|
||||
*/
|
||||
#ifdef H5_HAVE_PARALLEL
|
||||
/* Check if MPIO driver is used */
|
||||
if (H5F_HAS_FEATURE(f, H5FD_FEAT_HAS_MPI)) {
|
||||
H5P_coll_md_read_flag_t coll_md_read; /* Do all metadata reads collectively */
|
||||
|
||||
/* Sanity check for collectively calling H5Zunregister, if requested */
|
||||
/* (Sanity check assumes that a barrier on one file's comm
|
||||
* is sufficient (i.e. that there aren't different comms for
|
||||
@ -621,13 +616,8 @@ H5Z__flush_file_cb(void *obj_ptr, hid_t H5_ATTR_UNUSED obj_id, void H5_ATTR_PARA
|
||||
/* Set the "sanity checked" flag */
|
||||
object->sanity_checked = TRUE;
|
||||
} /* end if */
|
||||
|
||||
/* Check whether to use the collective metadata read DXPL */
|
||||
coll_md_read = H5F_COLL_MD_READ(f);
|
||||
if (H5P_USER_TRUE == coll_md_read)
|
||||
H5CX_set_coll_metadata_read(TRUE);
|
||||
} /* end if */
|
||||
#endif /* H5_HAVE_PARALLEL */
|
||||
} /* end if */
|
||||
#endif /* H5_HAVE_PARALLEL */
|
||||
|
||||
/* Call the flush routine for mounted file hierarchies */
|
||||
if (H5F_flush_mounts((H5F_t *)obj_ptr) < 0)
|
||||
|
@ -6623,13 +6623,15 @@ trace_file_check(int metadata_write_strategy)
|
||||
static hbool_t
|
||||
smoke_check_6(int metadata_write_strategy)
|
||||
{
|
||||
hbool_t success = TRUE;
|
||||
int i;
|
||||
int max_nerrors;
|
||||
hid_t fid = -1;
|
||||
H5F_t * file_ptr = NULL;
|
||||
H5C_t * cache_ptr = NULL;
|
||||
struct mssg_t mssg;
|
||||
H5P_coll_md_read_flag_t md_reads_file_flag;
|
||||
hbool_t md_reads_context_flag;
|
||||
hbool_t success = TRUE;
|
||||
int i;
|
||||
int max_nerrors;
|
||||
hid_t fid = -1;
|
||||
H5F_t * file_ptr = NULL;
|
||||
H5C_t * cache_ptr = NULL;
|
||||
struct mssg_t mssg;
|
||||
|
||||
switch (metadata_write_strategy) {
|
||||
|
||||
@ -6685,7 +6687,9 @@ smoke_check_6(int metadata_write_strategy)
|
||||
virt_num_data_entries = NUM_DATA_ENTRIES;
|
||||
|
||||
/* insert the first half collectively */
|
||||
H5CX_set_coll_metadata_read(TRUE);
|
||||
md_reads_file_flag = H5P_USER_TRUE;
|
||||
md_reads_context_flag = TRUE;
|
||||
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
|
||||
for (i = 0; i < virt_num_data_entries / 2; i++) {
|
||||
struct datum *entry_ptr;
|
||||
entry_ptr = &(data[i]);
|
||||
@ -6704,9 +6708,13 @@ smoke_check_6(int metadata_write_strategy)
|
||||
H5_CHECK_OVERFLOW(cache_ptr->max_cache_size, size_t, double);
|
||||
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
|
||||
}
|
||||
/* Restore collective metadata reads state */
|
||||
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
|
||||
|
||||
/* insert the other half independently */
|
||||
H5CX_set_coll_metadata_read(FALSE);
|
||||
md_reads_file_flag = H5P_USER_FALSE;
|
||||
md_reads_context_flag = FALSE;
|
||||
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
|
||||
for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) {
|
||||
struct datum *entry_ptr;
|
||||
entry_ptr = &(data[i]);
|
||||
@ -6724,6 +6732,8 @@ smoke_check_6(int metadata_write_strategy)
|
||||
/* Make sure coll entries do not cross the 80% threshold */
|
||||
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
|
||||
}
|
||||
/* Restore collective metadata reads state */
|
||||
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
|
||||
|
||||
/* flush the file */
|
||||
if (H5Fflush(fid, H5F_SCOPE_GLOBAL) < 0) {
|
||||
@ -6734,7 +6744,9 @@ smoke_check_6(int metadata_write_strategy)
|
||||
}
|
||||
|
||||
/* Protect the first half of the entries collectively */
|
||||
H5CX_set_coll_metadata_read(TRUE);
|
||||
md_reads_file_flag = H5P_USER_TRUE;
|
||||
md_reads_context_flag = TRUE;
|
||||
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
|
||||
for (i = 0; i < (virt_num_data_entries / 2); i++) {
|
||||
struct datum *entry_ptr;
|
||||
entry_ptr = &(data[i]);
|
||||
@ -6752,9 +6764,13 @@ smoke_check_6(int metadata_write_strategy)
|
||||
/* Make sure coll entries do not cross the 80% threshold */
|
||||
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
|
||||
}
|
||||
/* Restore collective metadata reads state */
|
||||
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
|
||||
|
||||
/* protect the other half independently */
|
||||
H5CX_set_coll_metadata_read(FALSE);
|
||||
md_reads_file_flag = H5P_USER_FALSE;
|
||||
md_reads_context_flag = FALSE;
|
||||
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
|
||||
for (i = virt_num_data_entries / 2; i < virt_num_data_entries; i++) {
|
||||
struct datum *entry_ptr;
|
||||
entry_ptr = &(data[i]);
|
||||
@ -6772,6 +6788,8 @@ smoke_check_6(int metadata_write_strategy)
|
||||
/* Make sure coll entries do not cross the 80% threshold */
|
||||
HDassert((double)cache_ptr->max_cache_size * 0.8 > cache_ptr->coll_list_size);
|
||||
}
|
||||
/* Restore collective metadata reads state */
|
||||
H5F_set_coll_metadata_reads(file_ptr, &md_reads_file_flag, &md_reads_context_flag);
|
||||
|
||||
for (i = 0; i < (virt_num_data_entries); i++) {
|
||||
unlock_entry(file_ptr, i, H5AC__NO_FLAGS_SET);
|
||||
|
@ -34,10 +34,9 @@
|
||||
|
||||
#define MULTI_CHUNK_IO_ADDRMAP_ISSUE_DIMS 2
|
||||
|
||||
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
|
||||
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE 20000
|
||||
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE 1
|
||||
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1
|
||||
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM 10000
|
||||
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
|
||||
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1
|
||||
|
||||
/*
|
||||
* A test for issue HDFFV-10501. A parallel hang was reported which occurred
|
||||
@ -339,21 +338,34 @@ test_multi_chunk_io_addrmap_issue(void)
|
||||
* collective metadata reads being made only by process 0 in H5D__sort_chunk().
|
||||
*
|
||||
* NOTE: Due to the way that the threshold value which pertains to this test
|
||||
* is currently calculated within HDF5, there are several conditions that this
|
||||
* test must maintain. Refer to the function H5D__sort_chunk in H5Dmpio.c for
|
||||
* a better idea of why.
|
||||
* is currently calculated within HDF5, the following two conditions must be
|
||||
* true to trigger the issue:
|
||||
*
|
||||
* Condition 1: We need to make sure that the test always selects every single
|
||||
* chunk in the dataset. It is fine if the selection is split up among multiple
|
||||
* ranks, but their combined selection must cover the whole dataset.
|
||||
* Condition 1: A certain threshold ratio must be met in order to have HDF5
|
||||
* obtain all chunk addresses collectively inside H5D__sort_chunk(). This is
|
||||
* given by the following:
|
||||
*
|
||||
* Condition 2: The number of chunks in the dataset divided by the number of MPI
|
||||
* ranks must exceed or equal 10000. In other words, each MPI rank must be
|
||||
* responsible for 10000 or more unique chunks.
|
||||
* (sum_chunk * 100) / (dataset_nchunks * mpi_size) >= 30%
|
||||
*
|
||||
* Condition 3: This test will currently only be reliably reproducible for 2 or 3
|
||||
* MPI ranks. The threshold value calculated reduces to a constant 100 / mpi_size,
|
||||
* and is compared against a default value of 30%.
|
||||
* where:
|
||||
* * `sum_chunk` is the combined sum of the number of chunks selected in
|
||||
* the dataset by all ranks (chunks selected by more than one rank count
|
||||
* individually toward the sum for each rank selecting that chunk)
|
||||
* * `dataset_nchunks` is the number of chunks in the dataset (selected
|
||||
* or not)
|
||||
* * `mpi_size` is the size of the MPI Communicator
|
||||
*
|
||||
* Condition 2: `sum_chunk` divided by `mpi_size` must exceed or equal a certain
|
||||
* threshold (as of this writing, 10000).
|
||||
*
|
||||
* To satisfy both these conditions, we #define a macro,
|
||||
* LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM, which corresponds to the
|
||||
* value of the H5D_ALL_CHUNK_ADDR_THRES_COL_NUM macro in H5Dmpio.c (the
|
||||
* 10000 threshold from condition 2). We then create a dataset of that many
|
||||
* chunks and have each MPI rank write to and read from a piece of every single
|
||||
* chunk in the dataset. This ensures chunk utilization is the max possible
|
||||
* and exceeds our 30% target ratio, while always exactly matching the numeric
|
||||
* chunk threshold value of condition 2.
|
||||
*
|
||||
* Failure in this test may either cause a hang, or, due to how the MPI calls
|
||||
* pertaining to this issue might mistakenly match up, may cause an MPI error
|
||||
@ -375,10 +387,9 @@ void
|
||||
test_link_chunk_io_sort_chunk_issue(void)
|
||||
{
|
||||
const char *filename;
|
||||
hsize_t * dataset_dims = NULL;
|
||||
hsize_t max_dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
|
||||
hsize_t sel_dims[1];
|
||||
hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS] = {LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS};
|
||||
hsize_t dataset_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
|
||||
hsize_t sel_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
|
||||
hsize_t chunk_dims[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
|
||||
hsize_t start[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
|
||||
hsize_t stride[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
|
||||
hsize_t count[LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS];
|
||||
@ -412,14 +423,13 @@ test_link_chunk_io_sort_chunk_issue(void)
|
||||
file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
|
||||
VRFY((file_id >= 0), "H5Fcreate succeeded");
|
||||
|
||||
dataset_dims = HDmalloc(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS * sizeof(*dataset_dims));
|
||||
VRFY((dataset_dims != NULL), "malloc succeeded");
|
||||
/*
|
||||
* Create a one-dimensional dataset of exactly LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM
|
||||
* chunks, where every rank writes to a piece of every single chunk to keep utilization high.
|
||||
*/
|
||||
dataset_dims[0] = (hsize_t)mpi_size * (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM;
|
||||
|
||||
dataset_dims[0] = (hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE * (hsize_t)mpi_size *
|
||||
(hsize_t)LINK_CHUNK_IO_SORT_CHUNK_ISSUE_Y_DIM_SCALE;
|
||||
max_dataset_dims[0] = H5S_UNLIMITED;
|
||||
|
||||
fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, max_dataset_dims);
|
||||
fspace_id = H5Screate_simple(LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, dataset_dims, NULL);
|
||||
VRFY((fspace_id >= 0), "H5Screate_simple succeeded");
|
||||
|
||||
/*
|
||||
@ -428,6 +438,9 @@ test_link_chunk_io_sort_chunk_issue(void)
|
||||
dcpl_id = H5Pcreate(H5P_DATASET_CREATE);
|
||||
VRFY((dcpl_id >= 0), "H5Pcreate succeeded");
|
||||
|
||||
/* Chunk size is equal to MPI size since each rank writes to a piece of every chunk */
|
||||
chunk_dims[0] = (hsize_t)mpi_size;
|
||||
|
||||
VRFY((H5Pset_chunk(dcpl_id, LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS, chunk_dims) >= 0),
|
||||
"H5Pset_chunk succeeded");
|
||||
|
||||
@ -437,23 +450,21 @@ test_link_chunk_io_sort_chunk_issue(void)
|
||||
|
||||
/*
|
||||
* Set up hyperslab selection to split the dataset among the ranks.
|
||||
*
|
||||
* The ranks will write rows across the dataset.
|
||||
*/
|
||||
stride[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
|
||||
count[0] = (dataset_dims[0] / LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) / (hsize_t)mpi_size;
|
||||
start[0] = count[0] * (hsize_t)mpi_rank;
|
||||
block[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE;
|
||||
start[0] = (hsize_t)mpi_rank;
|
||||
stride[0] = (hsize_t)mpi_size;
|
||||
count[0] = LINK_CHUNK_IO_SORT_CHUNK_ISSUE_COLL_THRESH_NUM;
|
||||
block[0] = 1;
|
||||
|
||||
VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0),
|
||||
"H5Sselect_hyperslab succeeded");
|
||||
|
||||
sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
|
||||
sel_dims[0] = count[0];
|
||||
|
||||
mspace_id = H5Screate_simple(1, sel_dims, NULL);
|
||||
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
|
||||
|
||||
data = HDcalloc(1, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
|
||||
data = HDcalloc(1, count[0] * sizeof(int));
|
||||
VRFY((data != NULL), "calloc succeeded");
|
||||
|
||||
dxpl_id = H5Pcreate(H5P_DATASET_XFER);
|
||||
@ -476,33 +487,25 @@ test_link_chunk_io_sort_chunk_issue(void)
|
||||
VRFY((H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO) >= 0),
|
||||
"H5Pset_dxpl_mpio_chunk_opt succeeded");
|
||||
|
||||
read_buf = HDmalloc(count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
|
||||
read_buf = HDmalloc(count[0] * sizeof(int));
|
||||
VRFY((read_buf != NULL), "malloc succeeded");
|
||||
|
||||
VRFY((H5Sselect_hyperslab(fspace_id, H5S_SELECT_SET, start, stride, count, block) >= 0),
|
||||
"H5Sselect_hyperslab succeeded");
|
||||
|
||||
sel_dims[0] = count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE);
|
||||
sel_dims[0] = count[0];
|
||||
|
||||
VRFY((H5Sclose(mspace_id) >= 0), "H5Sclose succeeded");
|
||||
|
||||
mspace_id = H5Screate_simple(1, sel_dims, NULL);
|
||||
VRFY((mspace_id >= 0), "H5Screate_simple succeeded");
|
||||
|
||||
read_buf = HDrealloc(read_buf, count[0] * (LINK_CHUNK_IO_SORT_CHUNK_ISSUE_CHUNK_SIZE) * sizeof(int));
|
||||
VRFY((read_buf != NULL), "realloc succeeded");
|
||||
|
||||
/*
|
||||
* Finally have each rank read their section of data back from the dataset.
|
||||
*/
|
||||
VRFY((H5Dread(dset_id, H5T_NATIVE_INT, mspace_id, fspace_id, dxpl_id, read_buf) >= 0),
|
||||
"H5Dread succeeded");
|
||||
|
||||
if (dataset_dims) {
|
||||
HDfree(dataset_dims);
|
||||
dataset_dims = NULL;
|
||||
}
|
||||
|
||||
if (data) {
|
||||
HDfree(data);
|
||||
data = NULL;
|
||||
|
Loading…
Reference in New Issue
Block a user