Subfiling VFD - check if MPI is finalized during VFD termination (#2683)

This commit is contained in:
jhendersonHDF 2023-04-27 11:52:11 -05:00 committed by GitHub
parent 14a19b8c90
commit b5ecb0af6d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 87 additions and 28 deletions

View File

@@ -887,16 +887,20 @@ done:
static herr_t
H5FD__ioc_close_int(H5FD_ioc_t *file_ptr)
{
int mpi_finalized;
int mpi_code;
herr_t ret_value = SUCCEED;
HDassert(file_ptr);
if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
if (file_ptr->context_id >= 0) {
subfiling_context_t *sf_context = H5_get_subfiling_object(file_ptr->context_id);
int mpi_code;
/* Don't allow IOC threads to be finalized until everyone gets here */
if (file_ptr->mpi_size > 1)
if (!mpi_finalized && (file_ptr->mpi_size > 1))
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file_ptr->comm)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code);
@@ -911,10 +915,12 @@ H5FD__ioc_close_int(H5FD_ioc_t *file_ptr)
file_ptr->context_id = -1;
}
if (H5_mpi_comm_free(&file_ptr->comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Communicator");
if (H5_mpi_info_free(&file_ptr->info) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Info object");
if (!mpi_finalized) {
if (H5_mpi_comm_free(&file_ptr->comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Communicator");
if (H5_mpi_info_free(&file_ptr->info) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Info object");
}
done:
HDfree(file_ptr->file_path);

View File

@@ -374,12 +374,29 @@ H5FD__subfiling_term(void)
herr_t ret_value = SUCCEED;
if (H5FD_SUBFILING_g >= 0) {
int mpi_finalized;
int mpi_code;
/*
* Retrieve status of whether MPI has already been terminated.
* This can happen if an HDF5 ID is left unclosed and HDF5
* shuts down after MPI_Finalize() is called in an application.
*/
if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
/* Free RPC message MPI Datatype */
if (H5_subfiling_rpc_msg_type != MPI_DATATYPE_NULL)
if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&H5_subfiling_rpc_msg_type)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
if (H5_subfiling_rpc_msg_type != MPI_DATATYPE_NULL) {
if (!mpi_finalized) {
if (MPI_SUCCESS != (mpi_code = MPI_Type_free(&H5_subfiling_rpc_msg_type)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
}
#ifdef H5FD_SUBFILING_DEBUG
else
HDprintf("** WARNING **: HDF5 is terminating the Subfiling VFD after MPI_Finalize() was "
"called - an HDF5 ID was probably left unclosed\n");
#endif
}
/* Clean up resources */
if (H5_subfiling_terminate() < 0)
@@ -1297,10 +1314,15 @@ done:
static herr_t
H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr)
{
int mpi_finalized;
int mpi_code;
herr_t ret_value = SUCCEED;
HDassert(file_ptr);
if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized)))
H5_SUBFILING_MPI_GOTO_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
if (file_ptr->sf_file && H5FD_close(file_ptr->sf_file) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close subfile");
if (file_ptr->stub_file && H5FD_close(file_ptr->stub_file) < 0)
@@ -1311,13 +1333,15 @@ H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_ARGS, FAIL, "can't close IOC FAPL");
file_ptr->fa.ioc_fapl_id = H5I_INVALID_HID;
if (H5_mpi_comm_free(&file_ptr->comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Communicator");
if (H5_mpi_info_free(&file_ptr->info) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Info object");
if (!mpi_finalized) {
if (H5_mpi_comm_free(&file_ptr->comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Communicator");
if (H5_mpi_info_free(&file_ptr->info) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI Info object");
if (H5_mpi_comm_free(&file_ptr->ext_comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
if (H5_mpi_comm_free(&file_ptr->ext_comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
}
file_ptr->fail_to_encode = FALSE;

View File

@@ -338,8 +338,18 @@ done:
static herr_t
H5_free_subfiling_object_int(subfiling_context_t *sf_context)
{
int mpi_finalized;
int mpi_code;
herr_t ret_value = SUCCEED;
HDassert(sf_context);
if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized))) {
/* Assume MPI is finalized or worse, and try to clean up what we can */
H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
mpi_finalized = 1;
}
sf_context->sf_context_id = -1;
sf_context->h5_file_id = UINT64_MAX;
sf_context->sf_num_fids = 0;
@@ -352,28 +362,38 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context)
sf_context->sf_base_addr = -1;
if (sf_context->sf_msg_comm != MPI_COMM_NULL) {
if (H5_mpi_comm_free(&sf_context->sf_msg_comm) < 0)
return FAIL;
if (!mpi_finalized) {
if (H5_mpi_comm_free(&sf_context->sf_msg_comm) < 0)
return FAIL;
}
sf_context->sf_msg_comm = MPI_COMM_NULL;
}
if (sf_context->sf_data_comm != MPI_COMM_NULL) {
if (H5_mpi_comm_free(&sf_context->sf_data_comm) < 0)
return FAIL;
if (!mpi_finalized) {
if (H5_mpi_comm_free(&sf_context->sf_data_comm) < 0)
return FAIL;
}
sf_context->sf_data_comm = MPI_COMM_NULL;
}
if (sf_context->sf_eof_comm != MPI_COMM_NULL) {
if (H5_mpi_comm_free(&sf_context->sf_eof_comm) < 0)
return FAIL;
if (!mpi_finalized) {
if (H5_mpi_comm_free(&sf_context->sf_eof_comm) < 0)
return FAIL;
}
sf_context->sf_eof_comm = MPI_COMM_NULL;
}
if (sf_context->sf_node_comm != MPI_COMM_NULL) {
if (H5_mpi_comm_free(&sf_context->sf_node_comm) < 0)
return FAIL;
if (!mpi_finalized) {
if (H5_mpi_comm_free(&sf_context->sf_node_comm) < 0)
return FAIL;
}
sf_context->sf_node_comm = MPI_COMM_NULL;
}
if (sf_context->sf_group_comm != MPI_COMM_NULL) {
if (H5_mpi_comm_free(&sf_context->sf_group_comm) < 0)
return FAIL;
if (!mpi_finalized) {
if (H5_mpi_comm_free(&sf_context->sf_group_comm) < 0)
return FAIL;
}
sf_context->sf_group_comm = MPI_COMM_NULL;
}
@@ -402,16 +422,24 @@ H5_free_subfiling_object_int(subfiling_context_t *sf_context)
HDfree(sf_context);
return SUCCEED;
H5_SUBFILING_FUNC_LEAVE;
}
static herr_t
H5_free_subfiling_topology(sf_topology_t *topology)
{
int mpi_finalized;
int mpi_code;
herr_t ret_value = SUCCEED;
HDassert(topology);
if (MPI_SUCCESS != (mpi_code = MPI_Finalized(&mpi_finalized))) {
/* Assume MPI is finalized or worse, but clean up what we can */
H5_SUBFILING_MPI_DONE_ERROR(FAIL, "MPI_Finalized failed", mpi_code);
mpi_finalized = 1;
}
#ifndef NDEBUG
{
hbool_t topology_cached = FALSE;
@@ -442,8 +470,9 @@ H5_free_subfiling_topology(sf_topology_t *topology)
HDfree(topology->io_concentrators);
topology->io_concentrators = NULL;
if (H5_mpi_comm_free(&topology->app_comm) < 0)
H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
if (!mpi_finalized)
if (H5_mpi_comm_free(&topology->app_comm) < 0)
H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "can't free MPI communicator");
HDfree(topology);