Fix issue with collective metadata writes of global heap data (#2480) (#2486)

This commit is contained in:
jhendersonHDF 2023-02-21 09:30:45 -06:00 committed by GitHub
parent 3dcee39ced
commit d8fd9c2f79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 104 additions and 5 deletions

View File

@ -109,7 +109,20 @@ Bug Fixes since HDF5-1.14.0 release
===================================
Library
-------
-
- Fixed an issue with collective metadata writes of global heap data
New test failures in parallel netCDF started occurring with debug
builds of HDF5 due to an assertion failure; this was reported in
GitHub issue #2433. The assertion failure began happening after the
collective metadata write pathway in the library was updated to use
vector I/O so that parallel-enabled HDF5 Virtual File Drivers (other
than the existing MPI I/O VFD) can support collective metadata writes.
The assertion failure was fixed by updating collective metadata writes
to treat global heap metadata as raw data, as done elsewhere in the
library.
(JTH - 2023/02/16, GH #2433)
Java Library

View File

@ -1003,6 +1003,10 @@ H5C__collective_write(H5F_t *f)
bufs[0] = base_buf;
types[0] = entry_ptr->type->mem_type;
/* Treat global heap as raw data */
if (types[0] == H5FD_MEM_GHEAP)
types[0] = H5FD_MEM_DRAW;
node = H5SL_next(node);
i = 1;
while (node) {
@ -1016,6 +1020,10 @@ H5C__collective_write(H5F_t *f)
bufs[i] = entry_ptr->image_ptr;
types[i] = entry_ptr->type->mem_type;
/* Treat global heap as raw data */
if (types[i] == H5FD_MEM_GHEAP)
types[i] = H5FD_MEM_DRAW;
/* Advance to next node & array location */
node = H5SL_next(node);
i++;

View File

@ -17,7 +17,7 @@ set (testphdf5_SOURCES
${HDF5_TEST_PAR_SOURCE_DIR}/t_chunk_alloc.c
${HDF5_TEST_PAR_SOURCE_DIR}/t_filter_read.c
${HDF5_TEST_PAR_SOURCE_DIR}/t_prop.c
${HDF5_TEST_PAR_SOURCE_DIR}/t_coll_md_read.c
${HDF5_TEST_PAR_SOURCE_DIR}/t_coll_md.c
${HDF5_TEST_PAR_SOURCE_DIR}/t_oflush.c
)

View File

@ -44,7 +44,7 @@ check_PROGRAMS = $(TEST_PROG_PARA) t_pflush1 t_pflush2
testphdf5_SOURCES=testphdf5.c t_dset.c t_file.c t_file_image.c t_mdset.c \
t_ph5basic.c t_coll_chunk.c t_span_tree.c t_chunk_alloc.c t_filter_read.c \
t_prop.c t_coll_md_read.c t_oflush.c
t_prop.c t_coll_md.c t_oflush.c
# The tests all depend on the hdf5 library and the test library
LDADD = $(LIBH5TEST) $(LIBHDF5)

View File

@ -11,8 +11,9 @@
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
* A test suite to test HDF5's collective metadata read capabilities, as enabled
* by making a call to H5Pset_all_coll_metadata_ops().
* A test suite to test HDF5's collective metadata read and write capabilities,
* as enabled by making a call to H5Pset_all_coll_metadata_ops() and/or
* H5Pset_coll_metadata_write().
*/
#include "testphdf5.h"
@ -38,6 +39,10 @@
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DATASET_NAME "linked_chunk_io_sort_chunk_issue"
#define LINK_CHUNK_IO_SORT_CHUNK_ISSUE_DIMS 1
/* Number of ints in the variable-length element written by test_collective_global_heap_write() */
#define COLL_GHEAP_WRITE_ATTR_NELEMS 10
/* Name of the attribute created by test_collective_global_heap_write() */
#define COLL_GHEAP_WRITE_ATTR_NAME "coll_gheap_write_attr"
/* Rank of the dataspace used for that attribute */
#define COLL_GHEAP_WRITE_ATTR_DIMS 1
/*
* A test for issue HDFFV-10501. A parallel hang was reported which occurred
* in linked-chunk I/O when collective metadata reads are enabled and some ranks
@ -524,3 +529,73 @@ test_link_chunk_io_sort_chunk_issue(void)
VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
}
/*
 * A test for GitHub issue #2433 which causes a collective metadata write
 * of global heap data. This test is meant to ensure that global heap data
 * gets correctly mapped as raw data during a collective metadata write
 * using vector I/O.
 *
 * An assertion exists in the library that should be triggered if global
 * heap data is not correctly mapped as raw data.
 *
 * The test creates a file with collective metadata writes enabled, attaches
 * a single-element variable-length integer attribute to the file's root,
 * writes COLL_GHEAP_WRITE_ATTR_NELEMS integers through it, and then closes
 * every handle it opened. Each step is checked with VRFY.
 */
void
test_collective_global_heap_write(void)
{
    const char *filename;
    hsize_t     attr_dims[COLL_GHEAP_WRITE_ATTR_DIMS];
    hid_t       file_id   = H5I_INVALID_HID;
    hid_t       fapl_id   = H5I_INVALID_HID;
    hid_t       attr_id   = H5I_INVALID_HID;
    hid_t       vl_type   = H5I_INVALID_HID;
    hid_t       fspace_id = H5I_INVALID_HID;
    hvl_t       vl_data;
    int         mpi_rank, mpi_size;
    int         data_buf[COLL_GHEAP_WRITE_ATTR_NELEMS];

    MPI_Comm_rank(MPI_COMM_WORLD, &mpi_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &mpi_size);

    filename = GetTestParameters();

    fapl_id = create_faccess_plist(MPI_COMM_WORLD, MPI_INFO_NULL, facc_type);
    VRFY((fapl_id >= 0), "create_faccess_plist succeeded");

    /*
     * Even though the testphdf5 framework currently sets collective metadata
     * writes on the FAPL, we call it here just to be sure this is futureproof,
     * since demonstrating this issue relies upon it.
     */
    VRFY((H5Pset_coll_metadata_write(fapl_id, true) >= 0), "Set collective metadata writes succeeded");

    file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
    VRFY((file_id >= 0), "H5Fcreate succeeded");

    /* The attribute holds exactly one variable-length element */
    attr_dims[0] = 1;

    fspace_id = H5Screate_simple(COLL_GHEAP_WRITE_ATTR_DIMS, attr_dims, NULL);
    VRFY((fspace_id >= 0), "H5Screate_simple succeeded");

    vl_type = H5Tvlen_create(H5T_NATIVE_INT);
    VRFY((vl_type >= 0), "H5Tvlen_create succeeded");

    /* Point the variable-length element at the local integer buffer */
    vl_data.len = COLL_GHEAP_WRITE_ATTR_NELEMS;
    vl_data.p   = data_buf;

    /*
     * Create a variable-length attribute that will get written to the global heap
     */
    attr_id = H5Acreate2(file_id, COLL_GHEAP_WRITE_ATTR_NAME, vl_type, fspace_id, H5P_DEFAULT, H5P_DEFAULT);
    VRFY((attr_id >= 0), "H5Acreate2 succeeded");

    /* Fill the buffer with 0 .. NELEMS-1 before writing it out */
    for (size_t i = 0; i < COLL_GHEAP_WRITE_ATTR_NELEMS; i++)
        data_buf[i] = (int)i;

    VRFY((H5Awrite(attr_id, vl_type, &vl_data) >= 0), "H5Awrite succeeded");

    /* Release every handle opened above; the message names the call being verified */
    VRFY((H5Sclose(fspace_id) >= 0), "H5Sclose succeeded");
    VRFY((H5Tclose(vl_type) >= 0), "H5Tclose succeeded");
    VRFY((H5Aclose(attr_id) >= 0), "H5Aclose succeeded");
    VRFY((H5Pclose(fapl_id) >= 0), "H5Pclose succeeded");
    VRFY((H5Fclose(file_id) >= 0), "H5Fclose succeeded");
}

View File

@ -502,6 +502,8 @@ main(int argc, char **argv)
"Collective MD read with multi chunk I/O (H5D__chunk_addrmap)", PARATESTFILE);
AddTest("LC_coll_MD_read", test_link_chunk_io_sort_chunk_issue, NULL,
"Collective MD read with link chunk I/O (H5D__sort_chunk)", PARATESTFILE);
AddTest("GH_coll_MD_wr", test_collective_global_heap_write, NULL,
"Collective MD write of global heap data", PARATESTFILE);
/* Display testing information */
TestInfo(argv[0]);

View File

@ -293,6 +293,7 @@ void test_dense_attr(void);
void test_partial_no_selection_coll_md_read(void);
void test_multi_chunk_io_addrmap_issue(void);
void test_link_chunk_io_sort_chunk_issue(void);
void test_collective_global_heap_write(void);
void test_oflush(void);
/* commonly used prototypes */