Subfiling VFD - tidying up and fixing a few new testing failures (#1977)

* Rename Subfiling IOC "thread_pool_count" field to "thread_pool_size"

* Add simple HDF5 example for Subfiling VFD

* Subfiling VFD - never cache app topology as it may change

* Subfiling VFD - cleanup unused functionality and tidy up some TODOs

* Subfiling VFD - tidy up subfiling error handling in H5subfiling_common.c

* Subfiling VFD - show number of failed I/O requests on close

* Subfiling VFD - Update file cmp callback after switching to MPI I/O VFD

* Amend RELEASE.txt with info about h5fuse.sh and Subfiling limitations

* Subfiling VFD - switch to using H5_basename and H5_dirname
This commit is contained in:
jhendersonHDF 2022-08-09 18:05:37 -05:00 committed by GitHub
parent b84241e57a
commit ef33ac8bac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 915 additions and 1490 deletions

View File

@ -47,6 +47,7 @@ if (H5_HAVE_PARALLEL)
ph5example
ph5_filtered_writes
ph5_filtered_writes_no_sel
ph5_subfiling
)
endif ()

View File

@ -30,6 +30,8 @@ set (test_ex_CLEANFILES
group.h5
groups.h5
hard_link.h5
h5_subfiling_default_example.h5
h5_subfiling_custom_example.h5
mount1.h5
mount2.h5
one_index_file.h5

View File

@ -20,7 +20,7 @@
include $(top_srcdir)/config/commence.am
if BUILD_PARALLEL_CONDITIONAL
EXAMPLE_PROG_PARA = ph5example ph5_filtered_writes ph5_filtered_writes_no_sel
EXAMPLE_PROG_PARA = ph5example ph5_filtered_writes ph5_filtered_writes_no_sel ph5_subfiling
endif
INSTALL_SCRIPT_FILES = run-c-ex.sh
@ -51,8 +51,9 @@ INSTALL_FILES = h5_write.c h5_read.c h5_extend_write.c h5_chunk_read.c h5_compou
h5_reference_deprec.c h5_ref_extern.c h5_ref_compat.c h5_ref2reg_deprec.c \
h5_extlink.c h5_elink_unix2win.c h5_shared_mesg.c h5_debug_trace.c \
ph5example.c ph5_filtered_writes.c ph5_filtered_writes_no_sel.c \
h5_vds.c h5_vds-exc.c h5_vds-exclim.c h5_vds-eiger.c h5_vds-simpleIO.c \
h5_vds-percival.c h5_vds-percival-unlim.c h5_vds-percival-unlim-maxmin.c
ph5_subfiling.c h5_vds.c h5_vds-exc.c h5_vds-exclim.c h5_vds-eiger.c \
h5_vds-simpleIO.c h5_vds-percival.c h5_vds-percival-unlim.c \
h5_vds-percival-unlim-maxmin.c
@ -120,7 +121,8 @@ h5_ref2reg_deprec: $(srcdir)/h5_ref2reg_deprec.c
h5_drivers: $(srcdir)/h5_drivers.c
ph5example: $(srcdir)/ph5example.c
ph5_filtered_writes: $(srcdir)/ph5_filtered_writes.c
ph5_filtered_writes_no_sel: $(srcdir)/ph5_filtered_writes_no_sel.c
ph5_filtered_writes_no_sel: $(srcdir)/ph5_filtered_writes_no_sel.c
ph5_subfiling: $(srcdir)/ph5_subfiling.c
h5_dtransform: $(srcdir)/h5_dtransform.c
h5_extlink: $(srcdir)/h5_extlink.c $(EXTLINK_DIRS)
h5_elink_unix2win: $(srcdir)/h5_elink_unix2win.c $(EXTLINK_DIRS)

362
examples/ph5_subfiling.c Normal file
View File

@ -0,0 +1,362 @@
/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright by The HDF Group. *
* All rights reserved. *
* *
* This file is part of HDF5. The full HDF5 copyright notice, including *
* terms governing use, modification, and redistribution, is contained in *
* the COPYING file, which can be found at the root of the source code *
* distribution tree, or in https://www.hdfgroup.org/licenses. *
* If you do not have access to either file, you may request a copy from *
* help@hdfgroup.org. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
* Example of using HDF5's Subfiling VFD to write to an
* HDF5 file that is striped across multiple sub-files
*
* If the HDF5_NOCLEANUP environment variable is set, the
* files that this example creates will not be removed as
* the example finishes.
*
* In general, the current working directory in which compiling
* is done, is not suitable for parallel I/O and there is no
* standard pathname for parallel file systems. In some cases,
* the parallel file name may even need some parallel file type
* prefix such as: "pfs:/GF/...". Therefore, this example parses
* the HDF5_PARAPREFIX environment variable for a prefix, if one
* is needed.
*/
#include <stdio.h>
#include <stdlib.h>
#include "hdf5.h"
#if defined(H5_HAVE_PARALLEL) && defined(H5_HAVE_SUBFILING_VFD)
/*
 * Names of the two HDF5 files this example creates: one written with the
 * default Subfiling settings and one written with custom settings
 */
#define EXAMPLE_FILE "h5_subfiling_default_example.h5"
#define EXAMPLE_FILE2 "h5_subfiling_custom_example.h5"
/* Name of the dataset created in each file */
#define EXAMPLE_DSET_NAME "DSET"
/* Number of dimensions (rank) of the dataset */
#define EXAMPLE_DSET_DIMS 2
/*
 * Have each MPI rank write 64MiB of data: every rank writes one row of
 * EXAMPLE_DSET_NY elements (64MiB assumes a 4-byte int element)
 */
#define EXAMPLE_DSET_NY 16777216
/*
 * Dataset datatype: the HDF5 datatype passed to the HDF5 API and the
 * C type of the in-memory buffer elements. Keep the two in sync.
 */
#define EXAMPLE_DSET_DATATYPE H5T_NATIVE_INT
typedef int EXAMPLE_DSET_C_DATATYPE;
/*
 * Remove a file created by this example.
 *
 * filename - path of the file to delete
 * fapl_id  - File Access Property List handed to H5Fdelete()
 *
 * Nothing is deleted when getenv() finds the environment variable
 * named by HDF5_NOCLEANUP (a macro that is not defined in this file;
 * presumably supplied by hdf5.h), regardless of its value.
 * The result of H5Fdelete() is not checked.
 */
static void
cleanup(char *filename, hid_t fapl_id)
{
    /* User asked to keep the files around: leave everything in place */
    if (getenv(HDF5_NOCLEANUP) != NULL)
        return;

    H5Fdelete(filename, fapl_id);
}
/*
 * Write one dataset through the Subfiling VFD using its default settings.
 *
 * fapl_id  - File Access Property List; modified in place (the Subfiling
 *            driver and an object alignment are set on it)
 * mpi_size - number of MPI ranks; becomes the first dataset dimension
 * mpi_rank - rank of the calling process; selects the row it writes
 *
 * The file is created under the directory given by the HDF5_PARAPREFIX
 * environment variable when that variable is set, and is removed again
 * via cleanup() before returning. Return values of the HDF5 calls are
 * not checked, to keep the example short.
 */
static void
subfiling_write_default(hid_t fapl_id, int mpi_size, int mpi_rank)
{
    EXAMPLE_DSET_C_DATATYPE *data;
    hsize_t                  dset_dims[EXAMPLE_DSET_DIMS];
    hsize_t                  start[EXAMPLE_DSET_DIMS];
    hsize_t                  count[EXAMPLE_DSET_DIMS];
    hid_t                    file_id;
    hid_t                    dset_id;
    hid_t                    filespace;
    size_t                   n_elems;
    char                     filename[512];
    char                    *par_prefix;

    /*
     * Set Subfiling VFD on FAPL using default settings
     * (use IOC VFD, 1 IOC per node, 32MiB stripe size)
     *
     * Note that all of Subfiling's configuration settings
     * can be adjusted with environment variables as well
     * in this case.
     */
    H5Pset_fapl_subfiling(fapl_id, NULL);

    /*
     * OPTIONAL: Set alignment of objects in HDF5 file to
     *           be equal to the Subfiling stripe size.
     *           Choosing a Subfiling stripe size and HDF5
     *           object alignment value that are some
     *           multiple of the disk block size can
     *           generally help performance by ensuring
     *           that I/O is well-aligned and doesn't
     *           excessively cross stripe boundaries.
     *
     *           Note that this option can substantially
     *           increase the size of the resulting HDF5
     *           files, so it is a good idea to keep an eye
     *           on this.
     */
    H5Pset_alignment(fapl_id, 0, 33554432); /* Align to default 32MiB stripe size */

    /* Parse any parallel prefix and create filename */
    par_prefix = getenv("HDF5_PARAPREFIX");
    snprintf(filename, sizeof(filename), "%s%s%s", par_prefix ? par_prefix : "", par_prefix ? "/" : "",
             EXAMPLE_FILE);

    /*
     * Create a new file collectively
     */
    file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);

    /*
     * Create the dataspace for the dataset. The first
     * dimension varies with the number of MPI ranks
     * while the second dimension is fixed.
     */
    dset_dims[0] = (hsize_t)mpi_size;
    dset_dims[1] = EXAMPLE_DSET_NY;
    filespace    = H5Screate_simple(EXAMPLE_DSET_DIMS, dset_dims, NULL);

    /*
     * Create the dataset with default properties
     */
    dset_id = H5Dcreate2(file_id, EXAMPLE_DSET_NAME, EXAMPLE_DSET_DATATYPE, filespace, H5P_DEFAULT,
                         H5P_DEFAULT, H5P_DEFAULT);

    /*
     * Each MPI rank writes from a contiguous memory
     * region to the hyperslab in the file
     */
    start[0] = (hsize_t)mpi_rank;
    start[1] = 0;
    count[0] = 1;
    count[1] = dset_dims[1];
    H5Sselect_hyperslab(filespace, H5S_SELECT_SET, start, NULL, count, NULL);

    /*
     * Initialize data buffer. The element count is computed once
     * instead of on every loop iteration.
     */
    n_elems = (size_t)(count[0] * count[1]);
    data    = malloc(n_elems * sizeof(*data));

    if (data != NULL) {
        for (size_t i = 0; i < n_elems; i++)
            data[i] = (EXAMPLE_DSET_C_DATATYPE)((size_t)mpi_rank + i);

        /*
         * Write to dataset
         */
        H5Dwrite(dset_id, EXAMPLE_DSET_DATATYPE, H5S_BLOCK, filespace, H5P_DEFAULT, data);
    }
    else {
        /*
         * Do not touch a NULL buffer. The write uses the default transfer
         * property list, so skipping it on one rank is presumably safe
         * (independent I/O) - NOTE(review): confirm. The close calls
         * below are still made by every rank.
         */
        fprintf(stderr, "rank %d: couldn't allocate data buffer, skipping write\n", mpi_rank);
    }

    /*
     * Close/release resources.
     */
    free(data);
    H5Dclose(dset_id);
    H5Sclose(filespace);
    H5Fclose(file_id);

    /*
     * Delete the file that was actually created: the path that includes
     * any HDF5_PARAPREFIX prefix, not just the bare file name.
     */
    cleanup(filename, fapl_id);
}
/*
 * Write one dataset through the Subfiling VFD using a custom
 * configuration (1MiB stripe size, SELECT_IOC_EVERY_NTH_RANK selection,
 * 2 IOC worker threads).
 *
 * fapl_id  - File Access Property List; its Subfiling/IOC configuration
 *            is read, and the new Subfiling configuration and object
 *            alignment are then set on it in place
 * mpi_size - number of MPI ranks; becomes the first dataset dimension
 * mpi_rank - rank of the calling process; selects the row it writes
 *
 * The file is created under the directory given by the HDF5_PARAPREFIX
 * environment variable when that variable is set, and is removed again
 * via cleanup() before returning. Return values of the HDF5 calls are
 * not checked, to keep the example short.
 */
static void
subfiling_write_custom(hid_t fapl_id, int mpi_size, int mpi_rank)
{
    EXAMPLE_DSET_C_DATATYPE *data;
    H5FD_subfiling_config_t  subf_config;
    H5FD_ioc_config_t        ioc_config;
    hsize_t                  dset_dims[EXAMPLE_DSET_DIMS];
    hsize_t                  start[EXAMPLE_DSET_DIMS];
    hsize_t                  count[EXAMPLE_DSET_DIMS];
    hid_t                    file_id;
    hid_t                    ioc_fapl;
    hid_t                    dset_id;
    hid_t                    filespace;
    size_t                   n_elems;
    char                     filename[512];
    char                    *par_prefix;

    /*
     * Get a default Subfiling and IOC configuration
     */
    H5Pget_fapl_subfiling(fapl_id, &subf_config);
    H5Pget_fapl_ioc(fapl_id, &ioc_config);

    /*
     * Set Subfiling configuration to use a 1MiB
     * stripe size and the SELECT_IOC_EVERY_NTH_RANK
     * selection method. By default, without a setting
     * in the H5FD_SUBFILING_IOC_SELECTION_CRITERIA
     * environment variable, this will use every MPI
     * rank as an I/O concentrator.
     */
    subf_config.shared_cfg.stripe_size   = 1048576;
    subf_config.shared_cfg.ioc_selection = SELECT_IOC_EVERY_NTH_RANK;

    /*
     * Set IOC configuration to use 2 worker threads
     * per IOC instead of the default setting and
     * update IOC configuration with new subfiling
     * configuration.
     */
    ioc_config.thread_pool_size = 2;
    ioc_config.subf_config      = subf_config.shared_cfg;

    /*
     * Create a File Access Property List for
     * the IOC VFD and set our new configuration
     * on it. We make a copy of the original
     * FAPL here so we get the MPI parameters
     * set on it
     */
    ioc_fapl = H5Pcopy(fapl_id);
    H5Pset_fapl_ioc(ioc_fapl, &ioc_config);

    /*
     * Close FAPLs in the default configurations
     * we retrieved and update the subfiling
     * configuration with our new IOC FAPL
     */
    H5Pclose(ioc_config.under_fapl_id);
    H5Pclose(subf_config.ioc_fapl_id);
    subf_config.ioc_fapl_id = ioc_fapl;

    /*
     * Finally, set our new Subfiling configuration
     * on the original FAPL
     *
     * NOTE(review): ioc_fapl is never closed in this function. Whether
     * H5Pset_fapl_subfiling() keeps its own copy of the IOC FAPL is not
     * visible from this file - confirm before adding an H5Pclose().
     */
    H5Pset_fapl_subfiling(fapl_id, &subf_config);

    /*
     * OPTIONAL: Set alignment of objects in HDF5 file to
     *           be equal to the Subfiling stripe size.
     *           Choosing a Subfiling stripe size and HDF5
     *           object alignment value that are some
     *           multiple of the disk block size can
     *           generally help performance by ensuring
     *           that I/O is well-aligned and doesn't
     *           excessively cross stripe boundaries.
     *
     *           Note that this option can substantially
     *           increase the size of the resulting HDF5
     *           files, so it is a good idea to keep an eye
     *           on this.
     */
    H5Pset_alignment(fapl_id, 0, 1048576); /* Align to custom 1MiB stripe size */

    /* Parse any parallel prefix and create filename */
    par_prefix = getenv("HDF5_PARAPREFIX");
    snprintf(filename, sizeof(filename), "%s%s%s", par_prefix ? par_prefix : "", par_prefix ? "/" : "",
             EXAMPLE_FILE2);

    /*
     * Create a new file collectively
     */
    file_id = H5Fcreate(filename, H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);

    /*
     * Create the dataspace for the dataset. The first
     * dimension varies with the number of MPI ranks
     * while the second dimension is fixed.
     */
    dset_dims[0] = (hsize_t)mpi_size;
    dset_dims[1] = EXAMPLE_DSET_NY;
    filespace    = H5Screate_simple(EXAMPLE_DSET_DIMS, dset_dims, NULL);

    /*
     * Create the dataset with default properties
     */
    dset_id = H5Dcreate2(file_id, EXAMPLE_DSET_NAME, EXAMPLE_DSET_DATATYPE, filespace, H5P_DEFAULT,
                         H5P_DEFAULT, H5P_DEFAULT);

    /*
     * Each MPI rank writes from a contiguous memory
     * region to the hyperslab in the file
     */
    start[0] = (hsize_t)mpi_rank;
    start[1] = 0;
    count[0] = 1;
    count[1] = dset_dims[1];
    H5Sselect_hyperslab(filespace, H5S_SELECT_SET, start, NULL, count, NULL);

    /*
     * Initialize data buffer. The element count is computed once
     * instead of on every loop iteration.
     */
    n_elems = (size_t)(count[0] * count[1]);
    data    = malloc(n_elems * sizeof(*data));

    if (data != NULL) {
        for (size_t i = 0; i < n_elems; i++)
            data[i] = (EXAMPLE_DSET_C_DATATYPE)((size_t)mpi_rank + i);

        /*
         * Write to dataset
         */
        H5Dwrite(dset_id, EXAMPLE_DSET_DATATYPE, H5S_BLOCK, filespace, H5P_DEFAULT, data);
    }
    else {
        /*
         * Do not touch a NULL buffer. The write uses the default transfer
         * property list, so skipping it on one rank is presumably safe
         * (independent I/O) - NOTE(review): confirm. The close calls
         * below are still made by every rank.
         */
        fprintf(stderr, "rank %d: couldn't allocate data buffer, skipping write\n", mpi_rank);
    }

    /*
     * Close/release resources.
     */
    free(data);
    H5Dclose(dset_id);
    H5Sclose(filespace);
    H5Fclose(file_id);

    /*
     * Delete the file that was actually created: the path that includes
     * any HDF5_PARAPREFIX prefix, not just the bare file name.
     */
    cleanup(filename, fapl_id);
}
/*
 * Program entry point for builds with parallel and Subfiling VFD support.
 *
 * Initializes MPI with MPI_THREAD_MULTIPLE, builds a File Access Property
 * List carrying the MPI communicator/info, runs the default-settings and
 * custom-settings write examples with it, then shuts everything down.
 * Always returns 0; aborts the MPI job if the requested thread level is
 * not provided.
 */
int
main(int argc, char **argv)
{
    MPI_Comm world        = MPI_COMM_WORLD;
    MPI_Info no_info      = MPI_INFO_NULL;
    hid_t    access_plist;
    int      nprocs;
    int      my_rank;
    int      thread_level = 0;

    /* HDF5 Subfiling VFD requires MPI_Init_thread with MPI_THREAD_MULTIPLE */
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &thread_level);
    if (thread_level < MPI_THREAD_MULTIPLE) {
        printf("MPI_THREAD_MULTIPLE not supported\n");
        MPI_Abort(world, -1);
    }

    MPI_Comm_size(world, &nprocs);
    MPI_Comm_rank(world, &my_rank);

    /*
     * Set up File Access Property List with MPI
     * parameters for the Subfiling VFD to use
     */
    access_plist = H5Pcreate(H5P_FILE_ACCESS);
    H5Pset_mpi_params(access_plist, world, no_info);

    /* First pass: Subfiling VFD with default settings */
    subfiling_write_default(access_plist, nprocs, my_rank);

    /* Second pass: Subfiling VFD with custom settings */
    subfiling_write_custom(access_plist, nprocs, my_rank);

    H5Pclose(access_plist);

    /* Only one rank reports completion to avoid duplicated output */
    if (0 == my_rank)
        printf("PHDF5 example finished with no errors\n");

    MPI_Finalize();

    return 0;
}
#else
/*
 * Fallback program used when HDF5 was built without parallel support
 * or without Subfiling VFD support: explain why the example cannot
 * run and exit successfully.
 */
int
main(void)
{
    static const char notice[] =
        "Example program cannot run - HDF5 must be built with parallel support and Subfiling VFD support\n";

    fputs(notice, stdout);

    return 0;
}
#endif /* H5_HAVE_PARALLEL && H5_HAVE_SUBFILING_VFD */

View File

@ -103,8 +103,25 @@ New Features
while also minimizing the locking issues of the single shared file approach
on a parallel file system.
Also included with the Subfiling VFD is a new h5fuse.sh script which
reads a Subfiling configuration file and then combines the various
sub-files back into a single HDF5 file. By default, the h5fuse.sh script
looks in the current directory for the Subfiling configuration file,
but can also be pointed to the configuration file with a command-line
option.
The Subfiling VFD can be used by calling H5Pset_fapl_subfiling() on a
File Access Property List and using that FAPL for file operations.
File Access Property List and using that FAPL for file operations. Note
that the Subfiling VFD currently has the following limitations:
* Does not currently support HDF5 collective I/O, other than collective
metadata writes and reads as set by H5Pset_coll_metadata_write() and
H5Pset_all_coll_metadata_ops()
* The Subfiling VFD should not currently be used with an HDF5 library
that has been built with thread-safety enabled. This can cause deadlocks
when failures occur due to interactions between the VFD's internal
threads and HDF5's global lock.
(JTH - 2022/07/22)

View File

@ -61,36 +61,6 @@ typedef struct H5FD_ioc_t {
char *file_dir; /* Directory where we find files */
char *file_path; /* The user defined filename */
#ifndef H5_HAVE_WIN32_API
/* On most systems the combination of device and i-node number uniquely
* identify a file. Note that Cygwin, MinGW and other Windows POSIX
* environments have the stat function (which fakes inodes)
* and will use the 'device + inodes' scheme as opposed to the
* Windows code further below.
*/
dev_t device; /* file device number */
#else
/* Files in windows are uniquely identified by the volume serial
* number and the file index (both low and high parts).
*
* There are caveats where these numbers can change, especially
* on FAT file systems. On NTFS, however, a file should keep
* those numbers the same until renamed or deleted (though you
* can use ReplaceFile() on NTFS to keep the numbers the same
* while renaming).
*
* See the MSDN "BY_HANDLE_FILE_INFORMATION Structure" entry for
* more information.
*
* http://msdn.microsoft.com/en-us/library/aa363788(v=VS.85).aspx
*/
DWORD nFileIndexLow;
DWORD nFileIndexHigh;
DWORD dwVolumeSerialNumber;
HANDLE hFile; /* Native windows file handle */
#endif /* H5_HAVE_WIN32_API */
} H5FD_ioc_t;
/*
@ -490,7 +460,7 @@ H5FD__ioc_get_default_config(hid_t fapl_id, H5FD_ioc_config_t *config_out)
H5_SUBFILING_GOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI I/O VFD on IOC under FAPL");
/* Specific to this I/O Concentrator */
config_out->thread_pool_count = H5FD_IOC_DEFAULT_THREAD_POOL_SIZE;
config_out->thread_pool_size = H5FD_IOC_DEFAULT_THREAD_POOL_SIZE;
done:
if (H5_mpi_comm_free(&comm) < 0)
@ -536,7 +506,12 @@ H5FD__ioc_validate_config(const H5FD_ioc_config_t *fa)
if (fa->magic != H5FD_IOC_FAPL_MAGIC)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid H5FD_ioc_config_t magic value");
/* TODO: add extra IOC configuration validation code */
if (fa->under_fapl_id < 0)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid under FAPL ID");
if (fa->subf_config.ioc_selection < SELECT_IOC_ONE_PER_NODE ||
fa->subf_config.ioc_selection >= ioc_selection_options)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method");
done:
H5_SUBFILING_FUNC_LEAVE;
@ -850,24 +825,15 @@ H5FD__ioc_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr)
}
if (NULL != (file_ptr->file_path = HDrealpath(name, NULL))) {
char *path = NULL;
char *directory = dirname(path);
if (NULL == (path = HDstrdup(file_ptr->file_path)))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy subfiling subfile path");
if (NULL == (file_ptr->file_dir = HDstrdup(directory))) {
HDfree(path);
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL,
"can't copy subfiling subfile directory path");
if (H5_dirname(file_ptr->file_path, &file_ptr->file_dir) < 0) {
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't get subfile dirname");
}
HDfree(path);
}
else {
if (ENOENT == errno) {
if (NULL == (file_ptr->file_path = HDstrdup(name)))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy file name");
if (NULL == (file_ptr->file_dir = HDstrdup(".")))
if (NULL == (file_ptr->file_dir = H5MM_strdup(".")))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "can't set subfile directory path");
}
else
@ -983,7 +949,7 @@ H5FD__ioc_close_int(H5FD_ioc_t *file_ptr)
#ifdef H5FD_IOC_DEBUG
{
subfiling_context_t *sf_context = H5_get_subfiling_object(file_ptr->fa.context_id);
subfiling_context_t *sf_context = H5_get_subfiling_object(file_ptr->context_id);
if (sf_context) {
if (sf_context->topology->rank_is_ioc)
HDprintf("[%s %d] fd=%d\n", __func__, file_ptr->mpi_rank, sf_context->sf_fid);
@ -1035,7 +1001,7 @@ done:
HDfree(file_ptr->file_path);
file_ptr->file_path = NULL;
HDfree(file_ptr->file_dir);
H5MM_free(file_ptr->file_dir);
file_ptr->file_dir = NULL;
/* Release the file info */
@ -1089,8 +1055,31 @@ H5FD__ioc_cmp(const H5FD_t *_f1, const H5FD_t *_f2)
HDassert(f1);
HDassert(f2);
ret_value = H5FD_cmp(f1->ioc_file, f2->ioc_file);
if (f1->ioc_file && f1->ioc_file->cls && f1->ioc_file->cls->cmp && f2->ioc_file && f2->ioc_file->cls &&
f2->ioc_file->cls->cmp) {
ret_value = H5FD_cmp(f1->ioc_file, f2->ioc_file);
}
else {
h5_stat_t st1;
h5_stat_t st2;
/*
* If under VFD has no compare routine, get
* inode of HDF5 stub file and compare them
*
* Note that the compare callback doesn't
* allow for failure, so we just return -1
* if stat fails.
*/
if (HDstat(f1->file_path, &st1) < 0)
H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, -1, "couldn't stat file");
if (HDstat(f2->file_path, &st2) < 0)
H5_SUBFILING_SYS_GOTO_ERROR(H5E_VFL, H5E_CANTGET, -1, "couldn't stat file");
ret_value = (st1.st_ino > st2.st_ino) - (st1.st_ino < st2.st_ino);
}
done:
H5_SUBFILING_FUNC_LEAVE;
} /* end H5FD__ioc_cmp */
@ -1607,8 +1596,6 @@ H5FD__ioc_del(const char *name, hid_t fapl)
MPI_Comm comm = MPI_COMM_NULL;
MPI_Info info = MPI_INFO_NULL;
FILE *config_file = NULL;
char *name_copy = NULL;
char *name_copy2 = NULL;
char *tmp_filename = NULL;
char *base_filename = NULL;
char *file_dirname = NULL;
@ -1647,13 +1634,10 @@ H5FD__ioc_del(const char *name, hid_t fapl)
if (HDstat(name, &st) < 0)
H5_SUBFILING_SYS_GOTO_ERROR(H5E_FILE, H5E_SYSERRSTR, FAIL, "HDstat failed");
if (NULL == (name_copy = HDstrdup(name)))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't copy filename");
if (NULL == (name_copy2 = HDstrdup(name)))
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't copy filename");
base_filename = basename(name_copy);
file_dirname = dirname(name_copy2);
if (H5_basename(name, &base_filename) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file basename");
if (H5_dirname(name, &file_dirname) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't get file dirname");
/* Try to open the subfiling configuration file and get the number of IOCs */
if (NULL == (tmp_filename = HDmalloc(PATH_MAX)))
@ -1732,8 +1716,8 @@ done:
H5_SUBFILING_DONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI info object");
HDfree(tmp_filename);
HDfree(name_copy);
HDfree(name_copy2);
H5MM_free(file_dirname);
H5MM_free(base_filename);
H5_SUBFILING_FUNC_LEAVE;
}

View File

@ -108,7 +108,7 @@
* for compatibility with legacy HDF5 applications. The default driver used
* is currently the #H5FD_MPIO driver.
*
* \var int32_t H5FD_ioc_config_t::thread_pool_count
* \var int32_t H5FD_ioc_config_t::thread_pool_size
* The number of I/O concentrator worker threads to use.
*
* This value can also be set or adjusted with the #H5FD_IOC_THREAD_POOL_SIZE
@ -121,10 +121,10 @@
*
*/
typedef struct H5FD_ioc_config_t {
uint32_t magic; /* Must be set to H5FD_IOC_FAPL_MAGIC */
uint32_t version; /* Must be set to H5FD_IOC_CURR_FAPL_VERSION */
hid_t under_fapl_id; /* FAPL setup with the VFD to use for I/O to the HDF5 stub file */
int32_t thread_pool_count; /* Number of I/O concentrator worker threads to use */
uint32_t magic; /* Must be set to H5FD_IOC_FAPL_MAGIC */
uint32_t version; /* Must be set to H5FD_IOC_CURR_FAPL_VERSION */
hid_t under_fapl_id; /* FAPL setup with the VFD to use for I/O to the HDF5 stub file */
int32_t thread_pool_size; /* Number of I/O concentrator worker threads to use */
H5FD_subfiling_shared_config_t subf_config; /* Subfiling driver configuration */
} H5FD_ioc_config_t;
//! <!-- [H5FD_ioc_config_t_snip] -->

View File

@ -112,9 +112,9 @@ static void ioc_io_queue_add_entry(ioc_data_t *ioc_data, sf_work_request_t *wk_r
int
initialize_ioc_threads(void *_sf_context)
{
subfiling_context_t *sf_context = _sf_context;
ioc_data_t *ioc_data = NULL;
unsigned thread_pool_count = H5FD_IOC_DEFAULT_THREAD_POOL_SIZE;
subfiling_context_t *sf_context = _sf_context;
ioc_data_t *ioc_data = NULL;
unsigned thread_pool_size = H5FD_IOC_DEFAULT_THREAD_POOL_SIZE;
char *env_value;
int ret_value = 0;
#ifdef H5FD_IOC_COLLECT_STATS
@ -173,12 +173,12 @@ initialize_ioc_threads(void *_sf_context)
if ((env_value = HDgetenv(H5FD_IOC_THREAD_POOL_SIZE)) != NULL) {
int value_check = HDatoi(env_value);
if (value_check > 0) {
thread_pool_count = (unsigned int)value_check;
thread_pool_size = (unsigned int)value_check;
}
}
/* Initialize a thread pool for the I/O concentrator's worker threads */
if (hg_thread_pool_init(thread_pool_count, &ioc_data->io_thread_pool) < 0)
if (hg_thread_pool_init(thread_pool_size, &ioc_data->io_thread_pool) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTINIT, (-1), "can't initialize IOC worker thread pool");
/* Create the main IOC thread that will receive and dispatch I/O requests */
@ -194,11 +194,9 @@ initialize_ioc_threads(void *_sf_context)
t_end = MPI_Wtime();
#ifdef H5FD_IOC_DEBUG
if (sf_verbose_flag) {
if (sf_context->topology->subfile_rank == 0) {
HDprintf("%s: time = %lf seconds\n", __func__, (t_end - t_start));
HDfflush(stdout);
}
if (sf_context->topology->subfile_rank == 0) {
HDprintf("%s: time = %lf seconds\n", __func__, (t_end - t_start));
HDfflush(stdout);
}
#endif
@ -242,6 +240,10 @@ finalize_ioc_threads(void *_sf_context)
hg_thread_join(ioc_data->ioc_main_thread);
}
if (ioc_data->io_queue.num_failed > 0)
H5_SUBFILING_DONE_ERROR(H5E_IO, H5E_CLOSEERROR, -1, "%" PRId32 " I/O requests failed",
ioc_data->io_queue.num_failed);
HDfree(ioc_data);
H5_SUBFILING_FUNC_LEAVE;
@ -418,7 +420,6 @@ ioc_main(ioc_data_t *ioc_data)
wk_req.subfile_rank = subfile_rank;
wk_req.context_id = ioc_data->sf_context_id;
wk_req.start_time = queue_start_time;
wk_req.buffer = NULL;
ioc_io_queue_add_entry(ioc_data, &wk_req);
@ -521,8 +522,6 @@ handle_work_request(void *arg)
atomic_fetch_add(&ioc_data->sf_work_pending, 1);
msg->in_progress = 1;
switch (msg->tag) {
case WRITE_INDEP:
op_ret = ioc_file_queue_write_indep(msg, msg->subfile_rank, msg->source, sf_context->sf_data_comm,
@ -744,15 +743,10 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source,
t_start = MPI_Wtime();
t_queue_delay = t_start - msg->start_time;
#ifdef H5FD_IOC_DEBUG
if (sf_verbose_flag) {
if (sf_logfile) {
HDfprintf(sf_logfile,
"[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld, "
"queue_delay = %lf seconds\n",
subfile_rank, __func__, source, data_size, file_offset, t_queue_delay);
}
}
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(file_context_id,
"[ioc(%d) %s]: msg from %d: datasize=%ld\toffset=%ld, queue_delay = %lf seconds\n",
subfile_rank, __func__, source, data_size, file_offset, t_queue_delay);
#endif
#endif
@ -799,20 +793,16 @@ ioc_file_queue_write_indep(sf_work_request_t *msg, int subfile_rank, int source,
t_start = t_end;
#ifdef H5FD_IOC_DEBUG
if (sf_verbose_flag) {
if (sf_logfile) {
HDfprintf(sf_logfile, "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n", subfile_rank,
__func__, data_size, source, mpi_code);
}
}
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(file_context_id, "[ioc(%d) %s] MPI_Recv(%ld bytes, from = %d) status = %d\n",
subfile_rank, __func__, data_size, source, mpi_code);
#endif
#endif
sf_fid = sf_context->sf_fid;
#ifdef H5FD_IOC_DEBUG
#ifdef H5_SUBFILING_DEBUG
if (sf_fid < 0)
H5_subfiling_log(file_context_id, "%s: WARNING: attempt to write data to closed subfile FID %d",
__func__, sf_fid);
@ -919,13 +909,10 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source,
t_start = MPI_Wtime();
t_queue_delay = t_start - msg->start_time;
#ifdef H5FD_IOC_DEBUG
if (sf_verbose_flag && (sf_logfile != NULL)) {
HDfprintf(sf_logfile,
"[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld "
"queue_delay=%lf seconds\n",
subfile_rank, __func__, source, data_size, file_offset, t_queue_delay);
}
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(file_context_id,
"[ioc(%d) %s] msg from %d: datasize=%ld\toffset=%ld queue_delay=%lf seconds\n",
subfile_rank, __func__, source, data_size, file_offset, t_queue_delay);
#endif
#endif
@ -959,10 +946,9 @@ ioc_file_queue_read_indep(sf_work_request_t *msg, int subfile_rank, int source,
sf_pread_time += t_read;
sf_queue_delay_time += t_queue_delay;
#ifdef H5FD_IOC_DEBUG
if (sf_verbose_flag && (sf_logfile != NULL)) {
HDfprintf(sf_logfile, "[ioc(%d)] MPI_Send to source(%d) completed\n", subfile_rank, source);
}
#ifdef H5_SUBFILING_DEBUG
H5_subfiling_log(sf_context->sf_context_id, "[ioc(%d)] MPI_Send to source(%d) completed\n", subfile_rank,
source);
#endif
#endif
@ -1598,7 +1584,7 @@ ioc_io_queue_complete_entry(ioc_data_t *ioc_data, ioc_io_queue_entry_t *entry_pt
#ifdef H5FD_IOC_COLLECT_STATS
/* Compute the queued and execution time */
queued_time = entry_ptr->dispatch_time - entry_ptr->q_time;
execution_time = H5_now_usec() = entry_ptr->dispatch_time;
execution_time = H5_now_usec() - entry_ptr->dispatch_time;
ioc_data->io_queue.requests_completed++;
@ -1608,8 +1594,6 @@ ioc_io_queue_complete_entry(ioc_data_t *ioc_data, ioc_io_queue_entry_t *entry_pt
hg_thread_mutex_unlock(&ioc_data->io_queue.q_mutex);
HDassert(entry_ptr->wk_req.buffer == NULL);
ioc_io_queue_free_entry(entry_ptr);
entry_ptr = NULL;
@ -1642,7 +1626,6 @@ ioc_io_queue_free_entry(ioc_io_queue_entry_t *q_entry_ptr)
HDassert(q_entry_ptr->magic == H5FD_IOC__IO_Q_ENTRY_MAGIC);
HDassert(q_entry_ptr->next == NULL);
HDassert(q_entry_ptr->prev == NULL);
HDassert(q_entry_ptr->wk_req.buffer == NULL);
q_entry_ptr->magic = 0;

View File

@ -192,6 +192,59 @@ done:
* invalid data if other ranks perform writes while this
* operation is in progress.
*
* SUBFILING NOTE:
* The EOF calculation for subfiling is somewhat different
* than for the more traditional HDF5 file implementations.
* This statement derives from the fact that unlike "normal"
* HDF5 files, subfiling introduces a multi-file representation
* of a single HDF5 file. The plurality of sub-files represents
* a software RAID-0 based HDF5 file. As such, each sub-file
* contains a designated portion of the address space of the
* virtual HDF5 storage. We have no notion of HDF5 datatypes,
* datasets, metadata, or other HDF5 structures; only BYTES.
*
* The organization of the bytes within sub-files is consistent
* with the RAID-0 striping, i.e. there are IO Concentrators
* (IOCs) which correspond to a stripe-count (in Lustre) as
* well as a stripe_size. The combination of these two
* variables determines the "address" (a combination of IOC
* and a file offset) of any storage operation.
*
* Having a defined storage layout, the virtual file EOF
* calculation should be the MAXIMUM value returned by the
* collection of IOCs. Every MPI rank which hosts an IOC
* maintains its own EOF by updating that value for each
* WRITE operation that completes, i.e. if a new local EOF
* is greater than the existing local EOF, the new EOF
* will replace the old. The local EOF calculation is as
* follows.
* 1. At file creation, each IOC is assigned a rank value
* (0 to N-1, where N is the total number of IOCs) and
* a 'sf_base_addr' = 'subfile_rank' * 'sf_stripe_size')
* we also determine the 'sf_blocksize_per_stripe' which
* is simply the 'sf_stripe_size' * 'n_ioc_concentrators'
*
* 2. For every write operation, the IOC receives a message
* containing a file_offset and the data_size.
*
* 3. The file_offset + data_size are in turn used to
* create a stripe_id:
* IOC-(ioc_rank) IOC-(ioc_rank+1)
* |<- sf_base_address |<- sf_base_address |
* ID +--------------------+--------------------+
* 0:|<- sf_stripe_size ->|<- sf_stripe_size ->|
* 1:|<- sf_stripe_size ->|<- sf_stripe_size ->|
* ~ ~ ~
* N:|<- sf_stripe_size ->|<- sf_stripe_size ->|
* +--------------------+--------------------+
*
* The new 'stripe_id' is then used to calculate a
* potential new EOF:
* sf_eof = (stripe_id * sf_blocksize_per_stripe) + sf_base_addr
* + ((file_offset + data_size) % sf_stripe_size)
*
* 4. If (sf_eof > current_sf_eof), then current_sf_eof = sf_eof.
*
* Return: SUCCEED/FAIL
*
* Programmer: JRM -- 1/18/22

View File

@ -108,37 +108,6 @@ typedef struct H5FD_subfiling_t {
char *file_dir; /* Directory where we find files */
char *file_path; /* The user defined filename */
#ifndef H5_HAVE_WIN32_API
/* On most systems the combination of device and i-node number uniquely
* identify a file. Note that Cygwin, MinGW and other Windows POSIX
* environments have the stat function (which fakes inodes)
* and will use the 'device + inodes' scheme as opposed to the
* Windows code further below.
*/
dev_t device; /* file device number */
ino_t inode; /* file i-node number */
#else
/* Files in windows are uniquely identified by the volume serial
* number and the file index (both low and high parts).
*
* There are caveats where these numbers can change, especially
* on FAT file systems. On NTFS, however, a file should keep
* those numbers the same until renamed or deleted (though you
* can use ReplaceFile() on NTFS to keep the numbers the same
* while renaming).
*
* See the MSDN "BY_HANDLE_FILE_INFORMATION Structure" entry for
* more information.
*
* http://msdn.microsoft.com/en-us/library/aa363788(v=VS.85).aspx
*/
DWORD nFileIndexLow;
DWORD nFileIndexHigh;
DWORD dwVolumeSerialNumber;
HANDLE hFile; /* Native windows file handle */
#endif /* H5_HAVE_WIN32_API */
/*
* The element layouts above this point are identical with the
* H5FD_ioc_t structure. As a result,
@ -175,18 +144,6 @@ typedef struct H5FD_subfiling_t {
#define REGION_OVERFLOW(A, Z) \
(ADDR_OVERFLOW(A) || SIZE_OVERFLOW(Z) || HADDR_UNDEF == (A) + (Z) || (HDoff_t)((A) + (Z)) < (HDoff_t)(A))
#define H5FD_SUBFILING_DEBUG_OP_CALLS 0 /* debugging print toggle; 0 disables */
#if H5FD_SUBFILING_DEBUG_OP_CALLS
#define H5FD_SUBFILING_LOG_CALL(name) \
do { \
HDprintf("called %s()\n", (name)); \
HDfflush(stdout); \
} while (0)
#else
#define H5FD_SUBFILING_LOG_CALL(name) /* no-op */
#endif /* H5FD_SUBFILING_DEBUG_OP_CALLS */
/* Prototypes */
static herr_t H5FD__subfiling_term(void);
static void *H5FD__subfiling_fapl_get(H5FD_t *_file);
@ -393,18 +350,6 @@ H5FD__subfiling_term(void)
herr_t ret_value = SUCCEED;
if (H5FD_SUBFILING_g >= 0) {
/* Free the subfiling application layout information */
if (sf_app_layout) {
HDfree(sf_app_layout->layout);
sf_app_layout->layout = NULL;
HDfree(sf_app_layout->node_ranks);
sf_app_layout->node_ranks = NULL;
HDfree(sf_app_layout);
sf_app_layout = NULL;
}
/* Unregister from HDF5 error API */
if (H5subfiling_err_class_g >= 0) {
if (H5Eunregister_class(H5subfiling_err_class_g) < 0)
@ -646,12 +591,21 @@ H5FD__subfiling_validate_config(const H5FD_subfiling_config_t *fa)
HDassert(fa != NULL);
if (fa->version != H5FD_SUBFILING_CURR_FAPL_VERSION)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "Unknown H5FD_subfiling_config_t version");
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "unknown H5FD_subfiling_config_t version");
if (fa->magic != H5FD_SUBFILING_FAPL_MAGIC)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid H5FD_subfiling_config_t magic value");
/* TODO: add extra subfiling configuration validation code */
if (fa->ioc_fapl_id < 0)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC FAPL ID");
if (!fa->require_ioc)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL,
"Subfiling VFD currently always requires IOC VFD to be used");
if (fa->shared_cfg.ioc_selection < SELECT_IOC_ONE_PER_NODE ||
fa->shared_cfg.ioc_selection >= ioc_selection_options)
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid IOC selection method");
done:
H5_SUBFILING_FUNC_LEAVE;
@ -724,8 +678,6 @@ H5FD__copy_plist(hid_t fapl_id, hid_t *id_out_ptr)
int ret_value = 0;
H5P_genplist_t *plist_ptr = NULL;
H5FD_SUBFILING_LOG_CALL(__func__);
HDassert(id_out_ptr != NULL);
if (FALSE == H5P_isa_class(fapl_id, H5P_FILE_ACCESS))
@ -917,24 +869,22 @@ H5FD__subfiling_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t ma
}
if (NULL != (file_ptr->file_path = HDrealpath(name, NULL))) {
char *path = NULL;
char *directory = dirname(path);
char *path = NULL;
if (NULL == (path = HDstrdup(file_ptr->file_path)))
if (NULL == (path = H5MM_strdup(file_ptr->file_path)))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy subfiling subfile path");
if (NULL == (file_ptr->file_dir = HDstrdup(directory))) {
HDfree(path);
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL,
"can't copy subfiling subfile directory path");
if (H5_dirname(path, &file_ptr->file_dir) < 0) {
H5MM_free(path);
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, NULL, "couldn't get subfile dirname");
}
HDfree(path);
H5MM_free(path);
}
else {
if (ENOENT == errno) {
if (NULL == (file_ptr->file_path = HDstrdup(name)))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTCOPY, NULL, "can't copy file name");
if (NULL == (file_ptr->file_dir = HDstrdup(".")))
if (NULL == (file_ptr->file_dir = H5MM_strdup(".")))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTOPENFILE, NULL, "can't set subfile directory path");
}
else
@ -1041,21 +991,6 @@ H5FD__subfiling_close_int(H5FD_subfiling_t *file_ptr)
HDassert(file_ptr);
#if H5FD_SUBFILING_DEBUG_OP_CALLS
{
subfiling_context_t *sf_context = H5_get_subfiling_object(file_ptr->context_id);
HDassert(sf_context);
HDassert(sf_context->topology);
if (sf_context->topology->rank_is_ioc)
HDprintf("[%s %d] fd=%d\n", __func__, file_ptr->mpi_rank, sf_context->sf_fid);
else
HDprintf("[%s %d] fd=*\n", __func__, file_ptr->mpi_rank);
HDfflush(stdout);
}
#endif
if (file_ptr->sf_file && H5FD_close(file_ptr->sf_file) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_IO, H5E_CANTCLOSEFILE, FAIL, "unable to close subfile");
@ -1081,7 +1016,7 @@ done:
HDfree(file_ptr->file_path);
file_ptr->file_path = NULL;
HDfree(file_ptr->file_dir);
H5MM_free(file_ptr->file_dir);
file_ptr->file_dir = NULL;
/* Release the file info */
@ -1237,87 +1172,18 @@ H5FD__subfiling_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t a
* Return: End of file address, the first address past the end of the
* "file", either the filesystem file or the HDF5 file.
*
* SUBFILING NOTE:
* The EOF calculation for subfiling is somewhat different
* than for the more traditional HDF5 file implementations.
* This statement derives from the fact that unlike "normal"
* HDF5 files, subfiling introduces a multi-file representation
* of a single HDF5 file. The plurality of sub-files represents
* a software RAID-0 based HDF5 file. As such, each sub-file
* contains a designated portion of the address space of the
* virtual HDF5 storage. We have no notion of HDF5 datatypes,
* datasets, metadata, or other HDF5 structures; only BYTES.
*
* The organization of the bytes within sub-files is consistent
* with the RAID-0 striping, i.e. there are IO Concentrators
* (IOCs) which correspond to a stripe-count (in Lustre) as
* well as a stripe_size. The combination of these two
* variables determines the "address" (a combination of IOC
* and a file offset) of any storage operation.
*
* Having a defined storage layout, the virtual file EOF
* calculation should be the MAXIMUM value returned by the
* collection of IOCs. Every MPI rank which hosts an IOC
* maintains its own EOF by updating that value for each
* WRITE operation that completes, i.e. if a new local EOF
* is greater than the existing local EOF, the new EOF
* will replace the old. The local EOF calculation is as
* follows.
* 1. At file creation, each IOC is assigned a rank value
* (0 to N-1, where N is the total number of IOCs) and
* a 'sf_base_addr' (= 'subfile_rank' * 'sf_stripe_size').
* We also determine the 'sf_blocksize_per_stripe', which
* is simply 'sf_stripe_size' * 'n_ioc_concentrators'.
*
* 2. For every write operation, the IOC receives a message
* containing a file_offset and the data_size.
*
* 3. The file_offset + data_size are in turn used to
* create a stripe_id:
* IOC-(ioc_rank) IOC-(ioc_rank+1)
* |<- sf_base_address |<- sf_base_address |
* ID +--------------------+--------------------+
* 0:|<- sf_stripe_size ->|<- sf_stripe_size ->|
* 1:|<- sf_stripe_size ->|<- sf_stripe_size ->|
* ~ ~ ~
* N:|<- sf_stripe_size ->|<- sf_stripe_size ->|
* +--------------------+--------------------+
*
* The new 'stripe_id' is then used to calculate a
* potential new EOF:
* sf_eof = (stripe_id * sf_blocksize_per_stripe) + sf_base_addr
* + ((file_offset + data_size) % sf_stripe_size)
*
* 4. If (sf_eof > current_sf_eof), then current_sf_eof = sf_eof.
*
*
* Programmer: Richard Warren
* NOTE: This VFD mimics the MPI I/O VFD and so does not try
* to keep the EOF updated. The EOF is mostly just needed
* right after the file is opened so the library can determine
* if the file is empty, truncated or okay.
*
*-------------------------------------------------------------------------
*/
static haddr_t
H5FD__subfiling_get_eof(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
{
const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file;
#if 0
int64_t logical_eof = -1;
#endif
haddr_t ret_value = HADDR_UNDEF;
#if 0
/*
* TODO: this is a heavy weight implementation. We need something like this
* for file open, and probably for file close. However, in between, something
* similar to the current solution in the MPIIO VFD might be more appropriate.
*/
if (H5FD__subfiling__get_real_eof(file->fa.context_id, &logical_eof) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, HADDR_UNDEF, "can't get EOF")
/* Return the global max of all the subfile EOF values */
ret_value = (haddr_t)(logical_eof);
done:
#endif
const H5FD_subfiling_t *file = (const H5FD_subfiling_t *)_file;
haddr_t ret_value = HADDR_UNDEF;
ret_value = file->eof;
@ -1390,8 +1256,7 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL,
"addr overflow, addr = %" PRIuHADDR ", size = %" PRIuHADDR, addr, size);
/* TODO: Temporarily reject collective I/O until support is implemented (unless types are simple MPI_BYTE)
*/
/* Temporarily reject collective I/O until support is implemented (unless types are simple MPI_BYTE) */
{
H5FD_mpio_xfer_t xfer_mode;
@ -1419,11 +1284,6 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT);
}
#if H5FD_SUBFILING_DEBUG_OP_CALLS
HDprintf("[%s %d] addr=%ld, size=%ld\n", __func__, file_ptr->mpi_rank, addr, size);
HDfflush(stdout);
#endif
/*
* Retrieve the subfiling context object and the number
* of I/O concentrators.
@ -1442,14 +1302,6 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
ioc_total = sf_context->topology->n_io_concentrators;
#if H5FD_SUBFILING_DEBUG_OP_CALLS
if (sf_context->topology->rank_is_ioc)
HDprintf("[%s %d] fd=%d\n", __func__, file_ptr->mpi_rank, sf_context->sf_fid);
else
HDprintf("[%s %d] fd=*\n", __func__, file_ptr->mpi_rank);
HDfflush(stdout);
#endif
if (ioc_total == 0) {
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of I/O concentrators (%d)",
ioc_total);
@ -1539,18 +1391,6 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate subfile I/O buffers vector");
/* TODO: The following is left for future work */
/*
* Set ASYNC MODE
* H5FD_class_aio_t *async_file_ptr = (H5FD_class_aio_t *)file_ptr->sf_file;
* uint64_t op_code_begin = OPC_BEGIN;
* uint64_t op_code_complete = OPC_COMPLETE;
* const void *input = NULL;
* void *output = NULL;
* H5FDctl(file_ptr->sf_file, op_code_begin, flags, input, &output);
* (*async_file_ptr->h5fdctl)(file_ptr->sf_file, op_code_begin, flags, input, &output);
*/
for (int64_t i = 0; i < max_io_req_per_ioc; i++) {
uint32_t final_vec_len = vector_len;
int next_ioc = ioc_start;
@ -1588,9 +1428,6 @@ H5FD__subfiling_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr
if (MPI_SUCCESS != MPI_Bcast(buf, (int)size, MPI_BYTE, 0, file_ptr->comm))
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "can't broadcast data from rank 0");
}
/* TODO: The following is left for future work */
/* H5FDctl(file_ptr->sf_file, op_code_complete, flags, input, &output); */
}
}
@ -1658,8 +1495,7 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_OVERFLOW, FAIL,
"addr overflow, addr = %" PRIuHADDR ", size = %" PRIuHADDR, addr, size);
/* TODO: Temporarily reject collective I/O until support is implemented (unless types are simple MPI_BYTE)
*/
/* Temporarily reject collective I/O until support is implemented (unless types are simple MPI_BYTE) */
{
H5FD_mpio_xfer_t xfer_mode;
@ -1684,11 +1520,6 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
H5CX_set_io_xfer_mode(H5FD_MPIO_INDEPENDENT);
}
#if H5FD_SUBFILING_DEBUG_OP_CALLS
HDprintf("[%s %d] addr=%ld, size=%ld\n", __func__, file_ptr->mpi_rank, addr, size);
HDfflush(stdout);
#endif
/*
* Retrieve the subfiling context object and the number
* of I/O concentrators.
@ -1707,14 +1538,6 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
ioc_total = sf_context->topology->n_io_concentrators;
#if H5FD_SUBFILING_DEBUG_OP_CALLS
if (sf_context->topology->rank_is_ioc)
HDprintf("[%s %d] fd=%d\n", __func__, file_ptr->mpi_rank, sf_context->sf_fid);
else
HDprintf("[%s %d] fd=*\n", __func__, file_ptr->mpi_rank);
HDfflush(stdout);
#endif
if (ioc_total == 0) {
H5_SUBFILING_GOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "invalid number of I/O concentrators (%d)",
ioc_total);
@ -1804,18 +1627,6 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
H5_SUBFILING_GOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"can't allocate subfile I/O buffers vector");
/* TODO: The following is left for future work */
/*
* Set ASYNC MODE
* H5FD_class_aio_t *async_file_ptr = (H5FD_class_aio_t *)file_ptr->sf_file;
* uint64_t op_code_begin = OPC_BEGIN;
* uint64_t op_code_complete = OPC_COMPLETE;
* const void *input = NULL;
* void *output = NULL;
* H5FDctl(file_ptr->sf_file, op_code_begin, flags, input, &output);
* (*async_file_ptr->h5fdctl)(file_ptr->sf_file, op_code_begin, flags, input, &output);
*/
for (int64_t i = 0; i < max_io_req_per_ioc; i++) {
uint32_t final_vec_len = vector_len;
int next_ioc = ioc_start;
@ -1845,9 +1656,6 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
io_bufs) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write to subfile failed");
}
/* TODO: The following is left for future work */
/* H5FDctl(file_ptr->sf_file, op_code_complete, flags, input, &output); */
}
}
@ -1858,15 +1666,11 @@ H5FD__subfiling_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t add
file_ptr->pos = addr;
file_ptr->op = OP_WRITE;
#if 1 /* Mimic the MPI I/O VFD */
/* Mimic the MPI I/O VFD */
file_ptr->eof = HADDR_UNDEF;
if (file_ptr->pos > file_ptr->local_eof)
file_ptr->local_eof = file_ptr->pos;
#else
if (file_ptr->pos > file_ptr->eof)
file_ptr->eof = file_ptr->pos;
#endif
done:
HDfree(io_bufs);
@ -2235,7 +2039,6 @@ H5FD__subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5
HDassert(file);
/* Extend the file to make sure it's large enough */
#if 1 /* Mimic the MPI I/O VFD */
if (!H5F_addr_eq(file->eoa, file->last_eoa)) {
int64_t sf_eof;
int64_t eoa;
@ -2274,29 +2077,6 @@ H5FD__subfiling_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, hbool_t H5
/* Update the 'last' eoa value */
file->last_eoa = file->eoa;
}
#else
if (!H5F_addr_eq(file->eoa, file->eof)) {
/* Update the eof value */
file->eof = file->eoa;
/* Reset last file I/O information */
file->pos = HADDR_UNDEF;
file->op = OP_UNKNOWN;
/* Update the 'last' eoa value */
file->last_eoa = file->eoa;
} /* end if */
/* truncate sub-files */
/* This is a hack. We should be doing the truncate of the sub-files via calls to
* H5FD_truncate() with the IOC. However, that system is messed up at present.
* thus the following hack.
* JRM -- 12/18/21
*/
if (H5FD__subfiling__truncate_sub_files(file->context_id, file->eof, file->comm) < 0)
H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTUPDATE, FAIL, "sub-file truncate request failed");
#endif
done:
H5_SUBFILING_FUNC_LEAVE_API;
@ -2325,7 +2105,6 @@ H5FD__subfiling_lock(H5FD_t *_file, hbool_t rw)
HDassert(file);
/* TODO: Consider lock only on IOC ranks for one IOC per subfile case */
if (file->fa.require_ioc) {
#ifdef VERBOSE
HDputs("Subfiling driver doesn't support file locking");

File diff suppressed because it is too large Load Diff

View File

@ -23,6 +23,7 @@
#include "H5Iprivate.h"
#include "H5FDsubfiling.h"
#include "H5FDioc.h"
/*
* Some definitions for debugging the Subfiling feature
@ -189,25 +190,15 @@ typedef struct {
*/
typedef struct {
/* {Datasize, Offset, FileID} */
int64_t header[3]; /* The basic RPC input plus */
int tag; /* the supplied OPCODE tag */
int source; /* Rank of who sent the message */
int subfile_rank; /* The IOC rank */
int64_t context_id; /* context to be used to complete */
double start_time; /* the request, + time of receipt */
/* from which we calc Time(queued) */
void *buffer; /* for writes, we keep the buffer */
/* around for a while... */
volatile int in_progress; /* Not used! */
volatile int serialize; /* worker thread needs to wait while true */
volatile int dependents; /* If current work item has dependents */
int depend_id; /* work queue index of the dependent */
int64_t header[3]; /* The basic RPC input plus */
int tag; /* the supplied OPCODE tag */
int source; /* Rank of who sent the message */
int subfile_rank; /* The IOC rank */
int64_t context_id; /* context to be used to complete */
double start_time; /* the request, + time of receipt */
/* from which we calc Time(queued) */
} sf_work_request_t;
extern int sf_verbose_flag;
extern app_layout_t *sf_app_layout;
#ifdef __cplusplus
extern "C" {
#endif
@ -225,8 +216,6 @@ H5_DLL herr_t H5_get_num_iocs_from_config_file(FILE *config_file, int *n_io_con
H5_DLL void H5_subfiling_log(int64_t sf_context_id, const char *fmt, ...);
void set_verbose_flag(int subfile_rank, int new_value);
#ifdef __cplusplus
}
#endif

View File

@ -342,11 +342,11 @@ setup_vfd_test_file(int file_name_id, char *file_name, int mpi_size, H5FD_mpio_x
/* shared_cfg = */ shared_conf,
};
H5FD_ioc_config_t ioc_config = {
/* magic = */ H5FD_IOC_FAPL_MAGIC,
/* version = */ H5FD_IOC_CURR_FAPL_VERSION,
/* under_fapl_id = */ H5P_DEFAULT,
/* thread_pool_count = */ H5FD_IOC_DEFAULT_THREAD_POOL_SIZE,
/* subf_config = */ shared_conf,
/* magic = */ H5FD_IOC_FAPL_MAGIC,
/* version = */ H5FD_IOC_CURR_FAPL_VERSION,
/* under_fapl_id = */ H5P_DEFAULT,
/* thread_pool_size = */ H5FD_IOC_DEFAULT_THREAD_POOL_SIZE,
/* subf_config = */ shared_conf,
};
hid_t ioc_fapl = H5I_INVALID_HID;