hdf5/src/H5FDmpio.c
vchoi-hdfgroup ed31aaca79
Implement selection vector I/O with collective chunk filling (#3826)
* Changes for ECP-344: Implement selection vector I/O with collective chunk filling.
Also fix a bug in H5FD__mpio_write_vector() to account for fixed size optimization
when computing max address.

* Fixes based on PR review comments:
For H5Dchunk.c: fix H5MM_xfree()
For H5FDmpio.c:
1) Revert the fix to H5FD__mpio_write_vector()
2) Apply the patch from Neil on the proper length of s_sizes reported by H5FD__mpio_vector_build_types()

* Put back the logic of dividing up the work among all the MPI ranks, similar to the
original H5D__chunk_collective_fill() routine.

* Add a test to verify the fix for the illegal reference problem in H5FD__mpio_write_vector().
2023-11-16 10:12:45 -06:00

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright by The HDF Group. *
* All rights reserved. *
* *
* This file is part of HDF5. The full HDF5 copyright notice, including *
* terms governing use, modification, and redistribution, is contained in *
* the COPYING file, which can be found at the root of the source code *
* distribution tree, or in https://www.hdfgroup.org/licenses. *
* If you do not have access to either file, you may request a copy from *
* help@hdfgroup.org. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*
* Purpose: This is the MPI I/O driver.
*/
#include "H5FDdrvr_module.h" /* This source code file is part of the H5FD driver module */
#include "H5private.h" /* Generic Functions */
#include "H5CXprivate.h" /* API Contexts */
#include "H5Dprivate.h" /* Dataset functions */
#include "H5Eprivate.h" /* Error handling */
#include "H5Fprivate.h" /* File access */
#include "H5FDprivate.h" /* File drivers */
#include "H5FDmpi.h" /* MPI-based file drivers */
#include "H5Iprivate.h" /* IDs */
#include "H5MMprivate.h" /* Memory management */
#include "H5Pprivate.h" /* Property lists */
#ifdef H5_HAVE_PARALLEL
/*
* The driver identification number, initialized at runtime if H5_HAVE_PARALLEL
* is defined. This allows applications to still have the H5FD_MPIO
* "constants" in their source code.
*/
static hid_t H5FD_MPIO_g = 0;
/* Whether to allow collective I/O operations */
/* (Can be changed by setting "HDF5_MPI_OPT_TYPES" environment variable to '0' or '1') */
hbool_t H5FD_mpi_opt_types_g = true;
/* Whether the driver initialized MPI on its own */
static bool H5FD_mpi_self_initialized = false;
/*
* The view is set to this value
*/
static char H5FD_mpi_native_g[] = "native";
/*
* The description of a file belonging to this driver.
* The EOF value is only used just after the file is opened in order for the
* library to determine whether the file is empty, truncated, or okay. The MPIO
* driver doesn't bother to keep it updated since it's an expensive operation.
*/
typedef struct H5FD_mpio_t {
H5FD_t pub; /* Public stuff, must be first */
MPI_File f; /* MPIO file handle */
MPI_Comm comm; /* MPI Communicator */
MPI_Info info; /* MPI info object */
int mpi_rank; /* This process's rank */
int mpi_size; /* Total number of processes */
haddr_t eof; /* End-of-file marker */
haddr_t eoa; /* End-of-address marker */
haddr_t last_eoa; /* Last known end-of-address marker */
haddr_t local_eof; /* Local end-of-file address for each process */
bool mpi_file_sync_required; /* Whether the ROMIO driver requires MPI_File_sync after write */
} H5FD_mpio_t;
/* Private Prototypes */
/* Callbacks */
static herr_t H5FD__mpio_term(void);
static H5FD_t *H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t maxaddr);
static herr_t H5FD__mpio_close(H5FD_t *_file);
static herr_t H5FD__mpio_query(const H5FD_t *_f1, unsigned long *flags);
static haddr_t H5FD__mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t type);
static herr_t H5FD__mpio_set_eoa(H5FD_t *_file, H5FD_mem_t type, haddr_t addr);
static haddr_t H5FD__mpio_get_eof(const H5FD_t *_file, H5FD_mem_t type);
static herr_t H5FD__mpio_get_handle(H5FD_t *_file, hid_t fapl, void **file_handle);
static herr_t H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
void *buf);
static herr_t H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t dxpl_id, haddr_t addr, size_t size,
const void *buf);
static herr_t H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count,
H5FD_mem_t types[], haddr_t addrs[], size_t sizes[], void *bufs[]);
static herr_t H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count,
H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
const void *bufs[]);
static herr_t H5FD__mpio_read_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id,
size_t count, hid_t mem_space_ids[], hid_t file_space_ids[],
haddr_t offsets[], size_t element_sizes[], void *bufs[]);
static herr_t H5FD__mpio_write_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id,
size_t count, hid_t mem_space_ids[], hid_t file_space_ids[],
haddr_t offsets[], size_t element_sizes[], const void *bufs[]);
static herr_t H5FD__mpio_flush(H5FD_t *_file, hid_t dxpl_id, bool closing);
static herr_t H5FD__mpio_truncate(H5FD_t *_file, hid_t dxpl_id, bool closing);
static herr_t H5FD__mpio_delete(const char *filename, hid_t fapl_id);
static herr_t H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void *input,
void **output);
/* Other functions */
static herr_t H5FD__mpio_vector_build_types(uint32_t count, H5FD_mem_t types[], haddr_t addrs[],
size_t sizes[], H5_flexible_const_ptr_t bufs[],
haddr_t *s_addrs[], size_t *s_sizes[], uint32_t *s_sizes_len,
H5_flexible_const_ptr_t *s_bufs[], bool *vector_was_sorted,
MPI_Offset *mpi_off, H5_flexible_const_ptr_t *mpi_bufs_base,
int *size_i, MPI_Datatype *buf_type, bool *buf_type_created,
MPI_Datatype *file_type, bool *file_type_created, char *unused);
static herr_t H5FD__selection_build_types(bool io_op_write, size_t num_pieces, H5_flexible_const_ptr_t mbb,
H5S_t **file_spaces, H5S_t **mem_spaces, haddr_t offsets[],
H5_flexible_const_ptr_t bufs[], size_t src_element_sizes[],
size_t dst_element_sizes[], MPI_Datatype *final_ftype,
bool *final_ftype_is_derived, MPI_Datatype *final_mtype,
bool *final_mtype_is_derived);
/* The MPIO file driver information */
static const H5FD_class_t H5FD_mpio_g = {
H5FD_CLASS_VERSION, /* struct version */
H5_VFD_MPIO, /* value */
"mpio", /* name */
HADDR_MAX, /* maxaddr */
H5F_CLOSE_SEMI, /* fc_degree */
H5FD__mpio_term, /* terminate */
NULL, /* sb_size */
NULL, /* sb_encode */
NULL, /* sb_decode */
0, /* fapl_size */
NULL, /* fapl_get */
NULL, /* fapl_copy */
NULL, /* fapl_free */
0, /* dxpl_size */
NULL, /* dxpl_copy */
NULL, /* dxpl_free */
H5FD__mpio_open, /* open */
H5FD__mpio_close, /* close */
NULL, /* cmp */
H5FD__mpio_query, /* query */
NULL, /* get_type_map */
NULL, /* alloc */
NULL, /* free */
H5FD__mpio_get_eoa, /* get_eoa */
H5FD__mpio_set_eoa, /* set_eoa */
H5FD__mpio_get_eof, /* get_eof */
H5FD__mpio_get_handle, /* get_handle */
H5FD__mpio_read, /* read */
H5FD__mpio_write, /* write */
H5FD__mpio_read_vector, /* read_vector */
H5FD__mpio_write_vector, /* write_vector */
H5FD__mpio_read_selection, /* read_selection */
H5FD__mpio_write_selection, /* write_selection */
H5FD__mpio_flush, /* flush */
H5FD__mpio_truncate, /* truncate */
NULL, /* lock */
NULL, /* unlock */
H5FD__mpio_delete, /* del */
H5FD__mpio_ctl, /* ctl */
H5FD_FLMAP_DICHOTOMY /* fl_map */
};
#ifdef H5FDmpio_DEBUG
/* Flags to control debug actions in the MPI-IO VFD.
* (Meant to be indexed by characters)
*
* These flags can be set with either (or both) the environment variable
* "H5FD_mpio_Debug" set to a string containing one or more characters
* (flags) or by setting them as a string value for the
* "H5F_mpio_debug_key" MPI Info key.
*
* Supported characters in 'H5FD_mpio_Debug' string:
* 't' trace function entry and exit
* 'r' show read offset and size
* 'w' show write offset and size
* '0'-'9' only show output from a single MPI rank (ranks 0-9 supported)
*/
static int H5FD_mpio_debug_flags_s[256];
static int H5FD_mpio_debug_rank_s = -1;
/* Indicate if this rank should output tracing info */
#define H5FD_MPIO_TRACE_THIS_RANK(file) \
(H5FD_mpio_debug_rank_s < 0 || H5FD_mpio_debug_rank_s == (file)->mpi_rank)
#endif
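/* Example (an illustrative sketch, not part of the driver): the same debug
 * flags can also be supplied through an MPI Info object at file open, via the
 * "H5F_mpio_debug_key" Info key described above. The flag string "trw" and
 * the surrounding setup are placeholders.
 *
 *     MPI_Info info;
 *     MPI_Info_create(&info);
 *     MPI_Info_set(info, "H5F_mpio_debug_key", "trw");
 *     ... pass 'info' to H5Pset_fapl_mpio() when building the FAPL ...
 *     MPI_Info_free(&info);
 */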
#ifdef H5FDmpio_DEBUG
/*---------------------------------------------------------------------------
* Function: H5FD__mpio_parse_debug_str
*
* Purpose: Parse a string for debugging flags
*
* Returns: N/A
*
*---------------------------------------------------------------------------
*/
static void
H5FD__mpio_parse_debug_str(const char *s)
{
FUNC_ENTER_PACKAGE_NOERR
/* Sanity check */
assert(s);
/* Set debug mask */
while (*s) {
if ((int)(*s) >= (int)'0' && (int)(*s) <= (int)'9')
H5FD_mpio_debug_rank_s = ((int)*s) - (int)'0';
else
H5FD_mpio_debug_flags_s[(int)*s]++;
s++;
} /* end while */
FUNC_LEAVE_NOAPI_VOID
} /* end H5FD__mpio_parse_debug_str() */
/*---------------------------------------------------------------------------
* Function: H5FD__mem_t_to_str
*
* Purpose: Returns a string representing the enum value in an H5FD_mem_t
* enum
*
* Returns: H5FD_mem_t enum value string
*
*---------------------------------------------------------------------------
*/
static const char *
H5FD__mem_t_to_str(H5FD_mem_t mem_type)
{
switch (mem_type) {
case H5FD_MEM_NOLIST:
return "H5FD_MEM_NOLIST";
case H5FD_MEM_DEFAULT:
return "H5FD_MEM_DEFAULT";
case H5FD_MEM_SUPER:
return "H5FD_MEM_SUPER";
case H5FD_MEM_BTREE:
return "H5FD_MEM_BTREE";
case H5FD_MEM_DRAW:
return "H5FD_MEM_DRAW";
case H5FD_MEM_GHEAP:
return "H5FD_MEM_GHEAP";
case H5FD_MEM_LHEAP:
return "H5FD_MEM_LHEAP";
case H5FD_MEM_OHDR:
return "H5FD_MEM_OHDR";
case H5FD_MEM_NTYPES:
return "H5FD_MEM_NTYPES";
default:
return "(Unknown)";
}
}
#endif /* H5FDmpio_DEBUG */
/*-------------------------------------------------------------------------
* Function: H5FD_mpio_init
*
* Purpose: Initialize this driver by registering the driver with the
* library.
*
* Return: Success: The driver ID for the mpio driver
* Failure: H5I_INVALID_HID
*
*-------------------------------------------------------------------------
*/
hid_t
H5FD_mpio_init(void)
{
static int H5FD_mpio_Debug_inited = 0;
char *env = NULL;
hid_t ret_value = H5I_INVALID_HID; /* Return value */
FUNC_ENTER_NOAPI(H5I_INVALID_HID)
/* Register the MPI-IO VFD, if it isn't already */
if (H5I_VFL != H5I_get_type(H5FD_MPIO_g)) {
H5FD_MPIO_g = H5FD_register((const H5FD_class_t *)&H5FD_mpio_g, sizeof(H5FD_class_t), false);
/* Check if MPI driver has been loaded dynamically */
env = getenv(HDF5_DRIVER);
if (env && !strcmp(env, "mpio")) {
int mpi_initialized = 0;
/* Initialize MPI if not already initialized */
if (MPI_SUCCESS != MPI_Initialized(&mpi_initialized))
HGOTO_ERROR(H5E_VFL, H5E_UNINITIALIZED, H5I_INVALID_HID, "can't check if MPI is initialized");
if (!mpi_initialized) {
if (MPI_SUCCESS != MPI_Init(NULL, NULL))
HGOTO_ERROR(H5E_VFL, H5E_CANTINIT, H5I_INVALID_HID, "can't initialize MPI");
H5FD_mpi_self_initialized = true;
}
}
}
if (!H5FD_mpio_Debug_inited) {
const char *s; /* String for environment variables */
/* Allow MPI buf-and-file-type optimizations? */
s = getenv("HDF5_MPI_OPT_TYPES");
if (s && isdigit(*s))
H5FD_mpi_opt_types_g = (0 == strtol(s, NULL, 0)) ? false : true;
#ifdef H5FDmpio_DEBUG
/* Clear the flag buffer */
memset(H5FD_mpio_debug_flags_s, 0, sizeof(H5FD_mpio_debug_flags_s));
/* Retrieve MPI-IO debugging environment variable */
s = getenv("H5FD_mpio_Debug");
if (s)
H5FD__mpio_parse_debug_str(s);
#endif /* H5FDmpio_DEBUG */
H5FD_mpio_Debug_inited++;
} /* end if */
/* Set return value */
ret_value = H5FD_MPIO_g;
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpio_init() */
/*---------------------------------------------------------------------------
* Function: H5FD__mpio_term
*
* Purpose: Shut down the VFD
*
* Returns: Non-negative on success or negative on failure
*
*---------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_term(void)
{
FUNC_ENTER_PACKAGE_NOERR
/* Terminate MPI if the driver initialized it */
if (H5FD_mpi_self_initialized) {
int mpi_finalized = 0;
MPI_Finalized(&mpi_finalized);
if (!mpi_finalized)
MPI_Finalize();
H5FD_mpi_self_initialized = false;
}
/* Reset VFL ID */
H5FD_MPIO_g = 0;
FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5FD__mpio_term() */
/*-------------------------------------------------------------------------
* Function: H5Pset_fapl_mpio
*
* Purpose: Store the user supplied MPIO communicator comm and info in
* the file access property list FAPL_ID which can then be used
* to create and/or open the file. This function is available
* only in the parallel HDF5 library and is not collective.
*
* comm is the MPI communicator to be used for file open as
* defined in MPI_FILE_OPEN of MPI. This function makes a
* duplicate of comm. Any modification to comm after this function
* call returns has no effect on the access property list.
*
* info is the MPI Info object to be used for file open as
* defined in MPI_FILE_OPEN of MPI. This function makes a
* duplicate of info. Any modification to info after this
* function call returns has no effect on the access property
* list.
*
* If fapl_id has previously set comm and info values, they
* will be replaced and the old communicator and Info object
* are freed.
*
* Return: Success: Non-negative
* Failure: Negative
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pset_fapl_mpio(hid_t fapl_id, MPI_Comm comm, MPI_Info info)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value;
FUNC_ENTER_API(FAIL)
H5TRACE3("e", "iMcMi", fapl_id, comm, info);
/* Check arguments */
if (fapl_id == H5P_DEFAULT)
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list");
if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list");
if (MPI_COMM_NULL == comm)
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "MPI_COMM_NULL is not a valid communicator");
/* Set the MPI communicator and info object */
if (H5P_set(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI communicator");
if (H5P_set(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI info object");
/* duplication is done during driver setting. */
ret_value = H5P_set_driver(plist, H5FD_MPIO, NULL, NULL);
done:
FUNC_LEAVE_API(ret_value)
} /* H5Pset_fapl_mpio() */
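/*
 * Example usage (an illustrative sketch, not part of this file): creating a
 * file with the MPI-IO driver. The file name is a placeholder and error
 * checking is elided.
 *
 *     hid_t fapl_id = H5Pcreate(H5P_FILE_ACCESS);
 *     H5Pset_fapl_mpio(fapl_id, MPI_COMM_WORLD, MPI_INFO_NULL);
 *     hid_t file_id = H5Fcreate("example.h5", H5F_ACC_TRUNC, H5P_DEFAULT, fapl_id);
 *     ...
 *     H5Fclose(file_id);
 *     H5Pclose(fapl_id);
 */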
/*-------------------------------------------------------------------------
* Function: H5Pget_fapl_mpio
*
* Purpose: If the file access property list is set to the H5FD_MPIO
* driver then this function returns duplicates of the MPI
* communicator and Info object stored through the comm and
* info pointers. It is the responsibility of the application
* to free the returned communicator and Info object.
*
* Return: Success: Non-negative with the communicator and
* Info object returned through the comm and
* info arguments if non-null. Since they are
* duplicates of the stored objects, future
* modifications to the access property list do
* not affect them and it is the responsibility
* of the application to free them.
* Failure: Negative
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pget_fapl_mpio(hid_t fapl_id, MPI_Comm *comm /*out*/, MPI_Info *info /*out*/)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API(FAIL)
H5TRACE3("e", "ixx", fapl_id, comm, info);
/* Set comm and info in case we have problems */
if (comm)
*comm = MPI_COMM_NULL;
if (info)
*info = MPI_INFO_NULL;
/* Check arguments */
if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a file access list");
if (H5FD_MPIO != H5P_peek_driver(plist))
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "VFL driver is not MPI-I/O");
/* Get the MPI communicator and info object */
if (comm)
if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, comm) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI communicator");
if (info)
if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, info) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "can't get MPI info object");
done:
/* Clean up anything duplicated on errors. The free calls will set
* the output values to MPI_COMM|INFO_NULL.
*/
if (ret_value != SUCCEED) {
if (comm)
if (H5_mpi_comm_free(comm) < 0)
HDONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "unable to free MPI communicator");
if (info)
if (H5_mpi_info_free(info) < 0)
HDONE_ERROR(H5E_PLIST, H5E_CANTFREE, FAIL, "unable to free MPI info object");
}
FUNC_LEAVE_API(ret_value)
} /* end H5Pget_fapl_mpio() */
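/*
 * Example usage (an illustrative sketch): retrieving the duplicated
 * communicator and Info object and freeing them afterwards, as the comment
 * above requires. Error checking is elided.
 *
 *     MPI_Comm comm = MPI_COMM_NULL;
 *     MPI_Info info = MPI_INFO_NULL;
 *     H5Pget_fapl_mpio(fapl_id, &comm, &info);
 *     ...
 *     if (MPI_COMM_NULL != comm)
 *         MPI_Comm_free(&comm);
 *     if (MPI_INFO_NULL != info)
 *         MPI_Info_free(&info);
 */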
/*-------------------------------------------------------------------------
* Function: H5Pset_dxpl_mpio
*
* Purpose: Set the data transfer property list DXPL_ID to use transfer
* mode XFER_MODE. The property list can then be used to control
* the I/O transfer mode during data I/O operations. The valid
* transfer modes are:
*
* H5FD_MPIO_INDEPENDENT:
* Use independent I/O access (the default).
*
* H5FD_MPIO_COLLECTIVE:
* Use collective I/O access.
*
* Return: Success: Non-negative
* Failure: Negative
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pset_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t xfer_mode)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API(FAIL)
H5TRACE2("e", "iDt", dxpl_id, xfer_mode);
/* Check arguments */
if (dxpl_id == H5P_DEFAULT)
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list");
if (NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl");
if (H5FD_MPIO_INDEPENDENT != xfer_mode && H5FD_MPIO_COLLECTIVE != xfer_mode)
HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "incorrect xfer_mode");
/* Set the transfer mode */
if (H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value");
done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pset_dxpl_mpio() */
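/*
 * Example usage (an illustrative sketch): requesting collective transfer for
 * a dataset write. 'dset_id', 'mem_space', 'file_space', and 'buf' are
 * assumed to exist; error checking is elided.
 *
 *     hid_t dxpl_id = H5Pcreate(H5P_DATASET_XFER);
 *     H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE);
 *     H5Dwrite(dset_id, H5T_NATIVE_INT, mem_space, file_space, dxpl_id, buf);
 *     H5Pclose(dxpl_id);
 */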
/*-------------------------------------------------------------------------
* Function: H5Pget_dxpl_mpio
*
* Purpose: Queries the transfer mode currently set in the data transfer
* property list DXPL_ID. This is not collective.
*
* Return: Success: Non-negative, with the transfer mode returned
* through the XFER_MODE argument if it is
* non-null.
* Failure: Negative
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pget_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t *xfer_mode /*out*/)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API(FAIL)
H5TRACE2("e", "ix", dxpl_id, xfer_mode);
/* Check arguments */
if (NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl");
/* Get the transfer mode */
if (xfer_mode)
if (H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, xfer_mode) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTGET, FAIL, "unable to get value");
done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pget_dxpl_mpio() */
/*-------------------------------------------------------------------------
* Function: H5Pset_dxpl_mpio_collective_opt
*
* Purpose: Set the data transfer property list DXPL_ID to use transfer
* mode OPT_MODE during I/O. This allows the application to
* specify collective I/O at the HDF5 interface level (with
* the H5Pset_dxpl_mpio routine), while controlling whether
* the actual I/O is performed collectively (e.g., via
* MPI_File_write_at_all) or independently (e.g., via
* MPI_File_write_at).
*
* Return: Success: Non-negative
* Failure: Negative
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pset_dxpl_mpio_collective_opt(hid_t dxpl_id, H5FD_mpio_collective_opt_t opt_mode)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API(FAIL)
H5TRACE2("e", "iDc", dxpl_id, opt_mode);
/* Check arguments */
if (dxpl_id == H5P_DEFAULT)
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list");
if (NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl");
/* Set the transfer mode */
if (H5P_set(plist, H5D_XFER_MPIO_COLLECTIVE_OPT_NAME, &opt_mode) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value");
done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pset_dxpl_mpio_collective_opt() */
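/*
 * Example usage (an illustrative sketch): keeping collective semantics at
 * the HDF5 interface level while forcing the low-level MPI-IO calls to be
 * independent, as described above. 'dxpl_id' is assumed to be a dataset
 * transfer property list; error checking is elided.
 *
 *     H5Pset_dxpl_mpio(dxpl_id, H5FD_MPIO_COLLECTIVE);
 *     H5Pset_dxpl_mpio_collective_opt(dxpl_id, H5FD_MPIO_INDIVIDUAL_IO);
 */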
/*-------------------------------------------------------------------------
* Function: H5Pset_dxpl_mpio_chunk_opt
*
* Purpose: To set a flag to choose linked chunk I/O or multi-chunk I/O
* without involving decision-making inside HDF5
*
* Note: The library will do linked chunk I/O or multi-chunk I/O without
* communicating among processes to make the decision.
* The only case in which the library will not behave as requested
* is when it finds that the low-level MPI-IO package doesn't
* support the requested mode.
*
* Return: Success: Non-negative
* Failure: Negative
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id, H5FD_mpio_chunk_opt_t opt_mode)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API(FAIL)
H5TRACE2("e", "iDh", dxpl_id, opt_mode);
/* Check arguments */
if (dxpl_id == H5P_DEFAULT)
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list");
if (NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl");
/* Set the transfer mode */
if (H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME, &opt_mode) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value");
done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pset_dxpl_mpio_chunk_opt() */
/*-------------------------------------------------------------------------
* Function: H5Pset_dxpl_mpio_chunk_opt_num
*
* Purpose: To set a threshold for doing linked chunk I/O
*
* Note: If the number of chunks per process is greater than the
* threshold set by the user, the library will do linked chunk I/O;
* otherwise, I/O will be done for every chunk.
*
* Return: Success: Non-negative
* Failure: Negative
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id, unsigned num_chunk_per_proc)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API(FAIL)
H5TRACE2("e", "iIu", dxpl_id, num_chunk_per_proc);
/* Check arguments */
if (dxpl_id == H5P_DEFAULT)
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list");
if (NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl");
/* Set the transfer mode */
if (H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME, &num_chunk_per_proc) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value");
done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pset_dxpl_mpio_chunk_opt_num() */
/*-------------------------------------------------------------------------
* Function: H5Pset_dxpl_mpio_chunk_opt_ratio
*
* Purpose: To set a threshold for doing collective I/O for each chunk
*
* Note: The library will calculate the percentage of processes
* holding a selection in each chunk. If that percentage for an
* individual chunk is greater than the threshold set by the user,
* the library will do collective chunk I/O for this chunk;
* otherwise, independent I/O will be done for this chunk.
*
* Return: Success: Non-negative
* Failure: Negative
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id, unsigned percent_num_proc_per_chunk)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_API(FAIL)
H5TRACE2("e", "iIu", dxpl_id, percent_num_proc_per_chunk);
/* Check arguments */
if (dxpl_id == H5P_DEFAULT)
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list");
if (NULL == (plist = H5P_object_verify(dxpl_id, H5P_DATASET_XFER)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl");
/* Set the transfer mode */
if (H5P_set(plist, H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME, &percent_num_proc_per_chunk) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value");
done:
FUNC_LEAVE_API(ret_value)
} /* end H5Pset_dxpl_mpio_chunk_opt_ratio() */
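/*
 * Example (an illustrative sketch): steering the collective chunk I/O
 * decision with the three chunk-opt properties above. The numeric
 * thresholds are arbitrary placeholders and 'dxpl_id' is assumed to be a
 * dataset transfer property list; error checking is elided.
 *
 *     Force linked-chunk I/O unconditionally:
 *     H5Pset_dxpl_mpio_chunk_opt(dxpl_id, H5FD_MPIO_CHUNK_ONE_IO);
 *
 *     Or let the thresholds decide:
 *     H5Pset_dxpl_mpio_chunk_opt_num(dxpl_id, 30);
 *     H5Pset_dxpl_mpio_chunk_opt_ratio(dxpl_id, 60);
 */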
/*-------------------------------------------------------------------------
* Function: H5FD_set_mpio_atomicity
*
* Purpose: Sets the atomicity mode
*
* Return: SUCCEED/FAIL
*
*-------------------------------------------------------------------------
*/
herr_t
H5FD_set_mpio_atomicity(H5FD_t *_file, bool flag)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI_NOINIT
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* set atomicity value */
if (MPI_SUCCESS != (mpi_code = MPI_File_set_atomicity(file->f, (int)(flag != false))))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_atomicity", mpi_code)
done:
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_set_mpio_atomicity() */
/*-------------------------------------------------------------------------
* Function: H5FD_get_mpio_atomicity
*
* Purpose: Returns the atomicity mode
*
* Return: SUCCEED/FAIL
*
*-------------------------------------------------------------------------
*/
herr_t
H5FD_get_mpio_atomicity(H5FD_t *_file, bool *flag)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
int temp_flag;
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI_NOINIT
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Get atomicity value */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_atomicity(file->f, &temp_flag)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_get_atomicity", mpi_code)
if (0 != temp_flag)
*flag = true;
else
*flag = false;
done:
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_get_mpio_atomicity() */
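/*
 * Example (an illustrative sketch): the two routines above back the public
 * H5Fset_mpi_atomicity()/H5Fget_mpi_atomicity() API. 'file_id' is assumed
 * to be a file opened with the MPI-IO driver; error checking is elided.
 *
 *     hbool_t atomic = false;
 *     H5Fset_mpi_atomicity(file_id, true);
 *     H5Fget_mpi_atomicity(file_id, &atomic);
 */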
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_open
*
* Purpose: Opens a file with name NAME. The FLAGS are a bit field with
* purpose similar to the second argument of open(2) and which
* are defined in H5Fpublic.h. The file access property list
* FAPL_ID contains the driver properties and MAXADDR
* is the largest address which this file will be expected to
* access. This is collective.
*
* Return: Success: A new file pointer
* Failure: NULL
*
*-------------------------------------------------------------------------
*/
static H5FD_t *
H5FD__mpio_open(const char *name, unsigned flags, hid_t fapl_id, haddr_t H5_ATTR_UNUSED maxaddr)
{
H5FD_mpio_t *file = NULL; /* VFD File struct for new file */
H5P_genplist_t *plist; /* Property list pointer */
MPI_Comm comm = MPI_COMM_NULL; /* MPI Communicator, from plist */
MPI_Info info = MPI_INFO_NULL; /* MPI Info, from plist */
MPI_Info info_used; /* MPI Info returned from MPI_File_open */
MPI_File fh; /* MPI file handle */
bool file_opened = false; /* Flag to indicate that the file was successfully opened */
int mpi_amode; /* MPI file access flags */
int mpi_rank = INT_MAX; /* MPI rank of this process */
int mpi_size; /* Total number of MPI processes */
MPI_Offset file_size; /* File size (of existing files) */
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = false;
#endif
int mpi_code; /* MPI return code */
H5FD_t *ret_value = NULL; /* Return value */
FUNC_ENTER_PACKAGE
/* Get a pointer to the fapl */
if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, NULL, "not a file access property list");
if (H5FD_mpi_self_initialized) {
comm = MPI_COMM_WORLD;
}
else {
/* Get the MPI communicator and info object from the property list */
if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI communicator");
if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "can't get MPI info object");
}
/* Get the MPI rank of this process and the total number of processes */
if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
HMPI_GOTO_ERROR(NULL, "MPI_Comm_rank failed", mpi_code)
if (MPI_SUCCESS != (mpi_code = MPI_Comm_size(comm, &mpi_size)))
HMPI_GOTO_ERROR(NULL, "MPI_Comm_size failed", mpi_code)
#ifdef H5FDmpio_DEBUG
H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] &&
(H5FD_mpio_debug_rank_s < 0 || H5FD_mpio_debug_rank_s == mpi_rank));
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering - name = \"%s\", flags = 0x%x, fapl_id = %d, maxaddr = %lu\n",
__func__, mpi_rank, name, flags, (int)fapl_id, (unsigned long)maxaddr);
#endif
/* Convert HDF5 flags to MPI-IO flags */
/* Some combinations are illegal; let MPI-IO figure it out */
mpi_amode = (flags & H5F_ACC_RDWR) ? MPI_MODE_RDWR : MPI_MODE_RDONLY;
if (flags & H5F_ACC_CREAT)
mpi_amode |= MPI_MODE_CREATE;
if (flags & H5F_ACC_EXCL)
mpi_amode |= MPI_MODE_EXCL;
#ifdef H5FDmpio_DEBUG
/* Check for debug commands in the info parameter */
if (MPI_INFO_NULL != info) {
char debug_str[128];
int flag;
MPI_Info_get(info, H5F_MPIO_DEBUG_KEY, sizeof(debug_str) - 1, debug_str, &flag);
if (flag)
H5FD__mpio_parse_debug_str(debug_str);
} /* end if */
#endif
if (MPI_SUCCESS != (mpi_code = MPI_File_open(comm, name, mpi_amode, info, &fh)))
HMPI_GOTO_ERROR(NULL, "MPI_File_open failed", mpi_code)
file_opened = true;
/* Get the MPI-IO hints that are actually used by the underlying MPI-IO layer. */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_info(fh, &info_used)))
HMPI_GOTO_ERROR(NULL, "MPI_File_get_info failed", mpi_code)
/* Copy hints in info_used into info. Note that hints in info_used
* supersede info. There may be some hints set and used by HDF5 only,
* but not recognizable by MPI-IO. We need to keep them, as
* MPI_File_get_info() will remove any hints unrecognized by the
* underlying MPI-IO library.
*/
if (info_used != MPI_INFO_NULL) {
int i, nkeys;
if (info == MPI_INFO_NULL) /* reuse info created from MPI_File_get_info() */
info = info_used;
else {
/* retrieve the number of hints */
if (MPI_SUCCESS != (mpi_code = MPI_Info_get_nkeys(info_used, &nkeys)))
HMPI_GOTO_ERROR(NULL, "MPI_Info_get_nkeys failed", mpi_code)
/* copy over each hint */
for (i = 0; i < nkeys; i++) {
char key[MPI_MAX_INFO_KEY], value[MPI_MAX_INFO_VAL + 1];
int valuelen, flag;
/* retrieve the key of the nth hint */
if (MPI_SUCCESS != (mpi_code = MPI_Info_get_nthkey(info_used, i, key)))
HMPI_GOTO_ERROR(NULL, "MPI_Info_get_nthkey failed", mpi_code)
/* retrieve the length of the value of the nth hint */
if (MPI_SUCCESS != (mpi_code = MPI_Info_get_valuelen(info_used, key, &valuelen, &flag)))
HMPI_GOTO_ERROR(NULL, "MPI_Info_get_valuelen failed", mpi_code)
/* retrieve the value of nth hint */
if (MPI_SUCCESS != (mpi_code = MPI_Info_get(info_used, key, valuelen, value, &flag)))
HMPI_GOTO_ERROR(NULL, "MPI_Info_get failed", mpi_code)
/* copy the hint into info */
if (MPI_SUCCESS != (mpi_code = MPI_Info_set(info, key, value)))
HMPI_GOTO_ERROR(NULL, "MPI_Info_set failed", mpi_code)
}
/* Free info_used allocated in the call to MPI_File_get_info() */
if (MPI_SUCCESS != (mpi_code = MPI_Info_free(&info_used)))
HMPI_GOTO_ERROR(NULL, "MPI_Info_free failed", mpi_code)
}
/* Add info to the file access property list */
if (H5P_set(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTSET, NULL, "can't set MPI info object");
}
/* Build the return value and initialize it */
if (NULL == (file = (H5FD_mpio_t *)H5MM_calloc(sizeof(H5FD_mpio_t))))
HGOTO_ERROR(H5E_RESOURCE, H5E_NOSPACE, NULL, "memory allocation failed");
file->f = fh;
file->comm = comm;
file->info = info;
file->mpi_rank = mpi_rank;
file->mpi_size = mpi_size;
/* Retrieve the flag indicating whether MPI_File_sync is needed after each write */
if (H5_mpio_get_file_sync_required(fh, &file->mpi_file_sync_required) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, NULL, "unable to get mpi_file_sync_required hint");
/* Only rank 0 will get the file size and broadcast it. */
if (mpi_rank == 0) {
/* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(fh, &file_size)))
file_size = (MPI_Offset)-1;
}
/* Broadcast file size */
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&file_size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, comm)))
HMPI_GOTO_ERROR(NULL, "MPI_Bcast failed", mpi_code)
if (file_size < 0)
HMPI_GOTO_ERROR(NULL, "MPI_File_get_size failed", mpi_code)
/* Determine if the file should be truncated */
if (file_size && (flags & H5F_ACC_TRUNC)) {
/* Truncate the file */
if (MPI_SUCCESS != (mpi_code = MPI_File_set_size(fh, (MPI_Offset)0)))
HMPI_GOTO_ERROR(NULL, "MPI_File_set_size failed", mpi_code)
/* Don't let any proc return until all have truncated the file. */
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
HMPI_GOTO_ERROR(NULL, "MPI_Barrier failed", mpi_code)
/* File is zero size now */
file_size = 0;
} /* end if */
/* Set the size of the file (from library's perspective) */
file->eof = H5FD_mpi_MPIOff_to_haddr(file_size);
file->local_eof = file->eof;
/* Set return value */
ret_value = (H5FD_t *)file;
done:
if (ret_value == NULL) {
if (file_opened)
MPI_File_close(&fh);
if (H5_mpi_comm_free(&comm) < 0)
HDONE_ERROR(H5E_VFL, H5E_CANTFREE, NULL, "unable to free MPI communicator");
if (H5_mpi_info_free(&info) < 0)
HDONE_ERROR(H5E_VFL, H5E_CANTFREE, NULL, "unable to free MPI info object");
if (file)
H5MM_xfree(file);
} /* end if */
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving\n", __func__, mpi_rank);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_open() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_close
*
* Purpose: Closes a file. This is collective.
*
* Return: SUCCEED/FAIL
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_close(H5FD_t *_file)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
int mpi_rank = file->mpi_rank;
#endif
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_PACKAGE
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
/* MPI_File_close sets argument to MPI_FILE_NULL */
if (MPI_SUCCESS != (mpi_code = MPI_File_close(&(file->f))))
HMPI_GOTO_ERROR(FAIL, "MPI_File_close failed", mpi_code)
/* Clean up other stuff */
H5_mpi_comm_free(&file->comm);
H5_mpi_info_free(&file->info);
H5MM_xfree(file);
done:
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving\n", __func__, mpi_rank);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_close() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_query
*
* Purpose: Set the flags that this VFL driver is capable of supporting.
* (listed in H5FDpublic.h)
*
* Return: SUCCEED/FAIL
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_query(const H5FD_t H5_ATTR_UNUSED *_file, unsigned long *flags /* out */)
{
FUNC_ENTER_PACKAGE_NOERR
/* Set the VFL feature flags that this driver supports */
if (flags) {
*flags = 0;
*flags |= H5FD_FEAT_AGGREGATE_METADATA; /* OK to aggregate metadata allocations */
*flags |= H5FD_FEAT_AGGREGATE_SMALLDATA; /* OK to aggregate "small" raw data allocations */
*flags |= H5FD_FEAT_HAS_MPI; /* This driver uses MPI */
*flags |= H5FD_FEAT_DEFAULT_VFD_COMPATIBLE; /* VFD creates a file which can be opened with the default
VFD */
} /* end if */
FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5FD__mpio_query() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_get_eoa
*
* Purpose: Gets the end-of-address marker for the file. The EOA marker
* is the first address past the last byte allocated in the
* format address space.
*
* Return: Success: The end-of-address marker
* Failure: HADDR_UNDEF
*
*-------------------------------------------------------------------------
*/
static haddr_t
H5FD__mpio_get_eoa(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
{
const H5FD_mpio_t *file = (const H5FD_mpio_t *)_file;
FUNC_ENTER_PACKAGE_NOERR
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
FUNC_LEAVE_NOAPI(file->eoa)
} /* end H5FD__mpio_get_eoa() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_set_eoa
*
* Purpose: Set the end-of-address marker for the file. This function is
* called shortly after an existing HDF5 file is opened in order
* to tell the driver where the end of the HDF5 data is located.
*
* Return: SUCCEED/FAIL
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_set_eoa(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, haddr_t addr)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
FUNC_ENTER_PACKAGE_NOERR
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
file->eoa = addr;
FUNC_LEAVE_NOAPI(SUCCEED)
} /* end H5FD__mpio_set_eoa() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_get_eof
*
* Purpose: Gets the end-of-file marker for the file. The EOF marker
* is the real size of the file.
*
* The MPIO driver doesn't bother keeping this field updated
* since that's a relatively expensive operation. Fortunately
* the library only needs the EOF just after the file is opened
* in order to determine whether the file is empty, truncated,
* or okay. Therefore, any MPIO I/O function will set its value
* to HADDR_UNDEF which is the error return value of this
* function.
*
* Keeping the EOF updated (during write calls) is expensive
* because any process may extend the physical end of the
* file. -QAK
*
* Return: Success: The end-of-file marker
* Failure: HADDR_UNDEF
*
*-------------------------------------------------------------------------
*/
static haddr_t
H5FD__mpio_get_eof(const H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type)
{
const H5FD_mpio_t *file = (const H5FD_mpio_t *)_file;
FUNC_ENTER_PACKAGE_NOERR
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
FUNC_LEAVE_NOAPI(file->eof)
} /* end H5FD__mpio_get_eof() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_get_handle
*
* Purpose: Returns the file handle of MPIO file driver.
*
* Returns: SUCCEED/FAIL
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_get_handle(H5FD_t *_file, hid_t H5_ATTR_UNUSED fapl, void **file_handle)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
if (!file_handle)
HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "file handle not valid");
*file_handle = &(file->f);
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_get_handle() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_read
*
* Purpose: Reads SIZE bytes of data from FILE beginning at address ADDR
* into buffer BUF according to data transfer properties in
* DXPL_ID using potentially complex file and buffer types to
* effect the transfer.
*
* Reading past the end of the MPI file returns zeros instead of
* failing. MPI is able to coalesce requests from different
* processes (collective or independent).
*
* Return: Success: SUCCEED. Result is stored in caller-supplied
* buffer BUF.
*
* Failure: FAIL. Contents of buffer BUF are undefined.
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_read(H5FD_t *_file, H5FD_mem_t H5_ATTR_UNUSED type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr,
size_t size, void *buf /*out*/)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
MPI_Offset mpi_off;
MPI_Status mpi_stat; /* Status from I/O operation */
MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
int size_i; /* Integer copy of 'size' to read */
MPI_Count bytes_read = 0; /* Number of bytes read in */
MPI_Count type_size; /* MPI datatype used for I/O's size */
MPI_Count io_size; /* Actual number of bytes requested */
MPI_Count n;
bool use_view_this_time = false;
bool derived_type = false;
bool rank0_bcast = false; /* If read-with-rank0-and-bcast flag was used */
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
bool H5FD_mpio_debug_r_flag = (H5FD_mpio_debug_flags_s[(int)'r'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
assert(buf);
/* Portably initialize MPI status variable */
memset(&mpi_stat, 0, sizeof(MPI_Status));
/* some numeric conversions */
if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off /*out*/) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off");
size_i = (int)size;
/* Only look for MPI views for raw data transfers */
if (type == H5FD_MEM_DRAW) {
H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
/* Get the transfer mode from the API context */
if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
/*
* Set up for a fancy xfer using complex types, or single byte block. We
* wouldn't need to rely on the use_view field if MPI semantics allowed
* us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which
* could mean "use MPI_BYTE" by convention).
*/
if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
MPI_Datatype file_type;
/* Remember that views are used */
use_view_this_time = true;
/* Prepare for a full-blown xfer using btype, ftype, and displacement */
if (H5CX_get_mpi_coll_datatypes(&buf_type, &file_type) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O datatypes");
/*
* Set the file view when we are using MPI derived types
*/
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
/* When using types, use the address as the displacement for
* MPI_File_set_view and reset the address for the read to zero
*/
mpi_off = 0;
} /* end if */
} /* end if */
/* Read the data. */
if (use_view_this_time) {
H5FD_mpio_collective_opt_t coll_opt_mode;
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) using MPIO collective mode\n", __func__, file->mpi_rank);
#endif
/* Get the collective_opt property to check whether the application wants to do IO individually. */
if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property");
if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) doing MPI collective IO\n", __func__, file->mpi_rank);
#endif
/* Check whether we should read from rank 0 and broadcast to other ranks */
if (H5CX_get_mpio_rank0_bcast()) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) doing read-rank0-and-MPI_Bcast\n", __func__, file->mpi_rank);
#endif
/* Indicate path we've taken */
rank0_bcast = true;
/* Read on rank 0 Bcast to other ranks */
if (file->mpi_rank == 0) {
/* If MPI_File_read_at fails, push an error, but continue
* to participate in following MPI_Bcast */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_DONE_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
}
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(buf, size_i, buf_type, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
} /* end if */
else
/* Perform collective read operation */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_read_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
} /* end if */
else {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
#endif
/* Perform independent read operation */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
} /* end else */
/*
* Reset the file view when we used MPI derived types
*/
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
} /* end if */
else {
if (size != (hsize_t)size_i) {
/* If HERE, then we need to work around the integer size limit
* of 2GB. The input size_t size variable cannot fit into an integer,
* but we can get around that limitation by creating a different datatype
* and then setting the integer size (or element count) to 1 when using
* the derived_type.
*/
if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype");
derived_type = true;
size_i = 1;
}
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
#endif
/* Perform independent read operation */
if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
} /* end else */
/* Only retrieve bytes read if this rank _actually_ participated in I/O */
if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
/* How many bytes were actually read? */
if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read))) {
if (rank0_bcast && file->mpi_rank == 0) {
/* If MPI_Get_elements(_x) fails for a rank 0 bcast strategy,
* push an error, but continue to participate in the following
* MPI_Bcast.
*/
bytes_read = -1;
HMPI_DONE_ERROR(FAIL, "MPI_Get_elements failed for rank 0", mpi_code)
}
else
HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
}
} /* end if */
/* If the rank0-bcast feature was used, broadcast the # of bytes read to
* other ranks, which didn't perform any I/O.
*/
/* NOTE: This could be optimized further to be combined with the broadcast
* of the data. (QAK - 2019/1/2)
*/
if (rank0_bcast)
if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
/* Get the type's size */
if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
/* Compute the actual number of bytes requested */
io_size = type_size * size_i;
/* Check for read failure */
if (bytes_read < 0 || bytes_read > io_size)
HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed");
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld type = %s\n", __func__, file->mpi_rank,
(long)mpi_off, (long long)bytes_read, H5FD__mem_t_to_str(type));
#endif
/*
* This gives us zeroes beyond end of physical MPI file.
*/
if ((n = (io_size - bytes_read)) > 0)
memset((char *)buf + bytes_read, 0, (size_t)n);
done:
if (derived_type)
MPI_Type_free(&buf_type);
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_read() */
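/*
 * The >2GB workaround above delegates to H5_mpio_create_large_type() to
 * describe the request with a single derived datatype. A minimal sketch of
 * that construction in plain MPI (illustrative only; the real helper
 * handles more cases): cover 'size' bytes with 'num_big' contiguous 1 GiB
 * blocks plus one remainder block, glued together by
 * MPI_Type_create_struct().
 *
 *     MPI_Datatype big_type, chunk_type, types[2];
 *     int          lens[2];
 *     MPI_Aint     disps[2];
 *     size_t       big     = (size_t)1 << 30;
 *     int          num_big = (int)(size / big);
 *     int          rem     = (int)(size % big);
 *
 *     MPI_Type_contiguous((int)big, MPI_BYTE, &chunk_type);
 *     types[0] = chunk_type; lens[0] = num_big; disps[0] = 0;
 *     types[1] = MPI_BYTE;   lens[1] = rem;     disps[1] = (MPI_Aint)num_big * (MPI_Aint)big;
 *     MPI_Type_create_struct(2, lens, disps, types, &big_type);
 *     MPI_Type_commit(&big_type);
 *     MPI_Type_free(&chunk_type);
 */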
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_write
*
* Purpose: Writes SIZE bytes of data to FILE beginning at address ADDR
* from buffer BUF according to data transfer properties in
* DXPL_ID using potentially complex file and buffer types to
* effect the transfer.
*
* MPI is able to coalesce requests from different processes
* (collective and independent).
*
* Return: Success: SUCCEED. USE_TYPES and OLD_USE_TYPES in the
* access params are altered.
* Failure: FAIL. USE_TYPES and OLD_USE_TYPES in the
* access params may be altered.
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_write(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, haddr_t addr, size_t size,
const void *buf)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
MPI_Offset mpi_off;
MPI_Status mpi_stat; /* Status from I/O operation */
MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
MPI_Count bytes_written;
MPI_Count type_size; /* MPI datatype used for I/O's size */
MPI_Count io_size; /* Actual number of bytes requested */
int size_i;
bool use_view_this_time = false;
bool derived_type = false;
H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
bool H5FD_mpio_debug_w_flag = (H5FD_mpio_debug_flags_s[(int)'w'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
assert(buf);
/* Verify that no data is written when between MPI_Barrier()s during file flush */
assert(!H5CX_get_mpi_file_flushing());
/* Portably initialize MPI status variable */
memset(&mpi_stat, 0, sizeof(MPI_Status));
/* some numeric conversions */
if (H5FD_mpi_haddr_to_MPIOff(addr, &mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off");
size_i = (int)size;
/* Get the transfer mode from the API context */
if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
/*
* Set up for a fancy xfer using complex types, or single byte block. We
* wouldn't need to rely on the use_view field if MPI semantics allowed
* us to test that btype=ftype=MPI_BYTE (or even MPI_TYPE_NULL, which
* could mean "use MPI_BYTE" by convention).
*/
if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
MPI_Datatype file_type;
/* Remember that views are used */
use_view_this_time = true;
/* Prepare for a full-blown xfer using btype, ftype, and disp */
if (H5CX_get_mpi_coll_datatypes(&buf_type, &file_type) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O datatypes");
/*
* Set the file view when we are using MPI derived types
*/
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
/* When using types, use the address as the displacement for
* MPI_File_set_view and reset the address for the read to zero
*/
mpi_off = 0;
} /* end if */
/* Write the data. */
if (use_view_this_time) {
H5FD_mpio_collective_opt_t coll_opt_mode;
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) using MPIO collective mode\n", __func__, file->mpi_rank);
#endif
/* Get the collective_opt property to check whether the application wants to do IO individually. */
if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property");
if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) doing MPI collective IO\n", __func__, file->mpi_rank);
#endif
/* Perform collective write operation */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_write_at_all(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
/* Do MPI_File_sync when needed by underlying ROMIO driver */
if (file->mpi_file_sync_required) {
if (MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code)
}
} /* end if */
else {
if (type != H5FD_MEM_DRAW)
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL,
"Metadata Coll opt property should be collective at this point");
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
#endif
/* Perform independent write operation */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_write_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
} /* end else */
/* Reset the file view when we used MPI derived types */
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
} /* end if */
else {
if (size != (hsize_t)size_i) {
/* If HERE, then we need to work around the integer size limit
* of 2GB. The input size_t size variable cannot fit into an integer,
* but we can get around that limitation by creating a different datatype
* and then setting the integer size (or element count) to 1 when using
* the derived_type.
*/
if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype");
derived_type = true;
size_i = 1;
}
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
#endif
/* Perform independent write operation */
if (MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, buf, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
} /* end else */
/* How many bytes were actually written? */
if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_written)))
HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
/* Get the type's size */
if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
/* Compute the actual number of bytes requested */
io_size = type_size * size_i;
/* Check for write failure */
if (bytes_written != io_size || bytes_written < 0)
HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed");
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld type = %s\n", __func__, file->mpi_rank,
(long)mpi_off, (long long)bytes_written, H5FD__mem_t_to_str(type));
#endif
/* Each process will keep track of its perceived EOF value locally, and
* ultimately we will reduce this value to the maximum amongst all
* processes, but until then keep the actual eof at HADDR_UNDEF just in
* case something bad happens before that point. (rather have a value
* we know is wrong sitting around rather than one that could only
* potentially be wrong.) */
file->eof = HADDR_UNDEF;
if (bytes_written && (((haddr_t)bytes_written + addr) > file->local_eof))
file->local_eof = addr + (haddr_t)bytes_written;
done:
if (derived_type)
MPI_Type_free(&buf_type);
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_write() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_vector_build_types
*
* Purpose: Build MPI datatypes and calculate offset, base buffer, and
* size for MPIO vector I/O. Spun off from common code in
* H5FD__mpio_read_vector() and H5FD__mpio_write_vector().
*
* Return: Success: SUCCEED.
* Failure: FAIL.
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_vector_build_types(uint32_t count, H5FD_mem_t types[], haddr_t addrs[], size_t sizes[],
H5_flexible_const_ptr_t bufs[], haddr_t *s_addrs[], size_t *s_sizes[],
uint32_t *s_sizes_len, H5_flexible_const_ptr_t *s_bufs[],
bool *vector_was_sorted, MPI_Offset *mpi_off,
H5_flexible_const_ptr_t *mpi_bufs_base, int *size_i, MPI_Datatype *buf_type,
bool *buf_type_created, MPI_Datatype *file_type, bool *file_type_created,
char *unused)
{
hsize_t bigio_count; /* Transition point to create derived type */
bool fixed_size = false;
size_t size;
H5FD_mem_t *s_types = NULL;
int *mpi_block_lengths = NULL;
MPI_Aint mpi_bufs_base_Aint;
MPI_Aint *mpi_bufs = NULL;
MPI_Aint *mpi_displacements = NULL;
MPI_Datatype *sub_types = NULL;
uint8_t *sub_types_created = NULL;
int i;
int j;
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
/* Sanity checks */
assert(s_sizes);
assert(s_bufs);
assert(vector_was_sorted);
assert(*vector_was_sorted);
assert(mpi_off);
assert(mpi_bufs_base);
assert(size_i);
assert(buf_type);
assert(buf_type_created);
assert(!*buf_type_created);
assert(file_type);
assert(file_type_created);
assert(!*file_type_created);
assert(unused);
    /* Get big I/O transition point (may be lower than 2G for testing) */
bigio_count = H5_mpi_get_bigio_count();
/* Start with s_sizes_len at count */
if (s_sizes_len)
*s_sizes_len = count;
if (count == 1) {
/* Single block. Just use a series of MPI_BYTEs for the file view.
*/
*size_i = (int)sizes[0];
*buf_type = MPI_BYTE;
*file_type = MPI_BYTE;
*mpi_bufs_base = bufs[0];
/* Setup s_addrs, s_sizes and s_bufs (needed for incomplete read filling code and eof
* calculation code) */
*s_addrs = addrs;
*s_sizes = sizes;
*s_bufs = bufs;
/* some numeric conversions */
if (H5FD_mpi_haddr_to_MPIOff(addrs[0], mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI offset");
/* Check for size overflow */
if (sizes[0] > bigio_count) {
/* We need to work around the integer size limit of 2GB. The input size_t size
* variable cannot fit into an integer, but we can get around that limitation by
* creating a different datatype and then setting the integer size (or element
* count) to 1 when using the derived_type. */
if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, buf_type) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype");
*buf_type_created = true;
if (H5_mpio_create_large_type(sizes[0], 0, MPI_BYTE, file_type) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype");
*file_type_created = true;
*size_i = 1;
}
}
    else if (count > 0) { /* create MPI derived types describing the vector I/O */
/* sort the vector I/O request into increasing address order if required
*
* If the vector is already sorted, the base addresses of types, addrs, sizes,
* and bufs will be returned in s_types, s_addrs, s_sizes, and s_bufs respectively.
*
* If the vector was not already sorted, new, sorted versions of types, addrs, sizes, and bufs
* are allocated, populated, and returned in s_types, s_addrs, s_sizes, and s_bufs respectively.
* In this case, this function must free the memory allocated for the sorted vectors.
*/
if (H5FD_sort_vector_io_req(vector_was_sorted, count, types, addrs, sizes, bufs, &s_types, s_addrs,
s_sizes, s_bufs) < 0)
HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "can't sort vector I/O request");
if ((NULL == (mpi_block_lengths = (int *)malloc((size_t)count * sizeof(int)))) ||
(NULL == (mpi_displacements = (MPI_Aint *)malloc((size_t)count * sizeof(MPI_Aint)))) ||
(NULL == (mpi_bufs = (MPI_Aint *)malloc((size_t)count * sizeof(MPI_Aint))))) {
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc mpi block lengths / displacement");
}
/* when we setup mpi_bufs[] below, all addresses are offsets from
* mpi_bufs_base.
*
* Since these offsets must all be positive, we must scan through
* s_bufs[] to find the smallest value, and choose that for
* mpi_bufs_base.
*/
j = 0; /* guess at the index of the smallest value of s_bufs[] */
for (i = 1; i < (int)count; i++) {
if ((*s_bufs)[i].cvp < (*s_bufs)[j].cvp) {
j = i;
}
}
*mpi_bufs_base = (*s_bufs)[j];
if (MPI_SUCCESS != (mpi_code = MPI_Get_address(mpi_bufs_base->cvp, &mpi_bufs_base_Aint)))
HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] to mpi_bufs_base failed", mpi_code)
*size_i = 1;
fixed_size = false;
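        /* Note: the sizes[] array may use a fixed-size shorthand -- a 0 entry
         * means that this and all remaining vector elements reuse the previous
         * entry's size. Illustrative example: count = 5 with sizes = {512, 512, 0}
         * describes five 512-byte blocks, so only the first two entries of
         * s_sizes are meaningful and s_sizes_len is reported as 2. */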
/* load the mpi_block_lengths and mpi_displacements arrays */
for (i = 0; i < (int)count; i++) {
/* Determine size of this vector element */
if (!fixed_size) {
if ((*s_sizes)[i] == 0) {
                    assert(*vector_was_sorted);
assert(i > 0);
fixed_size = true;
size = sizes[i - 1];
/* Return the used length of the s_sizes buffer */
if (s_sizes_len)
*s_sizes_len = (uint32_t)i;
}
else {
size = (*s_sizes)[i];
}
}
/* Add to block lengths and displacements arrays */
mpi_block_lengths[i] = (int)size;
mpi_displacements[i] = (MPI_Aint)(*s_addrs)[i];
/* convert s_bufs[i] to MPI_Aint... */
if (MPI_SUCCESS != (mpi_code = MPI_Get_address((*s_bufs)[i].cvp, &(mpi_bufs[i]))))
HMPI_GOTO_ERROR(FAIL, "MPI_Get_address for s_bufs[] - mpi_bufs_base failed", mpi_code)
/*... and then subtract mpi_bufs_base_Aint from it. */
#if H5_CHECK_MPI_VERSION(3, 1)
mpi_bufs[i] = MPI_Aint_diff(mpi_bufs[i], mpi_bufs_base_Aint);
#else
mpi_bufs[i] = mpi_bufs[i] - mpi_bufs_base_Aint;
#endif
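            /* mpi_bufs[i] now holds the non-negative byte offset of s_bufs[i]
             * from mpi_bufs_base (guaranteed by the minimum-address scan above);
             * MPI_Aint_diff() is the portable MPI 3.1+ way to subtract two
             * absolute addresses. */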
/* Check for size overflow */
if (size > bigio_count) {
/* We need to work around the integer size limit of 2GB. The input size_t size
* variable cannot fit into an integer, but we can get around that limitation by
* creating a different datatype and then setting the integer size (or element
* count) to 1 when using the derived_type. */
/* Allocate arrays to keep track of types and whether they were created, if
* necessary */
if (!sub_types) {
assert(!sub_types_created);
if (NULL == (sub_types = malloc((size_t)count * sizeof(MPI_Datatype))))
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types array");
if (NULL == (sub_types_created = (uint8_t *)calloc((size_t)count, 1))) {
H5MM_free(sub_types);
sub_types = NULL;
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, "can't alloc sub types created array");
}
/* Initialize sub_types to all MPI_BYTE */
for (j = 0; j < (int)count; j++)
sub_types[j] = MPI_BYTE;
}
assert(sub_types_created);
/* Create type for large block */
if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &sub_types[i]) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype");
sub_types_created[i] = true;
/* Only one of these large types for this vector element */
mpi_block_lengths[i] = 1;
}
else
assert(size == (size_t)mpi_block_lengths[i]);
}
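        /* If any block exceeded bigio_count, the per-block datatypes differ
         * (large blocks use derived sub-types, the rest are runs of MPI_BYTE),
         * so MPI_Type_create_struct() is required; otherwise every block is a
         * run of MPI_BYTEs and the simpler hindexed constructor suffices. */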
/* create the memory MPI derived types */
if (sub_types) {
if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths, mpi_bufs,
sub_types, buf_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for buf_type failed", mpi_code)
}
else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths, mpi_bufs,
MPI_BYTE, buf_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for buf_type failed", mpi_code)
*buf_type_created = true;
if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(buf_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for buf_type failed", mpi_code)
/* create the file MPI derived type */
if (sub_types) {
if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)count, mpi_block_lengths,
mpi_displacements, sub_types, file_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct for file_type failed", mpi_code)
}
else if (MPI_SUCCESS != (mpi_code = MPI_Type_create_hindexed((int)count, mpi_block_lengths,
mpi_displacements, MPI_BYTE, file_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed for file_type failed", mpi_code)
*file_type_created = true;
if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(file_type)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit for file_type failed", mpi_code)
/* Free up memory used to build types */
assert(mpi_block_lengths);
free(mpi_block_lengths);
mpi_block_lengths = NULL;
assert(mpi_displacements);
free(mpi_displacements);
mpi_displacements = NULL;
assert(mpi_bufs);
free(mpi_bufs);
mpi_bufs = NULL;
if (sub_types) {
            assert(sub_types_created);
for (i = 0; i < (int)count; i++)
if (sub_types_created[i])
MPI_Type_free(&sub_types[i]);
free(sub_types);
sub_types = NULL;
free(sub_types_created);
sub_types_created = NULL;
}
/* some numeric conversions */
if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0");
}
else {
/* setup for null participation in the collective operation. */
*buf_type = MPI_BYTE;
*file_type = MPI_BYTE;
/* Set non-NULL pointer for I/O operation */
mpi_bufs_base->vp = unused;
/* MPI count to read */
*size_i = 0;
/* some numeric conversions */
if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0");
}
done:
/* free sorted vectors if they exist */
if (!vector_was_sorted)
if (s_types) {
free(s_types);
s_types = NULL;
}
/* Clean up on error */
if (ret_value < 0) {
if (mpi_block_lengths) {
free(mpi_block_lengths);
mpi_block_lengths = NULL;
}
if (mpi_displacements) {
free(mpi_displacements);
mpi_displacements = NULL;
}
if (mpi_bufs) {
free(mpi_bufs);
mpi_bufs = NULL;
}
if (sub_types) {
assert(sub_types_created);
for (i = 0; i < (int)count; i++)
if (sub_types_created[i])
MPI_Type_free(&sub_types[i]);
free(sub_types);
sub_types = NULL;
free(sub_types_created);
sub_types_created = NULL;
}
}
/* Make sure we cleaned up */
assert(!mpi_block_lengths);
assert(!mpi_displacements);
assert(!mpi_bufs);
assert(!sub_types);
assert(!sub_types_created);
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_vector_build_types() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_read_vector()
*
* Purpose: The behavior of this function depends on the value of
* the io_xfer_mode obtained from the context.
*
* If it is H5FD_MPIO_COLLECTIVE, this is a collective
* operation, which allows us to use MPI_File_set_view, and
* then perform the entire vector read in a single MPI call.
*
 *              Do this (if count is positive) by constructing memory
 *              and file derived types from the supplied vector, using the
 *              file type to set the file view, and then reading the
 *              memory type from file. Note that this read is
* either independent or collective depending on the
* value of mpio_coll_opt -- again obtained from the context.
*
* If count is zero, participate in the collective read
* (if so configured) with an empty read.
*
* Finally, set the file view back to its default state.
*
* In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT,
* this call is independent, and thus we cannot use
* MPI_File_set_view().
*
* In this case, simply walk the vector, and issue an
* independent read for each entry.
*
* Return: Success: SUCCEED.
* Failure: FAIL.
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_read_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[],
haddr_t addrs[], size_t sizes[], void *bufs[])
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
bool vector_was_sorted = true;
haddr_t *s_addrs = NULL;
size_t *s_sizes = NULL;
void **s_bufs = NULL;
char unused = 0; /* Unused, except for non-NULL pointer value */
void *mpi_bufs_base = NULL;
MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
bool buf_type_created = false;
MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */
bool file_type_created = false;
int i;
int mpi_code; /* MPI return code */
MPI_Offset mpi_off = 0;
MPI_Status mpi_stat; /* Status from I/O operation */
H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */
int size_i;
MPI_Count bytes_read = 0; /* Number of bytes read in */
MPI_Count type_size; /* MPI datatype used for I/O's size */
MPI_Count io_size; /* Actual number of bytes requested */
MPI_Count n;
bool rank0_bcast = false; /* If read-with-rank0-and-bcast flag was used */
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
bool H5FD_mpio_debug_r_flag = (H5FD_mpio_debug_flags_s[(int)'r'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
assert((types) || (count == 0));
assert((addrs) || (count == 0));
assert((sizes) || (count == 0));
assert((bufs) || (count == 0));
/* verify that the first elements of the sizes and types arrays are
* valid.
*/
assert((count == 0) || (sizes[0] != 0));
assert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
/* Get the transfer mode from the API context
*
* This flag is set to H5FD_MPIO_COLLECTIVE if the API call is
* collective, and to H5FD_MPIO_INDEPENDENT if it is not.
*
* While this doesn't mean that we are actually about to do a collective
* read, it does mean that all ranks are here, so we can use MPI_File_set_view().
*/
if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
/* Build MPI types, etc. */
if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs,
&s_addrs, &s_sizes, NULL, (H5_flexible_const_ptr_t **)&s_bufs,
&vector_was_sorted, &mpi_off,
(H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type,
&buf_type_created, &file_type, &file_type_created, &unused) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O");
/* free sorted addrs vector if it exists */
if (!vector_was_sorted)
if (s_addrs) {
free(s_addrs);
s_addrs = NULL;
}
/* Portably initialize MPI status variable */
memset(&mpi_stat, 0, sizeof(mpi_stat));
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i);
#endif
/* Setup the file view. */
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
/* Reset mpi_off to 0 since the view now starts at the data offset */
if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0");
/* Get the collective_opt property to check whether the application wants to do IO individually.
*/
if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property");
/* Read the data. */
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stdout, "%s: using MPIO collective mode\n", __func__);
#endif
if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stdout, "%s: doing MPI collective IO\n", __func__);
#endif
/* Check whether we should read from rank 0 and broadcast to other ranks */
if (H5CX_get_mpio_rank0_bcast()) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stdout, "%s: doing read-rank0-and-MPI_Bcast\n", __func__);
#endif
/* Indicate path we've taken */
rank0_bcast = true;
                /* Read on rank 0, then Bcast to other ranks */
if (file->mpi_rank == 0)
if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i,
buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(mpi_bufs_base, size_i, buf_type, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
} /* end if */
else if (MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code)
} /* end if */
else if (size_i > 0) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stdout, "%s: doing MPI independent IO\n", __func__);
#endif
if (MPI_SUCCESS !=
(mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
} /* end else */
/* Reset the file view */
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
/* Only retrieve bytes read if this rank _actually_ participated in I/O */
if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
/* How many bytes were actually read? */
if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, buf_type, &bytes_read)))
HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
} /* end if */
/* If the rank0-bcast feature was used, broadcast the # of bytes read to
* other ranks, which didn't perform any I/O.
*/
/* NOTE: This could be optimized further to be combined with the broadcast
* of the data. (QAK - 2019/1/2)
* Or have rank 0 clear the unread parts of the buffer prior to
* the bcast. (NAF - 2021/9/15)
*/
if (rank0_bcast)
if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0)
/* Get the type's size */
if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(buf_type, &type_size)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code)
/* Compute the actual number of bytes requested */
io_size = type_size * size_i;
/* Check for read failure */
if (bytes_read < 0 || bytes_read > io_size)
HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed");
/* Check for incomplete read */
n = io_size - bytes_read;
if (n > 0) {
i = (int)count - 1;
/* Iterate over sorted array in reverse, filling in zeroes to
* sections of the buffers that were not read to */
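            /* Illustrative example: with s_sizes = {100, 200} and a short read
             * of 250 of the 300 requested bytes, n = 50, so the final 50 bytes
             * of s_bufs[1] are zeroed and the loop stops. */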
do {
assert(i >= 0);
io_size = MIN(n, (MPI_Count)s_sizes[i]);
bytes_read = (MPI_Count)s_sizes[i] - io_size;
assert(bytes_read >= 0);
memset((char *)s_bufs[i] + bytes_read, 0, (size_t)io_size);
n -= io_size;
i--;
} while (n > 0);
}
}
else if (count > 0) {
haddr_t max_addr = HADDR_MAX;
bool fixed_size = false;
size_t size;
/* The read is part of an independent operation. As a result,
         * we can't use MPI_File_set_view() (since it is a collective operation),
* and thus we can't use the above code to construct the MPI datatypes.
* In the future, we could write code to detect when a contiguous slab
* in the file selection spans multiple vector elements and construct a
* memory datatype to match this larger block in the file, but for now
* just read in each element of the vector in a separate
* MPI_File_read_at() call.
*
* We could also just detect the case when the entire file selection is
* contiguous, which would allow us to use
* H5FD__mpio_vector_build_types() to construct the memory datatype.
*/
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stdout, "%s: doing MPI independent IO\n", __func__);
#endif
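        /* sizes[] may use the same fixed-size shorthand as the collective
         * path: a 0 entry means the remaining elements reuse the previous
         * size. */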
/* Loop over vector elements */
for (i = 0; i < (int)count; i++) {
/* Convert address to mpi offset */
if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off");
/* Calculate I/O size */
if (!fixed_size) {
if (sizes[i] == 0) {
fixed_size = true;
size = sizes[i - 1];
}
else {
size = sizes[i];
}
}
size_i = (int)size;
if (size != (size_t)size_i) {
/* If HERE, then we need to work around the integer size limit
* of 2GB. The input size_t size variable cannot fit into an integer,
* but we can get around that limitation by creating a different datatype
* and then setting the integer size (or element count) to 1 when using
* the derived_type.
*/
if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype");
buf_type_created = true;
size_i = 1;
}
/* Check if we actually need to do I/O */
if (addrs[i] < max_addr) {
/* Portably initialize MPI status variable */
memset(&mpi_stat, 0, sizeof(mpi_stat));
/* Issue read */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_read_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code)
/* How many bytes were actually read? */
if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, MPI_BYTE, &bytes_read)))
HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code)
/* Compute the actual number of bytes requested */
io_size = (MPI_Count)size;
/* Check for read failure */
if (bytes_read < 0 || bytes_read > io_size)
HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed");
/*
                 * If the read came up short, we've hit the end of the physical
                 * MPI file: zero-fill the remainder of the buffer and don't
                 * issue any more reads at higher addresses.
*/
if ((n = (io_size - bytes_read)) > 0) {
memset((char *)bufs[i] + bytes_read, 0, (size_t)n);
max_addr = addrs[i] + (haddr_t)bytes_read;
}
}
else {
/* Read is past the max address, fill in zeroes */
memset((char *)bufs[i], 0, size);
}
}
}
done:
if (buf_type_created) {
MPI_Type_free(&buf_type);
}
if (file_type_created) {
MPI_Type_free(&file_type);
}
/* free sorted vectors if they exist */
if (!vector_was_sorted) {
if (s_addrs) {
free(s_addrs);
s_addrs = NULL;
}
if (s_sizes) {
free(s_sizes);
s_sizes = NULL;
}
if (s_bufs) {
free(s_bufs);
s_bufs = NULL;
}
}
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_read_vector() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_write_vector
*
* Purpose: The behavior of this function depends on the value of
* the io_xfer_mode obtained from the context.
*
* If it is H5FD_MPIO_COLLECTIVE, this is a collective
* operation, which allows us to use MPI_File_set_view, and
* then perform the entire vector write in a single MPI call.
*
 *              Do this (if count is positive) by constructing memory
 *              and file derived types from the supplied vector, using the
 *              file type to set the file view, and then writing the
 *              memory type to file. Note that this write is
* either independent or collective depending on the
* value of mpio_coll_opt -- again obtained from the context.
*
* If count is zero, participate in the collective write
* (if so configured) with an empty write.
*
* Finally, set the file view back to its default state.
*
* In contrast, if io_xfer_mode is H5FD_MPIO_INDEPENDENT,
* this call is independent, and thus we cannot use
* MPI_File_set_view().
*
* In this case, simply walk the vector, and issue an
* independent write for each entry.
*
* Return: Success: SUCCEED.
* Failure: FAIL.
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_write_vector(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, uint32_t count, H5FD_mem_t types[],
haddr_t addrs[], size_t sizes[], const void *bufs[])
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
bool vector_was_sorted = true;
haddr_t *s_addrs = NULL;
size_t *s_sizes = NULL;
const void **s_bufs = NULL;
char unused = 0; /* Unused, except for non-NULL pointer value */
const void *mpi_bufs_base = NULL;
MPI_Datatype buf_type = MPI_BYTE; /* MPI description of the selection in memory */
bool buf_type_created = false;
MPI_Datatype file_type = MPI_BYTE; /* MPI description of the selection in file */
bool file_type_created = false;
int i;
int mpi_code; /* MPI return code */
MPI_Offset mpi_off = 0;
MPI_Status mpi_stat; /* Status from I/O operation */
H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
H5FD_mpio_collective_opt_t coll_opt_mode; /* whether we are doing collective or independent I/O */
int size_i;
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
bool H5FD_mpio_debug_w_flag = (H5FD_mpio_debug_flags_s[(int)'w'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
haddr_t max_addr = 0;
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
assert((types) || (count == 0));
assert((addrs) || (count == 0));
assert((sizes) || (count == 0));
assert((bufs) || (count == 0));
/* verify that the first elements of the sizes and types arrays are
* valid.
*/
assert((count == 0) || (sizes[0] != 0));
assert((count == 0) || (types[0] != H5FD_MEM_NOLIST));
    /* Verify that no data is written while the file is between MPI_Barrier()s during a flush */
assert(!H5CX_get_mpi_file_flushing());
/* Get the transfer mode from the API context
*
* This flag is set to H5FD_MPIO_COLLECTIVE if the API call is
* collective, and to H5FD_MPIO_INDEPENDENT if it is not.
*
* While this doesn't mean that we are actually about to do a collective
* write, it does mean that all ranks are here, so we can use MPI_File_set_view().
*/
if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
uint32_t s_sizes_len;
/* Build MPI types, etc. */
if (H5FD__mpio_vector_build_types(count, types, addrs, sizes, (H5_flexible_const_ptr_t *)bufs,
&s_addrs, &s_sizes, &s_sizes_len,
(H5_flexible_const_ptr_t **)&s_bufs, &vector_was_sorted, &mpi_off,
(H5_flexible_const_ptr_t *)&mpi_bufs_base, &size_i, &buf_type,
&buf_type_created, &file_type, &file_type_created, &unused) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't build MPI datatypes for I/O");
        /* Compute the max address written to. Note s_sizes is indexed using the
         * length reported by H5FD__mpio_vector_build_types(), which may be
         * shorter than count when the fixed-size (trailing zero) shorthand is
         * used. */
if (count > 0)
max_addr = s_addrs[count - 1] + (haddr_t)(s_sizes[s_sizes_len - 1]);
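        /* Illustrative example: with count = 5 and sizes = {512, 512, 0},
         * s_sizes_len is reported as 2, so the size of the last (fifth) block
         * is taken from s_sizes[1]. */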
/* free sorted vectors if they exist */
if (!vector_was_sorted) {
if (s_addrs) {
free(s_addrs);
s_addrs = NULL;
}
if (s_sizes) {
free(s_sizes);
s_sizes = NULL;
}
if (s_bufs) {
free(s_bufs);
s_bufs = NULL;
}
}
/* Portably initialize MPI status variable */
memset(&mpi_stat, 0, sizeof(MPI_Status));
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stdout, "%s: mpi_off = %ld size_i = %d\n", __func__, (long)mpi_off, size_i);
#endif
/* Setup the file view. */
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, file_type,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
/* Reset mpi_off to 0 since the view now starts at the data offset */
if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0");
/* Get the collective_opt property to check whether the application wants to do IO individually.
*/
if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property");
/* Write the data. */
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stdout, "%s: using MPIO collective mode\n", __func__);
#endif
if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stdout, "%s: doing MPI collective IO\n", __func__);
#endif
if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
/* Do MPI_File_sync when needed by underlying ROMIO driver */
if (file->mpi_file_sync_required) {
if (MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code)
}
} /* end if */
else if (size_i > 0) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stdout, "%s: doing MPI independent IO\n", __func__);
#endif
if (MPI_SUCCESS !=
(mpi_code = MPI_File_write_at(file->f, mpi_off, mpi_bufs_base, size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
} /* end else */
/* Reset the file view */
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
}
else if (count > 0) {
bool fixed_size = false;
size_t size;
        /* The write is part of an independent operation. As a result,
         * we can't use MPI_File_set_view() (since it is a collective operation),
         * and thus we can't use the above code to construct the MPI datatypes.
         * In the future, we could write code to detect when a contiguous slab
         * in the file selection spans multiple vector elements and construct a
         * memory datatype to match this larger block in the file, but for now
         * just write each element of the vector in a separate
         * MPI_File_write_at() call.
*
* We could also just detect the case when the entire file selection is
* contiguous, which would allow us to use
* H5FD__mpio_vector_build_types() to construct the memory datatype.
*/
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stdout, "%s: doing MPI independent IO\n", __func__);
#endif
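        /* sizes[] may use the fixed-size shorthand here as well: a 0 entry
         * means the remaining elements reuse the previous size. */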
/* Loop over vector elements */
for (i = 0; i < (int)count; i++) {
/* Convert address to mpi offset */
if (H5FD_mpi_haddr_to_MPIOff(addrs[i], &mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off");
/* Calculate I/O size */
if (!fixed_size) {
if (sizes[i] == 0) {
fixed_size = true;
size = sizes[i - 1];
}
else {
size = sizes[i];
}
}
size_i = (int)size;
if (size != (size_t)size_i) {
/* If HERE, then we need to work around the integer size limit
* of 2GB. The input size_t size variable cannot fit into an integer,
* but we can get around that limitation by creating a different datatype
* and then setting the integer size (or element count) to 1 when using
* the derived_type.
*/
if (H5_mpio_create_large_type(size, 0, MPI_BYTE, &buf_type) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_CANTGET, FAIL, "can't create MPI-I/O datatype");
buf_type_created = true;
size_i = 1;
}
/* Perform write */
if (MPI_SUCCESS !=
(mpi_code = MPI_File_write_at(file->f, mpi_off, bufs[i], size_i, buf_type, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code)
/* Check if this is the highest address written to so far */
if (addrs[i] + size > max_addr)
max_addr = addrs[i] + size;
}
}
/* Each process will keep track of its perceived EOF value locally, and
* ultimately we will reduce this value to the maximum amongst all
* processes, but until then keep the actual eof at HADDR_UNDEF just in
     * case something bad happens before that point. (We'd rather have a
     * value we know is wrong sitting around than one that could only
* potentially be wrong.)
*/
file->eof = HADDR_UNDEF;
/* check to see if the local eof has been extended, and update if so */
if (max_addr > file->local_eof)
file->local_eof = max_addr;
done:
if (buf_type_created)
MPI_Type_free(&buf_type);
if (file_type_created)
MPI_Type_free(&file_type);
/* Cleanup on error */
if (ret_value < 0 && !vector_was_sorted) {
if (s_addrs) {
free(s_addrs);
s_addrs = NULL;
}
if (s_sizes) {
free(s_sizes);
s_sizes = NULL;
}
if (s_bufs) {
free(s_bufs);
s_bufs = NULL;
}
}
/* Make sure we cleaned up */
assert(vector_was_sorted || !s_addrs);
assert(vector_was_sorted || !s_sizes);
assert(vector_was_sorted || !s_bufs);
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stdout, "%s: Leaving, proc %d: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_write_vector() */
/*-------------------------------------------------------------------------
* Function: H5FD__selection_build_types
*
 * Purpose:     Build an MPI derived datatype for each piece and then
 *              build the final MPI derived datatypes for file and memory.
*
* Note: This is derived from H5D__link_piece_collective_io() in
* src/H5Dmpio.c.
*
* Return: Non-negative on success/Negative on failure
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__selection_build_types(bool io_op_write, size_t num_pieces, H5_flexible_const_ptr_t mbb,
H5S_t **file_spaces, H5S_t **mem_spaces, haddr_t offsets[],
H5_flexible_const_ptr_t bufs[], size_t src_element_sizes[],
size_t dst_element_sizes[], MPI_Datatype *final_ftype,
bool *final_ftype_is_derived, MPI_Datatype *final_mtype,
bool *final_mtype_is_derived)
{
MPI_Datatype *piece_mtype = NULL;
MPI_Datatype *piece_ftype = NULL;
MPI_Aint *piece_file_disp_array = NULL;
MPI_Aint *piece_mem_disp_array = NULL;
    bool *piece_mft_is_derived_array = NULL; /* Flags to indicate each piece's MPI file datatype is derived */
    bool *piece_mmt_is_derived_array = NULL; /* Flags to indicate each piece's MPI memory datatype is derived */
int *piece_mpi_file_counts = NULL; /* Count of MPI file datatype for each piece */
int *piece_mpi_mem_counts = NULL; /* Count of MPI memory datatype for each piece */
haddr_t base_file_addr;
size_t i; /* Local index variable */
int mpi_code; /* MPI return code */
bool extend_src_sizes = false;
bool extend_dst_sizes = false;
bool extend_bufs = false;
H5_flexible_const_ptr_t buf;
size_t src_element_size, dst_element_size;
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
/* Allocate information for num_pieces */
if (NULL == (piece_mtype = (MPI_Datatype *)H5MM_malloc(num_pieces * sizeof(MPI_Datatype))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece memory datatype buffer");
if (NULL == (piece_ftype = (MPI_Datatype *)H5MM_malloc(num_pieces * sizeof(MPI_Datatype))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece file datatype buffer");
if (NULL == (piece_file_disp_array = (MPI_Aint *)H5MM_malloc(num_pieces * sizeof(MPI_Aint))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece file displacement buffer");
if (NULL == (piece_mem_disp_array = (MPI_Aint *)H5MM_calloc(num_pieces * sizeof(MPI_Aint))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece memory displacement buffer");
if (NULL == (piece_mpi_mem_counts = (int *)H5MM_calloc(num_pieces * sizeof(int))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece memory counts buffer");
if (NULL == (piece_mpi_file_counts = (int *)H5MM_calloc(num_pieces * sizeof(int))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL, "couldn't allocate piece file counts buffer");
if (NULL == (piece_mmt_is_derived_array = (bool *)H5MM_calloc(num_pieces * sizeof(bool))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
"couldn't allocate piece memory is derived datatype flags buffer");
if (NULL == (piece_mft_is_derived_array = (bool *)H5MM_calloc(num_pieces * sizeof(bool))))
HGOTO_ERROR(H5E_DATASET, H5E_CANTALLOC, FAIL,
"couldn't allocate piece file is derived datatype flags buffer");
/* save lowest file address */
base_file_addr = offsets[0];
/* Obtain MPI derived datatype from all individual pieces */
/* Iterate over selected pieces for this process */
for (i = 0; i < num_pieces; i++) {
hsize_t *permute_map = NULL; /* array that holds the mapping from the old,
out-of-order displacements to the in-order
displacements of the MPI datatypes of the
point selection of the file space */
bool is_permuted = false;
if (!extend_src_sizes) {
if (src_element_sizes[i] == 0) {
extend_src_sizes = true;
src_element_size = src_element_sizes[i - 1];
}
else
src_element_size = src_element_sizes[i];
}
if (!extend_dst_sizes) {
if (dst_element_sizes[i] == 0) {
extend_dst_sizes = true;
dst_element_size = dst_element_sizes[i - 1];
}
else
                dst_element_size = dst_element_sizes[i];
}
if (!extend_bufs) {
if (bufs[i].cvp == NULL) {
extend_bufs = true;
buf = bufs[i - 1];
}
else
buf = bufs[i];
}
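        /* Like the element-size arrays, bufs[] may be terminated early: a NULL
         * entry means this and all remaining pieces reuse the previous buffer
         * pointer. */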
/* Obtain disk and memory MPI derived datatype */
/* NOTE: The permute_map array can be allocated within H5S_mpio_space_type
* and will be fed into the next call to H5S_mpio_space_type
* where it will be freed.
*/
if (H5S_mpio_space_type(file_spaces[i], src_element_size, &piece_ftype[i], /* OUT: datatype created */
&piece_mpi_file_counts[i], /* OUT */
&(piece_mft_is_derived_array[i]), /* OUT */
true, /* this is a file space,
so permute the
datatype if the point
selections are out of
order */
&permute_map, /* OUT: a map to indicate the
permutation of points
selected in case they
are out of order */
&is_permuted /* OUT */) < 0)
HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI file type");
/* Sanity check */
if (is_permuted)
assert(permute_map);
if (H5S_mpio_space_type(mem_spaces[i], dst_element_size, &piece_mtype[i], &piece_mpi_mem_counts[i],
&(piece_mmt_is_derived_array[i]), false, /* this is a memory
space, so if the file
space is not
permuted, there is no
need to permute the
datatype if the point
selections are out of
order*/
&permute_map, /* IN: the permutation map
generated by the
file_space selection
and applied to the
memory selection */
&is_permuted /* IN */) < 0)
HGOTO_ERROR(H5E_DATASPACE, H5E_BADTYPE, FAIL, "couldn't create MPI buf type");
/* Sanity check */
if (is_permuted)
assert(!permute_map);
/* Piece address relative to the first piece addr
* Assign piece address to MPI displacement
* (assume MPI_Aint big enough to hold it) */
piece_file_disp_array[i] = (MPI_Aint)offsets[i] - (MPI_Aint)base_file_addr;
if (io_op_write) {
piece_mem_disp_array[i] = (MPI_Aint)buf.cvp - (MPI_Aint)mbb.cvp;
}
else {
piece_mem_disp_array[i] = (MPI_Aint)buf.vp - (MPI_Aint)mbb.vp;
}
} /* end for */
/* Create final MPI derived datatype for the file */
if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)num_pieces, piece_mpi_file_counts,
piece_file_disp_array, piece_ftype, final_ftype)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code);
if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(final_ftype)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
*final_ftype_is_derived = true;
/* Create final MPI derived datatype for memory */
if (MPI_SUCCESS != (mpi_code = MPI_Type_create_struct((int)num_pieces, piece_mpi_mem_counts,
piece_mem_disp_array, piece_mtype, final_mtype)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_struct failed", mpi_code);
if (MPI_SUCCESS != (mpi_code = MPI_Type_commit(final_mtype)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code);
*final_mtype_is_derived = true;
/* Free the file & memory MPI datatypes for each piece */
for (i = 0; i < num_pieces; i++) {
if (piece_mmt_is_derived_array[i])
if (MPI_SUCCESS != (mpi_code = MPI_Type_free(piece_mtype + i)))
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
if (piece_mft_is_derived_array[i])
if (MPI_SUCCESS != (mpi_code = MPI_Type_free(piece_ftype + i)))
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
} /* end for */
done:
/* Release resources */
if (piece_mtype)
H5MM_xfree(piece_mtype);
if (piece_ftype)
H5MM_xfree(piece_ftype);
if (piece_file_disp_array)
H5MM_xfree(piece_file_disp_array);
if (piece_mem_disp_array)
H5MM_xfree(piece_mem_disp_array);
if (piece_mpi_mem_counts)
H5MM_xfree(piece_mpi_mem_counts);
if (piece_mpi_file_counts)
H5MM_xfree(piece_mpi_file_counts);
if (piece_mmt_is_derived_array)
H5MM_xfree(piece_mmt_is_derived_array);
if (piece_mft_is_derived_array)
H5MM_xfree(piece_mft_is_derived_array);
FUNC_LEAVE_NOAPI(ret_value);
} /* H5FD__selection_build_types() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_read_selection
*
* Purpose: The behavior of this function depends on the value of
* the transfer mode obtained from the context.
*
* If the transfer mode is H5FD_MPIO_COLLECTIVE:
* --sort the selections
* --set mpi_bufs_base
* --build the MPI derived types
* --perform MPI_File_set_view()
* --perform MPI_File_read_at_all() or MPI_File_read_at()
* depending on whether this is a H5FD_MPIO_COLLECTIVE_IO
*
* If this is not H5FD_MPIO_COLLECTIVE:
* --undo possible base address addition in internal routines
 *              --call H5FD_read_from_selection() to perform vector
 *                or scalar reads for the selections
*
* Return: Success: SUCCEED.
* Failure: FAIL.
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_read_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, size_t count,
hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[],
size_t element_sizes[], void *bufs[] /* out */)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
MPI_Offset mpi_off;
MPI_Status mpi_stat; /* Status from I/O operation */
int size_i; /* Integer copy of 'size' to read */
H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
H5FD_mpio_collective_opt_t coll_opt_mode;
MPI_Datatype final_mtype; /* Final memory MPI datatype for all pieces with selection */
bool final_mtype_is_derived = false;
MPI_Datatype final_ftype; /* Final file MPI datatype for all pieces with selection */
bool final_ftype_is_derived = false;
hid_t *s_mem_space_ids = NULL;
hid_t *s_file_space_ids = NULL;
haddr_t *s_offsets = NULL;
size_t *s_element_sizes = NULL;
H5_flexible_const_ptr_t *s_bufs = NULL;
bool selection_was_sorted = true;
uint32_t i, j;
H5S_t **s_mem_spaces = NULL;
H5S_t **s_file_spaces = NULL;
haddr_t tmp_offset = 0;
void *mpi_bufs_base = NULL;
char unused = 0; /* Unused, except for non-NULL pointer value */
MPI_Count bytes_read = 0; /* Number of bytes read in */
MPI_Count type_size; /* MPI datatype used for I/O's size */
MPI_Count io_size; /* Actual number of bytes requested */
MPI_Count n;
bool rank0_bcast = false; /* If read-with-rank0-and-bcast flag was used */
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
bool H5FD_mpio_debug_r_flag = (H5FD_mpio_debug_flags_s[(int)'r'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
int mpi_code; /* MPI return code */
H5_flexible_const_ptr_t mbb;
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
assert((count == 0) || (mem_space_ids));
assert((count == 0) || (file_space_ids));
assert((count == 0) || (offsets));
assert((count == 0) || (element_sizes));
assert((count == 0) || (bufs));
/* Verify that the first elements of the element_sizes and bufs arrays are
* valid. */
assert((count == 0) || (element_sizes[0] != 0));
assert((count == 0) || (bufs[0] != NULL));
/* Portably initialize MPI status variable */
memset(&mpi_stat, 0, sizeof(MPI_Status));
/* Get the transfer mode from the API context */
if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
    /*
     * Set up for a fancy xfer using complex types, or a single byte block.
     * (MPI semantics don't let us test whether btype = ftype = MPI_BYTE, so
     * we always set and then reset the file view for the collective path.)
     */
if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
if (count) {
if (H5FD_sort_selection_io_req(&selection_was_sorted, count, mem_space_ids, file_space_ids,
offsets, element_sizes, (H5_flexible_const_ptr_t *)bufs,
&s_mem_space_ids, &s_file_space_ids, &s_offsets, &s_element_sizes,
&s_bufs) < 0)
HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't sort selection I/O request");
tmp_offset = s_offsets[0];
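            /* The lowest (first sorted) file offset becomes the displacement
             * passed to MPI_File_set_view() below; H5FD__selection_build_types()
             * computes every piece's file displacement relative to it. */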
if (NULL == (s_file_spaces = H5MM_malloc(count * sizeof(H5S_t *))))
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"memory allocation failed for file space list");
if (NULL == (s_mem_spaces = H5MM_malloc(count * sizeof(H5S_t *))))
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"memory allocation failed for memory space list");
for (i = 0; i < count; i++) {
if (NULL == (s_mem_spaces[i] = (H5S_t *)H5I_object_verify(s_mem_space_ids[i], H5I_DATASPACE)))
                    HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, FAIL,
"can't retrieve memory dataspace from ID");
if (NULL ==
(s_file_spaces[i] = (H5S_t *)H5I_object_verify(s_file_space_ids[i], H5I_DATASPACE)))
                    HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, FAIL,
"can't retrieve file dataspace from ID");
}
/* when we setup mpi_bufs[] below, all addresses are offsets from
* mpi_bufs_base.
*
* Since these offsets must all be positive, we must scan through
* s_bufs[] to find the smallest value, and choose that for
* mpi_bufs_base.
*/
j = 0; /* guess at the index of the smallest value of s_bufs[] */
if ((count > 1) && (s_bufs[1].vp != NULL)) {
for (i = 1; i < count; i++)
if (s_bufs[i].vp < s_bufs[j].vp)
j = i;
}
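            /* If s_bufs[1] is NULL, the NULL-extension shorthand means every
             * piece reuses s_bufs[0], so index 0 is already the smallest
             * address. */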
mpi_bufs_base = s_bufs[j].vp;
mbb.vp = mpi_bufs_base;
if (H5FD__selection_build_types(false, count, mbb, s_file_spaces, s_mem_spaces, s_offsets, s_bufs,
s_element_sizes, s_element_sizes, &final_ftype,
&final_ftype_is_derived, &final_mtype,
&final_mtype_is_derived) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't build type for MPI-IO");
/* We have a single, complicated MPI datatype for both memory & file */
size_i = 1;
}
else {
/* No chunks selected for this process */
size_i = 0;
mpi_bufs_base = &unused;
/* Set the MPI datatype */
final_ftype = MPI_BYTE;
final_mtype = MPI_BYTE;
}
/* some numeric conversions */
if (H5FD_mpi_haddr_to_MPIOff(tmp_offset, &mpi_off /*out*/) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off");
/*
* Set the file view when we are using MPI derived types
*/
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, final_ftype,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code);
/* When using types, use the address as the displacement for
* MPI_File_set_view and reset the address for the read to zero
*/
/* Reset mpi_off to 0 since the view now starts at the data offset */
if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0");
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) using MPIO collective mode\n", __func__, file->mpi_rank);
#endif
/* Get the collective_opt property to check whether the application wants to do IO individually. */
if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property");
if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) doing MPI collective IO\n", __func__, file->mpi_rank);
#endif
/* Check whether we should read from rank 0 and broadcast to other ranks */
if (H5CX_get_mpio_rank0_bcast()) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) doing read-rank0-and-MPI_Bcast\n", __func__, file->mpi_rank);
#endif
/* Indicate path we've taken */
rank0_bcast = true;
            /* Read on rank 0, then Bcast to other ranks */
if (file->mpi_rank == 0) {
/* If MPI_File_read_at fails, push an error, but continue
* to participate in following MPI_Bcast */
if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i,
final_mtype, &mpi_stat)))
HMPI_DONE_ERROR(FAIL, "MPI_File_read_at failed", mpi_code);
}
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(mpi_bufs_base, size_i, final_mtype, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code);
} /* end if */
else
/* Perform collective read operation */
if (MPI_SUCCESS != (mpi_code = MPI_File_read_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
final_mtype, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at_all failed", mpi_code);
} /* end if */
else {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
#endif
/* Perform independent read operation */
if (MPI_SUCCESS != (mpi_code = MPI_File_read_at(file->f, mpi_off, mpi_bufs_base, size_i,
final_mtype, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_read_at failed", mpi_code);
} /* end else */
/*
* Reset the file view when we used MPI derived types
*/
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code);
/* Only retrieve bytes read if this rank _actually_ participated in I/O */
if (!rank0_bcast || (rank0_bcast && file->mpi_rank == 0)) {
/* How many bytes were actually read? */
if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, final_mtype, &bytes_read))) {
if (rank0_bcast && file->mpi_rank == 0) {
/* If MPI_Get_elements(_x) fails for a rank 0 bcast strategy,
* push an error, but continue to participate in the following
* MPI_Bcast.
*/
bytes_read = -1;
HMPI_DONE_ERROR(FAIL, "MPI_Get_elements failed", mpi_code);
}
else
HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code);
}
} /* end if */
/* If the rank0-bcast feature was used, broadcast the # of bytes read to
* other ranks, which didn't perform any I/O.
*/
/* NOTE: This could be optimized further to be combined with the broadcast
* of the data. (QAK - 2019/1/2)
*/
if (rank0_bcast)
if (MPI_SUCCESS != MPI_Bcast(&bytes_read, 1, MPI_COUNT, 0, file->comm))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", 0);
/* Get the type's size */
if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(final_mtype, &type_size)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code);
/* Compute the actual number of bytes requested */
io_size = type_size * size_i;
/* Check for read failure */
if (bytes_read < 0 || bytes_read > io_size)
HGOTO_ERROR(H5E_IO, H5E_READERROR, FAIL, "file read failed");
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) mpi_off = %ld bytes_read = %lld type = %s\n", __func__,
file->mpi_rank, (long)mpi_off, (long long)bytes_read, H5FD__mem_t_to_str(type));
#endif
/*
* This gives us zeroes beyond end of physical MPI file.
*/
if ((n = (io_size - bytes_read)) > 0)
memset((char *)bufs[0] + bytes_read, 0, (size_t)n);
} /* end if */
else {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_r_flag)
fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
#endif
if (_file->base_addr > 0) {
/* Undo base address addition in internal routines before passing down to the mpio driver */
for (i = 0; i < count; i++) {
assert(offsets[i] >= _file->base_addr);
offsets[i] -= _file->base_addr;
}
}
if (H5FD_read_from_selection(_file, type, (uint32_t)count, mem_space_ids, file_space_ids, offsets,
element_sizes, bufs) < 0)
HGOTO_ERROR(H5E_VFL, H5E_READERROR, FAIL, "read vector from selection failed");
}
done:
/* Free the MPI buf and file types, if they were derived */
if (final_mtype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_mtype)))
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
if (final_ftype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_ftype)))
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
/* Cleanup dataspace arrays */
if (s_mem_spaces)
s_mem_spaces = H5MM_xfree(s_mem_spaces);
if (s_file_spaces)
s_file_spaces = H5MM_xfree(s_file_spaces);
if (!selection_was_sorted) {
free(s_mem_space_ids);
s_mem_space_ids = NULL;
free(s_file_space_ids);
s_file_space_ids = NULL;
free(s_offsets);
s_offsets = NULL;
free(s_element_sizes);
s_element_sizes = NULL;
free(s_bufs);
s_bufs = NULL;
}
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_read_selection() */
/*-------------------------------------------------------------------------
* Function: H5FD__mpio_write_selection
*
* Purpose: The behavior of this function depends on the value of
* the transfer mode obtained from the context.
*
* If the transfer mode is H5FD_MPIO_COLLECTIVE:
* --sort the selections
* --set mpi_bufs_base
* --build the MPI derived types
* --perform MPI_File_set_view()
* --perform MPI_File_write_at_all() or MPI_File_write_at()
* depending on whether this is a H5FD_MPIO_COLLECTIVE_IO
* --calculate and set the file's eof for the bytes written
*
* If this is not H5FD_MPIO_COLLECTIVE:
* --undo possible base address addition in internal routines
 *              --call H5FD_write_from_selection() to perform vector
 *                or scalar writes for the selections
*
* Return: Success: SUCCEED.
* Failure: FAIL.
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_write_selection(H5FD_t *_file, H5FD_mem_t type, hid_t H5_ATTR_UNUSED dxpl_id, size_t count,
hid_t mem_space_ids[], hid_t file_space_ids[], haddr_t offsets[],
size_t element_sizes[], const void *bufs[])
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
MPI_Offset mpi_off;
    MPI_Offset save_mpi_off; /* Used at the end of the routine for setting local_eof */
MPI_Status mpi_stat; /* Status from I/O operation */
int size_i;
H5FD_mpio_xfer_t xfer_mode; /* I/O transfer mode */
H5FD_mpio_collective_opt_t coll_opt_mode;
MPI_Datatype final_mtype; /* Final memory MPI datatype for all pieces with selection */
bool final_mtype_is_derived = false;
MPI_Datatype final_ftype; /* Final file MPI datatype for all pieces with selection */
bool final_ftype_is_derived = false;
hid_t *s_mem_space_ids = NULL;
hid_t *s_file_space_ids = NULL;
haddr_t *s_offsets = NULL;
size_t *s_element_sizes = NULL;
H5_flexible_const_ptr_t *s_bufs = NULL;
bool selection_was_sorted = true;
const void *mpi_bufs_base = NULL;
uint32_t i, j;
H5S_t **s_mem_spaces = NULL;
H5S_t **s_file_spaces = NULL;
haddr_t tmp_offset = 0;
char unused = 0; /* Unused, except for non-NULL pointer value */
H5_flexible_const_ptr_t mbb;
MPI_Count bytes_written;
MPI_Count type_size; /* MPI datatype used for I/O's size */
MPI_Count io_size; /* Actual number of bytes requested */
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
bool H5FD_mpio_debug_w_flag = (H5FD_mpio_debug_flags_s[(int)'w'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
assert((count == 0) || (mem_space_ids));
assert((count == 0) || (file_space_ids));
assert((count == 0) || (offsets));
assert((count == 0) || (element_sizes));
assert((count == 0) || (bufs));
/* Verify that the first elements of the element_sizes and bufs arrays are
* valid. */
assert((count == 0) || (element_sizes[0] != 0));
assert((count == 0) || (bufs[0] != NULL));
    /* Verify that no data is written while the file is between MPI_Barrier()s during a flush */
assert(!H5CX_get_mpi_file_flushing());
/* Portably initialize MPI status variable */
memset(&mpi_stat, 0, sizeof(MPI_Status));
/* Get the transfer mode from the API context */
if (H5CX_get_io_xfer_mode(&xfer_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O transfer mode");
if (xfer_mode == H5FD_MPIO_COLLECTIVE) {
if (count) {
if (H5FD_sort_selection_io_req(&selection_was_sorted, count, mem_space_ids, file_space_ids,
offsets, element_sizes, (H5_flexible_const_ptr_t *)bufs,
&s_mem_space_ids, &s_file_space_ids, &s_offsets, &s_element_sizes,
&s_bufs) < 0)
HGOTO_ERROR(H5E_VFL, H5E_BADVALUE, FAIL, "can't sort selection I/O request");
tmp_offset = s_offsets[0];
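            /* As in the read path, the lowest sorted file offset becomes the
             * MPI_File_set_view() displacement, with per-piece file
             * displacements computed relative to it. */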
if (NULL == (s_file_spaces = H5MM_malloc(count * sizeof(H5S_t *))))
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"memory allocation failed for file space list");
if (NULL == (s_mem_spaces = H5MM_malloc(count * sizeof(H5S_t *))))
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL,
"memory allocation failed for memory space list");
for (i = 0; i < count; i++) {
if (NULL ==
(s_file_spaces[i] = (H5S_t *)H5I_object_verify(s_file_space_ids[i], H5I_DATASPACE)))
                    HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, FAIL,
"can't retrieve file dataspace from ID");
if (NULL == (s_mem_spaces[i] = (H5S_t *)H5I_object_verify(s_mem_space_ids[i], H5I_DATASPACE)))
                    HGOTO_ERROR(H5E_VFL, H5E_BADTYPE, FAIL,
"can't retrieve memory dataspace from ID");
}
/* when we setup mpi_bufs[] below, all addresses are offsets from
* mpi_bufs_base.
*
* Since these offsets must all be positive, we must scan through
* s_bufs[] to find the smallest value, and choose that for
* mpi_bufs_base.
*/
j = 0; /* guess at the index of the smallest value of s_bufs[] */
if ((count > 1) && (s_bufs[1].cvp != NULL)) {
for (i = 1; i < count; i++)
if (s_bufs[i].cvp < s_bufs[j].cvp)
j = i;
}
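            /* If s_bufs[1] is NULL, the NULL-extension shorthand means every
             * piece reuses s_bufs[0], so index 0 already holds the smallest
             * address. */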
mpi_bufs_base = s_bufs[j].cvp;
mbb.cvp = mpi_bufs_base;
if (H5FD__selection_build_types(true, count, mbb, s_file_spaces, s_mem_spaces, s_offsets, s_bufs,
s_element_sizes, s_element_sizes, &final_ftype,
&final_ftype_is_derived, &final_mtype,
&final_mtype_is_derived) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "couldn't build type for MPI-IO");
/* We have a single, complicated MPI datatype for both memory & file */
size_i = 1;
}
else {
/* No chunks selected for this process */
size_i = 0;
mpi_bufs_base = &unused;
/* Set the MPI datatype */
final_ftype = MPI_BYTE;
final_mtype = MPI_BYTE;
}
        /* Convert the starting address to an MPI offset */
if (H5FD_mpi_haddr_to_MPIOff(tmp_offset, &mpi_off /*out*/) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't convert from haddr to MPI off");
/* To be used at the end of the routine for setting local_eof */
save_mpi_off = mpi_off;
/*
* Set the file view when we are using MPI derived types
*/
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, mpi_off, MPI_BYTE, final_ftype,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code);
/* Reset mpi_off to 0 since the view now starts at the data offset */
if (H5FD_mpi_haddr_to_MPIOff((haddr_t)0, &mpi_off) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "can't set MPI off to 0");
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) using MPIO collective mode\n", __func__, file->mpi_rank);
#endif
/* Get the collective_opt property to check whether the application wants to do IO individually. */
if (H5CX_get_mpio_coll_opt(&coll_opt_mode) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI-I/O collective_op property");
if (coll_opt_mode == H5FD_MPIO_COLLECTIVE_IO) {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) doing MPI collective IO\n", __func__, file->mpi_rank);
#endif
/* Perform collective write operation */
if (MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(file->f, mpi_off, mpi_bufs_base, size_i,
final_mtype, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code);
/* Do MPI_File_sync when needed by underlying ROMIO driver */
if (file->mpi_file_sync_required) {
if (MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code);
}
}
else {
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
#endif
/* Perform independent write operation */
if (MPI_SUCCESS != (mpi_code = MPI_File_write_at(file->f, mpi_off, mpi_bufs_base, size_i,
final_mtype, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at failed", mpi_code);
} /* end else */
/* Reset the file view when we used MPI derived types */
if (MPI_SUCCESS != (mpi_code = MPI_File_set_view(file->f, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
H5FD_mpi_native_g, file->info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code);
/* How many bytes were actually written */
if (MPI_SUCCESS != (mpi_code = MPI_Get_elements_x(&mpi_stat, final_mtype, &bytes_written)))
HMPI_GOTO_ERROR(FAIL, "MPI_Get_elements failed", mpi_code);
/* Get the type's size */
if (MPI_SUCCESS != (mpi_code = MPI_Type_size_x(final_mtype, &type_size)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_size failed", mpi_code);
/* Compute the actual number of bytes requested */
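        /* (size_i is 1 when this rank contributed a derived datatype and 0
         * when it had nothing to write, so io_size is either the full type
         * size or zero) */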
io_size = type_size * size_i;
/* Check for write failure */
if (bytes_written != io_size || bytes_written < 0)
HGOTO_ERROR(H5E_IO, H5E_WRITEERROR, FAIL, "file write failed");
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) mpi_off = %ld bytes_written = %lld type = %s\n", __func__,
file->mpi_rank, (long)mpi_off, (long long)bytes_written, H5FD__mem_t_to_str(type));
#endif
        /* Each process will keep track of its perceived EOF value locally, and
         * ultimately we will reduce this value to the maximum amongst all
         * processes, but until then keep the actual eof at HADDR_UNDEF just in
         * case something bad happens before that point. (Better to have a
         * value we know is wrong than one that is only potentially wrong.) */
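        /* Worked example (hypothetical numbers): a rank that writes 512 bytes
         * at offset 4096 advances its local_eof to 4608; the eventual global
         * EOF is the maximum of local_eof across all ranks */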
file->eof = HADDR_UNDEF;
if (bytes_written && (((haddr_t)bytes_written + (haddr_t)save_mpi_off) > file->local_eof))
file->local_eof = (haddr_t)save_mpi_off + (haddr_t)bytes_written;
}
else { /* Not H5FD_MPIO_COLLECTIVE */
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_w_flag)
fprintf(stderr, "%s: (%d) doing MPI independent IO\n", __func__, file->mpi_rank);
#endif
if (_file->base_addr > 0) {
/* Undo base address addition in internal routines before passing down to the mpio driver */
for (i = 0; i < count; i++) {
assert(offsets[i] >= _file->base_addr);
offsets[i] -= _file->base_addr;
}
}
if (H5FD_write_from_selection(_file, type, (uint32_t)count, mem_space_ids, file_space_ids, offsets,
element_sizes, bufs) < 0)
HGOTO_ERROR(H5E_VFL, H5E_WRITEERROR, FAIL, "write vector from selection failed");
}
done:
/* Free the MPI buf and file types, if they were derived */
if (final_mtype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_mtype)))
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
if (final_ftype_is_derived && MPI_SUCCESS != (mpi_code = MPI_Type_free(&final_ftype)))
HMPI_DONE_ERROR(FAIL, "MPI_Type_free failed", mpi_code);
/* Cleanup dataspace arrays */
if (s_mem_spaces)
s_mem_spaces = H5MM_xfree(s_mem_spaces);
if (s_file_spaces)
s_file_spaces = H5MM_xfree(s_file_spaces);
if (!selection_was_sorted) {
free(s_mem_space_ids);
s_mem_space_ids = NULL;
free(s_file_space_ids);
s_file_space_ids = NULL;
free(s_offsets);
s_offsets = NULL;
free(s_element_sizes);
s_element_sizes = NULL;
free(s_bufs);
s_bufs = NULL;
}
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving: ret_value = %d\n", __func__, file->mpi_rank, ret_value);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_write_selection() */

/*-------------------------------------------------------------------------
* Function: H5FD__mpio_flush
*
* Purpose: Makes sure that all data is on disk. This is collective.
*
* Return: SUCCEED/FAIL
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_flush(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, bool closing)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
int mpi_code; /* mpi return code */
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
/* Only sync the file if we are not going to immediately close it */
if (!closing)
if (MPI_SUCCESS != (mpi_code = MPI_File_sync(file->f)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_sync failed", mpi_code)
done:
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_flush() */

/*-------------------------------------------------------------------------
* Function: H5FD__mpio_truncate
*
 * Purpose:     Make certain the file's size matches its allocated size
*
* This is a little sticky in the mpio case, as it is not
* easy for us to track the current EOF by extracting it from
* write calls, since other ranks could have written to the
* file beyond the local EOF.
*
* Instead, we first check to see if the EOA has changed since
* the last call to this function. If it has, we call
* MPI_File_get_size() to determine the current EOF, and
* only call MPI_File_set_size() if this value disagrees
* with the current EOA.
*
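 *              For example (hypothetical numbers): if the EOA grew from
 *              1 MiB to 2 MiB since the last call while MPI_File_get_size()
 *              still reports 1 MiB, rank 0 broadcasts the observed size and
 *              all ranks collectively extend the file via MPI_File_set_size().
 *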
* Return: SUCCEED/FAIL
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_truncate(H5FD_t *_file, hid_t H5_ATTR_UNUSED dxpl_id, bool H5_ATTR_UNUSED closing)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
#ifdef H5FDmpio_DEBUG
bool H5FD_mpio_debug_t_flag = (H5FD_mpio_debug_flags_s[(int)'t'] && H5FD_MPIO_TRACE_THIS_RANK(file));
#endif
herr_t ret_value = SUCCEED;
FUNC_ENTER_PACKAGE
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Entering\n", __func__, file->mpi_rank);
#endif
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
if (!H5_addr_eq(file->eoa, file->last_eoa)) {
int mpi_code; /* mpi return code */
MPI_Offset size;
MPI_Offset needed_eof;
/* In principle, it is possible for the size returned by the
* call to MPI_File_get_size() to depend on whether writes from
* all processes have completed at the time process 0 makes the
* call.
*
* In practice, most (all?) truncate calls will come after a barrier
* and with no intervening writes to the file (with the possible
         * exception of superblock / superblock extension message updates).
*
* Check the "MPI file closing" flag in the API context to determine
* if we can skip the barrier.
*/
if (!H5CX_get_mpi_file_flushing())
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
        /* Only rank 0 will get the file size and broadcast it. */
if (0 == file->mpi_rank) {
/* If MPI_File_get_size fails, broadcast file size as -1 to signal error */
if (MPI_SUCCESS != (mpi_code = MPI_File_get_size(file->f, &size)))
size = (MPI_Offset)-1;
}
/* Broadcast file size */
if (MPI_SUCCESS != (mpi_code = MPI_Bcast(&size, (int)sizeof(MPI_Offset), MPI_BYTE, 0, file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Bcast failed", mpi_code)
        /* Note: when size is negative, MPI_File_get_size() failed on rank 0.
         * mpi_code has since been overwritten by the successful MPI_Bcast(),
         * so push a generic error instead of a stale MPI return code */
        if (size < 0)
            HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "MPI_File_get_size failed");
if (H5FD_mpi_haddr_to_MPIOff(file->eoa, &needed_eof) < 0)
HGOTO_ERROR(H5E_INTERNAL, H5E_BADRANGE, FAIL, "cannot convert from haddr_t to MPI_Offset");
/* EOA != EOF. Set EOF to EOA */
if (size != needed_eof) {
            /* Set the file's size to match the EOA */
if (MPI_SUCCESS != (mpi_code = MPI_File_set_size(file->f, needed_eof)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_size failed", mpi_code)
/* In general, we must wait until all processes have finished
* the truncate before any process can continue, since it is
* possible that a process would write at the end of the
* file, and this write would be discarded by the truncate.
*
* While this is an issue for a user initiated flush, it may
* not be an issue at file close. If so, we may be able to
* optimize out the following barrier in that case.
*/
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(file->comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
} /* end if */
/* Update the 'last' eoa value */
file->last_eoa = file->eoa;
} /* end if */
done:
#ifdef H5FDmpio_DEBUG
if (H5FD_mpio_debug_t_flag)
fprintf(stderr, "%s: (%d) Leaving\n", __func__, file->mpi_rank);
#endif
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_truncate() */

/*-------------------------------------------------------------------------
* Function: H5FD__mpio_delete
*
* Purpose: Delete a file
*
* Return: SUCCEED/FAIL
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_delete(const char *filename, hid_t fapl_id)
{
H5P_genplist_t *plist; /* Property list pointer */
MPI_Comm comm = MPI_COMM_NULL;
MPI_Info info = MPI_INFO_NULL;
int mpi_rank = INT_MAX;
int mpi_code; /* MPI return code */
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_PACKAGE
assert(filename);
if (NULL == (plist = H5P_object_verify(fapl_id, H5P_FILE_ACCESS)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list");
assert(H5FD_MPIO == H5P_peek_driver(plist));
if (H5FD_mpi_self_initialized) {
comm = MPI_COMM_WORLD;
}
else {
/* Get the MPI communicator and info from the fapl */
if (H5P_get(plist, H5F_ACS_MPI_PARAMS_INFO_NAME, &info) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI info object");
if (H5P_get(plist, H5F_ACS_MPI_PARAMS_COMM_NAME, &comm) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, FAIL, "can't get MPI communicator");
}
/* Get the MPI rank of this process */
if (MPI_SUCCESS != (mpi_code = MPI_Comm_rank(comm, &mpi_rank)))
HMPI_GOTO_ERROR(FAIL, "MPI_Comm_rank failed", mpi_code)
    /* Barrier to ensure all ranks are done with the file before rank 0 deletes it */
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
/* Delete the file */
if (mpi_rank == 0) {
/* If MPI_File_delete fails, push an error but
* still participate in the following MPI_Barrier
*/
if (MPI_SUCCESS != (mpi_code = MPI_File_delete(filename, info)))
HMPI_DONE_ERROR(FAIL, "MPI_File_delete failed", mpi_code)
}
/* Set up a barrier (don't want processes to run ahead of the delete) */
if (MPI_SUCCESS != (mpi_code = MPI_Barrier(comm)))
HMPI_GOTO_ERROR(FAIL, "MPI_Barrier failed", mpi_code)
done:
/* Free duplicated MPI Communicator and Info objects */
if (H5_mpi_comm_free(&comm) < 0)
HDONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI communicator");
if (H5_mpi_info_free(&info) < 0)
HDONE_ERROR(H5E_VFL, H5E_CANTFREE, FAIL, "unable to free MPI info object");
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_delete() */

/*-------------------------------------------------------------------------
* Function: H5FD__mpio_ctl
*
* Purpose: MPIO version of the ctl callback.
*
* The desired operation is specified by the op_code
* parameter.
*
* The flags parameter controls management of op_codes that
* are unknown to the callback
*
* The input and output parameters allow op_code specific
* input and output
*
* At present, the supported op codes are:
*
* H5FD_CTL_GET_MPI_COMMUNICATOR_OPCODE
* H5FD_CTL_GET_MPI_INFO_OPCODE
* H5FD_CTL_GET_MPI_RANK_OPCODE
* H5FD_CTL_GET_MPI_SIZE_OPCODE
* H5FD_CTL_GET_MPI_FILE_SYNC_OPCODE
*
* Note that these opcodes must be supported by all VFDs that
* support MPI.
*
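 *              Usage sketch (illustrative only, not taken from a caller in
 *              this file): retrieving the MPI rank through the public
 *              H5FDctl() routine:
 *
 *                  int  rank    = -1;
 *                  int *out_ptr = &rank;
 *
 *                  if (H5FDctl(file, H5FD_CTL_GET_MPI_RANK_OPCODE,
 *                              H5FD_CTL_FAIL_IF_UNKNOWN_FLAG,
 *                              NULL, (void **)&out_ptr) < 0)
 *                      ...handle the error...
 *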
* Return: Non-negative on success/Negative on failure
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD__mpio_ctl(H5FD_t *_file, uint64_t op_code, uint64_t flags, const void H5_ATTR_UNUSED *input,
void **output)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
herr_t ret_value = SUCCEED; /* Return value */
FUNC_ENTER_NOAPI(FAIL)
/* Sanity checks */
assert(file);
assert(H5FD_MPIO == file->pub.driver_id);
switch (op_code) {
case H5FD_CTL_GET_MPI_COMMUNICATOR_OPCODE:
assert(output);
assert(*output);
**((MPI_Comm **)output) = file->comm;
break;
case H5FD_CTL_GET_MPI_INFO_OPCODE:
assert(output);
assert(*output);
**((MPI_Info **)output) = file->info;
break;
case H5FD_CTL_GET_MPI_RANK_OPCODE:
assert(output);
assert(*output);
**((int **)output) = file->mpi_rank;
break;
case H5FD_CTL_GET_MPI_SIZE_OPCODE:
assert(output);
assert(*output);
**((int **)output) = file->mpi_size;
break;
case H5FD_CTL_GET_MPI_FILE_SYNC_OPCODE:
assert(output);
assert(*output);
**((bool **)output) = file->mpi_file_sync_required;
break;
default: /* unknown op code */
if (flags & H5FD_CTL_FAIL_IF_UNKNOWN_FLAG) {
HGOTO_ERROR(H5E_VFL, H5E_FCNTL, FAIL, "unknown op_code and fail if unknown");
}
break;
}
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD__mpio_ctl() */

#endif /* H5_HAVE_PARALLEL */