[svn-r12090] Purpose:

New APIs to add for collective chunk IO

Description:
Three new APIs
H5Pset_dxpl_mpio_chunk_opt_ratio
H5Pset_dxpl_mpio_chunk_opt_num
H5Pset_dxpl_mpio_chunk_opt

for optional optimization choices from users.
Solution:
Haven't added tests yet, won't affect other parts of the library.
Will add tests after urgent investigations of memory leaking problems from NASA Aura team.
Platforms tested:
heping: both parallel and sequential
shanti
Misc. update:
This commit is contained in:
MuQun Yang 2006-03-14 13:29:35 -05:00
parent 838e799710
commit a4c816eb75
6 changed files with 222 additions and 24 deletions

View File

@ -195,7 +195,10 @@ H5D_init_interface(void)
void *def_vfl_info = H5D_XFER_VFL_INFO_DEF;
size_t def_hyp_vec_size = H5D_XFER_HYPER_VECTOR_SIZE_DEF;
#ifdef H5_HAVE_PARALLEL
H5FD_mpio_xfer_t def_io_xfer_mode = H5D_XFER_IO_XFER_MODE_DEF;
H5FD_mpio_xfer_t def_io_xfer_mode = H5D_XFER_IO_XFER_MODE_DEF;
H5FD_mpio_chunk_opt_t def_mpio_chunk_opt_mode = H5D_XFER_MPIO_CHUNK_OPT_HARD_DEF;
unsigned def_mpio_chunk_opt_num = H5D_XFER_MPIO_CHUNK_OPT_NUM_DEF;
unsigned def_mpio_chunk_opt_ratio = H5D_XFER_MPIO_CHUNK_OPT_RATIO_DEF;
#endif /* H5_HAVE_PARALLEL */
H5Z_EDC_t enable_edc = H5D_XFER_EDC_DEF;
H5Z_cb_t filter_cb = H5D_XFER_FILTER_CB_DEF;
@ -298,6 +301,12 @@ H5D_init_interface(void)
/* Register the I/O transfer mode property */
if(H5P_register(xfer_pclass,H5D_XFER_IO_XFER_MODE_NAME,H5D_XFER_IO_XFER_MODE_SIZE,&def_io_xfer_mode,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
if(H5P_register(xfer_pclass,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME,H5D_XFER_MPIO_CHUNK_OPT_HARD_SIZE,&def_mpio_chunk_opt_mode,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
if(H5P_register(xfer_pclass,H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME,H5D_XFER_MPIO_CHUNK_OPT_NUM_SIZE,&def_mpio_chunk_opt_num,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
if(H5P_register(xfer_pclass,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME,H5D_XFER_MPIO_CHUNK_OPT_RATIO_SIZE,&def_mpio_chunk_opt_ratio,NULL,NULL,NULL,NULL,NULL,NULL,NULL)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTINSERT, FAIL, "can't insert property into class")
#endif /* H5_HAVE_PARALLEL */
/* Register the EDC property */

View File

@ -59,7 +59,6 @@
If the average number of chunks per process is greater than this value,
the library will create an MPI derived datatype to link all chunks to do collective IO.
The user can set this value through an API. */
#define H5D_ONE_LINK_CHUNK_IO_THRESHOLD 0 /* always set this option with value 0*/
/* Macros to represent options on how to obtain chunk address for one linked-chunk IO case */
#define H5D_OBTAIN_ONE_CHUNK_ADDR_IND 0
@ -75,7 +74,6 @@
If the average number of processes per chunk is greater than the default value,
collective IO is done for this chunk.
*/
#define H5D_MULTI_CHUNK_IO_COL_THRESHOLD 50
/* Macros to represent different IO modes(NONE, Independent or collective)for multiple chunk IO case */
#define H5D_CHUNK_IO_MODE_IND 0
@ -640,34 +638,39 @@ H5D_chunk_collective_io(H5D_io_info_t *io_info,fm_map *fm,const void *buf, hbool
int io_option = H5D_MULTI_CHUNK_IO;
int sum_chunk,mpi_size;
int one_link_chunk_io_threshold;
unsigned one_link_chunk_io_threshold;
H5P_genplist_t *plist;
H5FD_mpio_chunk_opt_t chunk_opt_mode;
herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI_NOINIT(H5D_chunk_collective_io)
assert (IS_H5FD_MPIO(io_info->dset->oloc.file));
/* Obtain the data transfer properties */
if(NULL == (plist = H5I_object(io_info->dxpl_id)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
chunk_opt_mode=(H5FD_mpio_chunk_opt_t)H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME);
if(chunk_opt_mode != H5FD_MPIO_OPT_ONE_IO) {
if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)
HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes");
if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
if(H5D_mpio_get_sum_chunk(io_info,fm,&sum_chunk)<0)
HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the total chunk number of all processes");
if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
one_link_chunk_io_threshold = H5D_ONE_LINK_CHUNK_IO_THRESHOLD;/*This should be replaced by the user inputting value from API. */
if(NULL == (plist = H5I_object(io_info->dxpl_id)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
/* step 1: choose an IO option */
/* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */
if(sum_chunk/mpi_size >= one_link_chunk_io_threshold) io_option = H5D_ONE_LINK_CHUNK_IO;
one_link_chunk_io_threshold =H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME);
/* If this MPI-IO package doesn't support collective IO when no IO is done for one or more processes,
use MULTIPLE CHUNK IO */
/*
#ifndef H5_MPI_SPECIAL_COLLECTIVE_IO_WORKS
if(H5D_mpio_get_min_chunk(io_info,fm,&min_chunk)<0)
HGOTO_ERROR(H5E_DATASPACE, H5E_CANTSWAP, FAIL, "unable to obtain the min chunk number of all processes");
if(min_chunk == 0) io_option = H5D_MULTI_CHUNK_IO;
#endif
*/
/* step 1: choose an IO option */
/* If the average number of chunk per process is greater than a threshold, we will do one link chunked IO. */
if(sum_chunk/mpi_size >= one_link_chunk_io_threshold) io_option = H5D_ONE_LINK_CHUNK_IO;
}
else
io_option = H5D_ONE_LINK_CHUNK_IO;
#ifndef H5_MPI_COMPLEX_DERIVED_DATATYPE_WORKS
if(io_option == H5D_ONE_LINK_CHUNK_IO ) io_option = H5D_MULTI_CHUNK_IO ;/* We can not do this with one chunk IO. */
#endif
@ -1557,7 +1560,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
int total_chunks;
hsize_t ori_total_chunks;
int percent_nproc_per_chunk,threshold_nproc_per_chunk;
unsigned percent_nproc_per_chunk,threshold_nproc_per_chunk;
uint8_t* io_mode_info;
uint8_t* recv_io_mode_info;
uint8_t* mergebuf;
@ -1577,7 +1580,7 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
MPI_Comm comm;
int root;
int mpi_code;
int multi_chunk_io_col_threshold;
H5P_genplist_t *plist;
int mem_cleanup = 0,
mpi_type_cleanup = 0;
@ -1594,8 +1597,11 @@ H5D_obtain_mpio_mode(H5D_io_info_t* io_info,
HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi rank");
if((mpi_size = H5F_mpi_get_size(io_info->dset->oloc.file))<0)
HGOTO_ERROR (H5E_IO, H5E_MPI, FAIL, "unable to obtain mpi size");
multi_chunk_io_col_threshold = H5D_MULTI_CHUNK_IO_COL_THRESHOLD; /* May replace by user-input */
percent_nproc_per_chunk = multi_chunk_io_col_threshold;/* For example, above 50%, do collective IO */
/* Obtain the data transfer properties */
if(NULL == (plist = H5I_object(io_info->dxpl_id)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, "not a file access property list")
percent_nproc_per_chunk=H5P_peek_unsigned(plist,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME);
threshold_nproc_per_chunk = mpi_size * percent_nproc_per_chunk/100;
/* Allocate memory */

View File

@ -141,6 +141,18 @@
#define H5D_XFER_IO_XFER_MODE_NAME "io_xfer_mode"
#define H5D_XFER_IO_XFER_MODE_SIZE sizeof(H5FD_mpio_xfer_t)
#define H5D_XFER_IO_XFER_MODE_DEF H5FD_MPIO_INDEPENDENT
/* Definitions for optimization of MPI-IO transfer mode property */
#define H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME "mpio_chunk_opt_hard"
#define H5D_XFER_MPIO_CHUNK_OPT_HARD_SIZE sizeof(H5FD_mpio_chunk_opt_t)
#define H5D_XFER_MPIO_CHUNK_OPT_HARD_DEF H5FD_MPIO_OPT_IGNORE
#define H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME "mpio_chunk_opt_num"
#define H5D_XFER_MPIO_CHUNK_OPT_NUM_SIZE sizeof(unsigned)
#define H5D_XFER_MPIO_CHUNK_OPT_NUM_DEF H5D_ONE_LINK_CHUNK_IO_THRESHOLD
#define H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME "mpio_chunk_opt_ratio"
#define H5D_XFER_MPIO_CHUNK_OPT_RATIO_SIZE sizeof(unsigned)
#define H5D_XFER_MPIO_CHUNK_OPT_RATIO_DEF H5D_MULTI_CHUNK_IO_COL_THRESHOLD
/* Definitions for EDC property */
#define H5D_XFER_EDC_NAME "err_detect"
#define H5D_XFER_EDC_SIZE sizeof(H5Z_EDC_t)

View File

@ -21,12 +21,34 @@
#ifndef H5FDmpi_H
#define H5FDmpi_H
/***** Macros for One linked collective IO case. *****/
/* The default value to do one linked collective IO for all chunks.
If the average number of chunks per process is greater than this value,
the library will create an MPI derived datatype to link all chunks to do collective IO.
The user can set this value through an API. */
#define H5D_ONE_LINK_CHUNK_IO_THRESHOLD 0
/***** Macros for multi-chunk collective IO case. *****/
/* The default value of the threshold to do collective IO for this chunk.
If the average number of processes per chunk is greater than the default value,
collective IO is done for this chunk.
*/
#define H5D_MULTI_CHUNK_IO_COL_THRESHOLD 50
/* Type of I/O for data transfer properties */
typedef enum H5FD_mpio_xfer_t {
H5FD_MPIO_INDEPENDENT = 0, /*zero is the default*/
H5FD_MPIO_COLLECTIVE
} H5FD_mpio_xfer_t;
/* Type of I/O for data transfer properties */
typedef enum H5FD_mpio_chunk_opt_t {
H5FD_MPIO_OPT_IGNORE = 0,
H5FD_MPIO_OPT_ONE_IO, /*zero is the default*/
H5FD_MPIO_OPT_MULTI_IO
} H5FD_mpio_chunk_opt_t;
#ifdef H5_HAVE_PARALLEL
/* Sub-class the H5FD_class_t to add more specific functions for MPI-based VFDs */

View File

@ -526,6 +526,152 @@ H5Pget_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t *xfer_mode/*out*/)
if (H5P_get(plist,H5D_XFER_IO_XFER_MODE_NAME,xfer_mode)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to get value")
done:
FUNC_LEAVE_API(ret_value)
}
/*-------------------------------------------------------------------------
* Function: H5Pset_dxpl_mpio_chunk_opt
Purpose:
To set a flag to choose linked chunk IO or multi-chunk IO without
involving decision-making inside HDF5
Description:
The library will do linked chunk IO or multi-chunk IO without
involving communications for decision-making process.
The library won't behave as it asks for only when we find
that the low-level MPI-IO package doesn't support this.
Parameters:
hid_t dxpl_id in: Data transfer property list identifier
H5FD_mpio_chunk_opt_t in: The optimization flag for linked chunk IO
or multi-chunk IO.
Returns:
Returns a non-negative value if successful. Otherwise returns a negative value.
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id, H5FD_mpio_chunk_opt_t opt_mode)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value;
FUNC_ENTER_API(H5Pset_dxpl_mpio_chunk_opt, FAIL)
/* H5TRACE2("e","iDt",dxpl_id,xfer_mode);*/
if(dxpl_id==H5P_DEFAULT)
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
/* Check arguments */
if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
/* Set the transfer mode */
if (H5P_set(plist,H5D_XFER_MPIO_CHUNK_OPT_HARD_NAME,&opt_mode)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
/* Initialize driver-specific properties */
ret_value= H5P_set_driver(plist, H5FD_MPIO, NULL);
done:
FUNC_LEAVE_API(ret_value)
}
/*-------------------------------------------------------------------------
* Function: H5Pset_dxpl_mpio_chunk_opt_num
Purpose:
To set a threshold for doing linked chunk IO
Description:
If the number is greater than the threshold set by the user,
the library will do linked chunk IO; otherwise, IO will be done for every chunk.
Parameters:
hid_t dxpl_id in: Data transfer property list identifier
unsigned num_proc_per_chunk in: the threshold of the average number of chunks selected by each process
Returns:
Returns a non-negative value if successful. Otherwise returns a negative value.
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id, unsigned num_chunk_per_proc)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value;
FUNC_ENTER_API(H5Pset_dxpl_mpio_chunk_opt_num, FAIL)
/* H5TRACE2("e","iDt",dxpl_id,xfer_mode);*/
if(dxpl_id==H5P_DEFAULT)
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
/* Check arguments */
if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
/* Set the transfer mode */
if (H5P_set(plist,H5D_XFER_MPIO_CHUNK_OPT_NUM_NAME,&num_chunk_per_proc)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
/* Initialize driver-specific properties */
ret_value= H5P_set_driver(plist, H5FD_MPIO, NULL);
done:
FUNC_LEAVE_API(ret_value)
}
/*-------------------------------------------------------------------------
* Function: H5Pset_dxpl_mpio_chunk_opt_ratio
Purpose:
To set a threshold for doing collective IO for each chunk
Description:
The library will calculate the percentage of the number of process holding selections at each chunk. If that percentage of number of process in the individual chunk is greater than the threshold set by the user, the library will do collective chunk IO for this chunk; otherwise, independent IO will be done for this chunk.
Parameters:
hid_t dxpl_id
in: Data transfer property list identifier
unsigned percent_num_proc_per_chunk
in: the threshold of the percentage of the number of process holding selections per chunk
Returns:
Returns a non-negative value if successful. Otherwise returns a negative value.
*
*-------------------------------------------------------------------------
*/
herr_t
H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id, unsigned percent_num_proc_per_chunk)
{
H5P_genplist_t *plist; /* Property list pointer */
herr_t ret_value;
FUNC_ENTER_API(H5Pset_dxpl_mpio_chunk_opt_ratio, FAIL)
/* H5TRACE2("e","iDt",dxpl_id,xfer_mode);*/
if(dxpl_id==H5P_DEFAULT)
HGOTO_ERROR(H5E_PLIST, H5E_BADVALUE, FAIL, "can't set values in default property list")
/* Check arguments */
if(NULL == (plist = H5P_object_verify(dxpl_id,H5P_DATASET_XFER)))
HGOTO_ERROR(H5E_PLIST, H5E_BADTYPE, FAIL, "not a dxpl")
/* Set the transfer mode */
if (H5P_set(plist,H5D_XFER_MPIO_CHUNK_OPT_RATIO_NAME,&percent_num_proc_per_chunk)<0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "unable to set value")
/* Initialize driver-specific properties */
ret_value= H5P_set_driver(plist, H5FD_MPIO, NULL);
done:
FUNC_LEAVE_API(ret_value)
}

View File

@ -51,6 +51,9 @@ H5_DLL herr_t H5Pget_fapl_mpio(hid_t fapl_id, MPI_Comm *comm/*out*/,
MPI_Info *info/*out*/);
H5_DLL herr_t H5Pset_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t xfer_mode);
H5_DLL herr_t H5Pget_dxpl_mpio(hid_t dxpl_id, H5FD_mpio_xfer_t *xfer_mode/*out*/);
H5_DLL herr_t H5Pset_dxpl_mpio_chunk_opt(hid_t dxpl_id, H5FD_mpio_chunk_opt_t opt_mode);
H5_DLL herr_t H5Pset_dxpl_mpio_chunk_opt_num(hid_t dxpl_id, unsigned num_chunk_per_proc);
H5_DLL herr_t H5Pset_dxpl_mpio_chunk_opt_ratio(hid_t dxpl_id, unsigned percent_num_proc_per_chunk);
#ifdef __cplusplus
}
#endif