Checkin of fix for CGNS bug (https://jira.hdfgroup.org/browse/HDFFV-10055).

    Briefly, in H5C__collective_write() in H5Cmpio.c,
the metadata cache attempts to perform a collective
write of metadata cache entries.

    This worked fine as long as all processes had at
least one entry to write.

    However, when a process has no entries to write,
the function still participates in the collective
operation by calling MPI_File_set_view(),
MPI_File_write_at_all(), and then MPI_File_set_view()
again, to match the calls in H5FD_mpio_write().
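
    For reference, a minimal sketch of this zero-data
participation pattern (not taken from the HDF5 source; the
file handle and info value are placeholders for whatever the
writing ranks use):

    #include <mpi.h>

    static int write_nothing_collectively(MPI_File fh, MPI_Info info)
    {
        MPI_Status mpi_stat;
        int        mpi_code;

        /* 1st view change -- every rank must make the matching call */
        if(MPI_SUCCESS != (mpi_code = MPI_File_set_view(fh, (MPI_Offset)0,
                MPI_BYTE, MPI_BYTE, "native", info)))
            return mpi_code;

        /* zero-count collective write: participate, transfer nothing */
        if(MPI_SUCCESS != (mpi_code = MPI_File_write_at_all(fh, (MPI_Offset)0,
                NULL, 0, MPI_BYTE, &mpi_stat)))
            return mpi_code;

        /* 2nd view change (reset) -- again must match the other ranks */
        return MPI_File_set_view(fh, (MPI_Offset)0, MPI_BYTE, MPI_BYTE,
                                 "native", info);
    }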

   After pull request 183, the CGNS test benchmark_hdf5
started failing.  On investigation, I determined that
the failure occurred in the first call to MPI_File_set_view()
in the "no data to write" path through H5C__collective_write().
Note that pull request 183 did not create the problem;
it only exposed it.  The bug can be observed after pull
request 182 if one executes the CGNS program
src/ptests/benchmark_hdf5 with 90 processes.

    The problem appears to have been that the calls to
MPI_File_set_view() in H5C__collective_write() and
H5FD_mpio_write() were using different values for the
info parameter.  I patched the problem by adding an
MPI-specific VFD call that retrieves the MPI_Info used
in H5FD_mpio_write(), so that the MPI_File_set_view()
calls in H5C__collective_write() can use the same value.
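
    Roughly, the new VFD call has the following shape (a
simplified sketch with stand-in demo_* types, not the actual
HDF5 declarations; see the diffs below for the real ones): the
MPI VFD class gains a get_mpi_info callback, and a wrapper
hands the caller a pointer to the MPI_Info the driver holds.

    #include <mpi.h>
    #include <stddef.h>

    typedef struct demo_file_t demo_file_t;

    /* MPI VFD class: new get_mpi_info member alongside
     * get_rank / get_size / get_comm
     */
    typedef struct demo_class_mpi_t {
        int (*get_mpi_info)(demo_file_t *file, void **mpi_info);
    } demo_class_mpi_t;

    struct demo_file_t {
        const demo_class_mpi_t *cls;
        MPI_Info                info; /* info the driver uses for its own I/O */
    };

    /* driver-level callback: hand back a pointer to the stored info */
    static int demo_mpio_get_info(demo_file_t *file, void **mpi_info)
    {
        if(NULL == mpi_info)
            return -1;
        *mpi_info = &(file->info);
        return 0;
    }

    /* wrapper the metadata cache code calls before MPI_File_set_view() */
    static int demo_get_mpi_info(demo_file_t *file, MPI_Info **info_p)
    {
        return (file->cls->get_mpi_info)(file, (void **)info_p);
    }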

    Tested serial & parallel, debug & production on
Jelly.
mainzer 2017-04-06 18:11:21 -05:00
parent 60167ae875
commit 94c34773ce
8 changed files with 298 additions and 26 deletions

View File

@@ -30,12 +30,16 @@
#include "H5Cmodule.h" /* This source code file is part of the H5C module */
#define H5AC_FRIEND
/***********/
/* Headers */
/***********/
#include "H5private.h" /* Generic Functions */
#include "H5ACprivate.h" /* Metadata Cache */
#include "H5ACpkg.h" /* Metadata Cache */
#include "H5Cpkg.h" /* Cache */
#include "H5Eprivate.h" /* Error Handling */
@@ -338,6 +342,112 @@ H5C_dump_cache_skip_list(H5C_t * cache_ptr, char * calling_fcn)
} /* H5C_dump_cache_skip_list() */
#endif /* NDEBUG */
/*-------------------------------------------------------------------------
* Function: H5C_dump_coll_write_list
*
* Purpose: Debugging routine that prints a summary of the contents of
* the collective write skip list used by the metadata cache
* in the parallel case to maintain a list of entries to write
* collectively at a sync point.
*
* Return: Non-negative on success/Negative on failure
*
* Programmer: John Mainzer
* 4/1/17
*
*-------------------------------------------------------------------------
*/
#ifdef H5_HAVE_PARALLEL
#ifndef NDEBUG
herr_t
H5C_dump_coll_write_list(H5C_t * cache_ptr, char * calling_fcn)
{
herr_t ret_value = SUCCEED; /* Return value */
int i;
int list_len;
H5AC_aux_t * aux_ptr = NULL;
H5C_cache_entry_t * entry_ptr = NULL;
H5SL_node_t * node_ptr = NULL;
FUNC_ENTER_NOAPI_NOERR
HDassert(cache_ptr != NULL);
HDassert(cache_ptr->magic == H5C__H5C_T_MAGIC);
HDassert(cache_ptr->aux_ptr);
aux_ptr = (H5AC_aux_t *)cache_ptr->aux_ptr;
HDassert(aux_ptr->magic == H5AC__H5AC_AUX_T_MAGIC);
HDassert(calling_fcn != NULL);
list_len = (int)H5SL_count(cache_ptr->coll_write_list);
HDfprintf(stdout, "\n\nDumping MDC coll write list from %d:%s.\n",
aux_ptr->mpi_rank, calling_fcn);
HDfprintf(stdout, " slist len = %u.\n", cache_ptr->slist_len);
if ( list_len > 0 ) {
/* scan the collective write list generating the desired output */
HDfprintf(stdout,
"Num: Addr: Len: Prot/Pind: Dirty: Type:\n");
i = 0;
node_ptr = H5SL_first(cache_ptr->coll_write_list);
if ( node_ptr != NULL )
entry_ptr = (H5C_cache_entry_t *)H5SL_item(node_ptr);
else
entry_ptr = NULL;
while ( entry_ptr != NULL ) {
HDassert(entry_ptr->magic == H5C__H5C_CACHE_ENTRY_T_MAGIC);
HDfprintf(stdout,
"%s%d 0x%016llx %4lld %d/%d %d %s\n",
cache_ptr->prefix, i,
(long long)(entry_ptr->addr),
(long long)(entry_ptr->size),
(int)(entry_ptr->is_protected),
(int)(entry_ptr->is_pinned),
(int)(entry_ptr->is_dirty),
entry_ptr->type->name);
/* HDfprintf(stdout, " node_ptr = 0x%llx, item = %p\n",
(unsigned long long)node_ptr,
H5SL_item(node_ptr));
*/
node_ptr = H5SL_next(node_ptr);
if ( node_ptr != NULL )
entry_ptr = (H5C_cache_entry_t *)H5SL_item(node_ptr);
else
entry_ptr = NULL;
i++;
} /* end while */
} /* end if */
HDfprintf(stdout, "\n\n");
FUNC_LEAVE_NOAPI(ret_value)
} /* H5C_dump_coll_write_list() */
#endif /* NDEBUG */
#endif /* H5_HAVE_PARALLEL */
/*-------------------------------------------------------------------------
* Function: H5C_set_prefix

View File

@@ -950,12 +950,15 @@ H5C__collective_write(H5F_t *f, hid_t dxpl_id)
/* Get original transfer mode */
if(NULL == (plist = (H5P_genplist_t *)H5I_object(dxpl_id)))
HGOTO_ERROR(H5E_ARGS, H5E_BADTYPE, FAIL, \
"not a data transfer property list")
if(H5P_get(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, "can't set MPI-I/O property")
/* Get number of entries in collective write list */
count = (int)H5SL_count(cache_ptr->coll_write_list);
if(count > 0) {
H5FD_mpio_xfer_t xfer_mode = H5FD_MPIO_COLLECTIVE;
H5SL_node_t *node;
@@ -964,21 +967,34 @@ H5C__collective_write(H5F_t *f, hid_t dxpl_id)
int i;
if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &xfer_mode) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, \
"can't set MPI-I/O property")
/* Allocate arrays */
if ( NULL == (length_array =
(int *)H5MM_malloc((size_t)count * sizeof(int))) )
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, \
"memory allocation failed for collective write table length array")
if ( NULL == (buf_array =
(MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint))) )
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, \
"memory allocation failed for collective buf table length array")
if(NULL == (offset_array =
(MPI_Aint *)H5MM_malloc((size_t)count * sizeof(MPI_Aint))) )
HGOTO_ERROR(H5E_RESOURCE, H5E_CANTALLOC, FAIL, \
"memory allocation failed for collective offset table length array")
/* Fill arrays */
node = H5SL_first(cache_ptr->coll_write_list);
HDassert(node);
if(NULL == (entry_ptr = (H5C_cache_entry_t *)H5SL_item(node)))
HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, \
"can't retrieve skip list item")
/* Set up initial array position & buffer base address */
length_array[0] = (int)entry_ptr->size;
@@ -989,8 +1005,10 @@ H5C__collective_write(H5F_t *f, hid_t dxpl_id)
node = H5SL_next(node);
i = 1;
while(node) {
if(NULL == (entry_ptr = (H5C_cache_entry_t *)H5SL_item(node)))
HGOTO_ERROR(H5E_CACHE, H5E_NOTFOUND, FAIL, \
"can't retrieve skip list item")
/* Set up array position */
length_array[i] = (int)entry_ptr->size;
@@ -1003,48 +1021,85 @@ H5C__collective_write(H5F_t *f, hid_t dxpl_id)
} /* end while */
/* Create memory MPI type */
if(MPI_SUCCESS != (mpi_code =
MPI_Type_create_hindexed(count, length_array,
buf_array, MPI_BYTE,
&btype)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
btype_created = TRUE;
if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&btype)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
/* Create file MPI type */
if(MPI_SUCCESS != (mpi_code =
MPI_Type_create_hindexed(count, length_array,
offset_array, MPI_BYTE,
&ftype)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_create_hindexed failed", mpi_code)
ftype_created = TRUE;
if(MPI_SUCCESS != (mpi_code = MPI_Type_commit(&ftype)))
HMPI_GOTO_ERROR(FAIL, "MPI_Type_commit failed", mpi_code)
/* Pass buf type, file type to the file driver */
if(H5FD_mpi_setup_collective(dxpl_id, &btype, &ftype) < 0)
HGOTO_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, \
"can't set MPI-I/O properties")
/* Write data */
if(H5F_block_write(f, H5FD_MEM_DEFAULT, (haddr_t)0,
(size_t)1, dxpl_id, base_buf) < 0)
HGOTO_ERROR(H5E_CACHE, H5E_CANTFLUSH, FAIL, \
"unable to write entries collectively")
} /* end if */
else {
MPI_Status mpi_stat;
MPI_File *mpi_fh_p;
MPI_File mpi_fh;
MPI_Info *info_p;
MPI_Info info;
if(H5F_get_mpi_handle(f, (MPI_File **)&mpi_fh_p) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, \
"can't get mpi file handle")
mpi_fh = *(MPI_File*)mpi_fh_p;
if (H5F_get_mpi_info(f, &info_p) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, \
"can't get mpi file info")
info = *info_p;
/* just to match up with the 1st MPI_File_set_view from
* H5FD_mpio_write()
*/
if(MPI_SUCCESS != (mpi_code =
MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE,
MPI_BYTE, "native",
info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
/* just to match up with MPI_File_write_at_all from H5FD_mpio_write() */
HDmemset(&mpi_stat, 0, sizeof(MPI_Status));
if(MPI_SUCCESS != (mpi_code =
MPI_File_write_at_all(mpi_fh, (MPI_Offset)0,
NULL, 0, MPI_BYTE, &mpi_stat)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_write_at_all failed", mpi_code)
/* just to match up with the 2nd MPI_File_set_view (reset) in
* H5FD_mpio_write()
*/
if(MPI_SUCCESS != (mpi_code =
MPI_File_set_view(mpi_fh, (MPI_Offset)0, MPI_BYTE,
MPI_BYTE, "native",
info)))
HMPI_GOTO_ERROR(FAIL, "MPI_File_set_view failed", mpi_code)
} /* end else */
done:
@@ -1063,7 +1118,8 @@ done:
if(orig_xfer_mode != H5FD_MPIO_COLLECTIVE) {
HDassert(plist);
if(H5P_set(plist, H5D_XFER_IO_XFER_MODE_NAME, &orig_xfer_mode) < 0)
HDONE_ERROR(H5E_PLIST, H5E_CANTSET, FAIL, \
"can't set MPI-I/O property")
} /* end if */
FUNC_LEAVE_NOAPI(ret_value);

View File

@@ -2340,9 +2340,12 @@ H5_DLL herr_t H5C_dump_cache_LRU(H5C_t *cache_ptr, const char *cache_name);
H5_DLL hbool_t H5C_get_serialization_in_progress(const H5C_t *cache_ptr);
H5_DLL hbool_t H5C_cache_is_clean(const H5C_t *cache_ptr, H5C_ring_t inner_ring);
H5_DLL herr_t H5C_dump_cache_skip_list(H5C_t *cache_ptr, char *calling_fcn);
#ifdef H5_HAVE_PARALLEL
H5_DLL herr_t H5C_dump_coll_write_list(H5C_t * cache_ptr, char * calling_fcn);
#endif /* H5_HAVE_PARALLEL */
H5_DLL herr_t H5C_get_entry_ptr_from_addr(H5C_t *cache_ptr, haddr_t addr,
void **entry_ptr_ptr);
H5_DLL herr_t H5C_flush_dependency_exists(H5C_t *cache_ptr, haddr_t parent_addr,
haddr_t child_addr, hbool_t *fd_exists_ptr);
H5_DLL herr_t H5C_verify_entry_type(H5C_t *cache_ptr, haddr_t addr,
const H5C_class_t *expected_type, hbool_t *in_cache_ptr,

View File

@@ -146,6 +146,45 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_mpi_get_comm() */
/*-------------------------------------------------------------------------
* Function: H5FD_get_mpi_info
*
* Purpose: Retrieves the file's mpi info
*
* Return: Success: SUCCEED
*
* Failure: Negative
*
* Programmer: John Mainzer
* 4/4/17
*
* Modifications:
*
*-------------------------------------------------------------------------
*/
herr_t
H5FD_get_mpi_info(H5FD_t *file, void** mpi_info)
{
const H5FD_class_mpi_t *cls;
herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI_NOINIT
HDassert(file);
cls = (const H5FD_class_mpi_t *)(file->cls);
HDassert(cls);
HDassert(cls->get_mpi_info); /* All MPI drivers must implement this */
/* Dispatch to driver */
if ((ret_value=(cls->get_mpi_info)(file, mpi_info)) < 0)
HGOTO_ERROR(H5E_VFL, H5E_CANTGET, MPI_COMM_NULL, \
"driver get_mpi_info request failed")
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5FD_get_mpi_info() */
/*-------------------------------------------------------------------------
* Function: H5FD_mpi_MPIOff_to_haddr

View File

@@ -95,6 +95,7 @@ static herr_t H5FD_mpio_truncate(H5FD_t *_file, hid_t dxpl_id, hbool_t closing);
static int H5FD_mpio_mpi_rank(const H5FD_t *_file);
static int H5FD_mpio_mpi_size(const H5FD_t *_file);
static MPI_Comm H5FD_mpio_communicator(const H5FD_t *_file);
static herr_t H5FD_mpio_get_info(H5FD_t *_file, void** mpi_info);
/* The MPIO file driver information */
static const H5FD_class_mpi_t H5FD_mpio_g = {
@@ -134,7 +135,8 @@ static const H5FD_class_mpi_t H5FD_mpio_g = {
}, /* End of superclass information */
H5FD_mpio_mpi_rank, /*get_rank */
H5FD_mpio_mpi_size, /*get_size */
H5FD_mpio_communicator, /*get_comm */
H5FD_mpio_get_info /*get_info */
};
#ifdef H5FDmpio_DEBUG
@@ -1306,6 +1308,39 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
}
/*-------------------------------------------------------------------------
* Function: H5FD_mpio_get_info
*
* Purpose: Returns the file info of MPIO file driver.
*
* Returns: Non-negative if succeed or negative if fails.
*
* Programmer: John Mainzer
* April 4, 2017
*
* Modifications:
*
*-------------------------------------------------------------------------
*/
static herr_t
H5FD_mpio_get_info(H5FD_t *_file, void** mpi_info)
{
H5FD_mpio_t *file = (H5FD_mpio_t *)_file;
herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI_NOINIT
if(!mpi_info)
HGOTO_ERROR(H5E_ARGS, H5E_BADVALUE, FAIL, "mpi info not valid")
*mpi_info = &(file->info);
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* H5FD_mpio_get_info() */
/*-------------------------------------------------------------------------
* Function: H5FD_mpio_read

View File

@@ -53,6 +53,7 @@ typedef struct H5FD_class_mpi_t {
int (*get_rank)(const H5FD_t *file); /* Get the MPI rank of a process */
int (*get_size)(const H5FD_t *file); /* Get the MPI size of a communicator */
MPI_Comm (*get_comm)(const H5FD_t *file); /* Get the communicator for a file */
herr_t (*get_mpi_info)(H5FD_t *file, void** mpi_info); /* get MPI_Info for a file */
} H5FD_class_mpi_t;
#endif
@@ -202,6 +203,7 @@ H5_DLL herr_t H5FD_get_mpio_atomicity(H5FD_t *file, hbool_t *flag);
H5_DLL int H5FD_mpi_get_rank(const H5FD_t *file);
H5_DLL int H5FD_mpi_get_size(const H5FD_t *file);
H5_DLL MPI_Comm H5FD_mpi_get_comm(const H5FD_t *_file);
H5_DLL herr_t H5FD_get_mpi_info(H5FD_t *file, void** file_info);
#endif /* H5_HAVE_PARALLEL */
#endif /* !_H5FDprivate_H */

View File

@@ -356,5 +356,31 @@ done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_mpi_retrieve_comm */
/*-------------------------------------------------------------------------
* Function: H5F_get_mpi_info
*
* Purpose: Retrieves MPI File info.
*
* Return: Success: The size (positive)
* Failure: Negative
*
*-------------------------------------------------------------------------
*/
herr_t
H5F_get_mpi_info(const H5F_t *f, MPI_Info **f_info)
{
herr_t ret_value = SUCCEED;
FUNC_ENTER_NOAPI(FAIL)
HDassert(f && f->shared);
/* Dispatch to driver */
if ((ret_value = H5FD_get_mpi_info(f->shared->lf, (void **)f_info)) < 0)
HGOTO_ERROR(H5E_FILE, H5E_CANTGET, FAIL, "can't get mpi file info")
done:
FUNC_LEAVE_NOAPI(ret_value)
} /* end H5F_get_mpi_info() */
#endif /* H5_HAVE_PARALLEL */

View File

@@ -852,6 +852,7 @@ H5_DLL int H5F_mpi_get_rank(const H5F_t *f);
H5_DLL MPI_Comm H5F_mpi_get_comm(const H5F_t *f);
H5_DLL int H5F_mpi_get_size(const H5F_t *f);
H5_DLL herr_t H5F_mpi_retrieve_comm(hid_t loc_id, hid_t acspl_id, MPI_Comm *mpi_comm);
H5_DLL herr_t H5F_get_mpi_info(const H5F_t *f, MPI_Info **f_info);
#endif /* H5_HAVE_PARALLEL */
/* External file cache routines */