Mirror of https://github.com/HDFGroup/hdf5.git, synced 2025-03-31 17:10:47 +08:00
Fix Subfiling VFD IOC assignment bug (#3456)
parent 896270cc6b
commit 0b24f40b0f
@@ -292,6 +292,19 @@ Bug Fixes since HDF5-1.14.0 release
 ===================================
 Library
 -------
+    - Fixed a bug with the way the Subfiling VFD assigns I/O concentrators

+      During a file open operation, the Subfiling VFD determines the topology
+      of the application and uses that to select a subset of MPI ranks that
+      I/O will be forwarded to, called I/O concentrators. The code for this
+      had previously assumed that the parallel job launcher application (e.g.,
+      mpirun, srun, etc.) would distribute MPI ranks sequentially across a node
+      until all processors on that node had been assigned before going on to
+      the next node. When the launcher application mapped MPI ranks to nodes
+      in a different fashion, such as round-robin, this could cause the Subfiling
+      VFD to incorrectly map MPI ranks as I/O concentrators, leading to missing
+      subfiles.

     - Fixed performance regression with some compound type conversions

       In-place type conversion was introduced for most use cases in 1.14.2.
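To make the placement issue in the release note above concrete, here is a small standalone C sketch (not part of this patch; the rank and node counts are made up) contrasting the block-style placement the old code assumed with a round-robin placement:

#include <stdio.h>

int
main(void)
{
    int world_size     = 8;
    int num_nodes      = 2;
    int ranks_per_node = world_size / num_nodes;

    for (int rank = 0; rank < world_size; rank++) {
        int block_node = rank / ranks_per_node; /* block placement: ranks 0-3 on node 0, 4-7 on node 1   */
        int rr_node    = rank % num_nodes;      /* round-robin placement: even ranks on node 0, odd on 1 */

        printf("rank %d: block -> node %d, round-robin -> node %d\n", rank, block_node, rr_node);
    }

    return 0;
}

Under round-robin placement, rank 4 sits on node 0 rather than being the first rank of node 1, so logic that assumes block placement ends up selecting I/O concentrator ranks that do not match the actual node layout.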
@@ -40,11 +40,6 @@
 #include "H5subfiling_common.h"
 #include "H5subfiling_err.h"

-/*
- * Some definitions for debugging the Subfiling VFD
- */
-/* #define H5FD_SUBFILING_DEBUG */
-
 #define DRIVER_INFO_MESSAGE_MAX_INFO   65536
 #define DRIVER_INFO_MESSAGE_MAX_LENGTH 65552 /* MAX_INFO + sizeof(info_header_t) */
@@ -55,8 +55,8 @@ static herr_t H5_free_subfiling_topology(sf_topology_t *topology);
 static herr_t init_subfiling(const char *base_filename, uint64_t file_id,
                              H5FD_subfiling_params_t *subfiling_config, int file_acc_flags, MPI_Comm comm,
                              int64_t *context_id_out);
-static herr_t init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
-                                sf_topology_t **app_topology_out);
+static herr_t init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config,
+                                MPI_Comm comm, MPI_Comm node_comm, sf_topology_t **app_topology_out);
 static herr_t get_ioc_selection_criteria_from_env(H5FD_subfiling_ioc_select_t *ioc_selection_type,
                                                   char **ioc_sel_info_str);
 static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *subf_config,
@@ -64,7 +64,7 @@ static herr_t find_cached_topology_info(MPI_Comm comm, H5FD_subfiling_params_t *
 static herr_t init_app_layout(sf_topology_t *app_topology, MPI_Comm comm, MPI_Comm node_comm);
 static herr_t gather_topology_info(app_layout_t *app_layout, MPI_Comm comm, MPI_Comm intra_comm);
 static int    compare_layout_nodelocal(const void *layout1, const void *layout2);
-static herr_t identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride);
+static herr_t identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride);
 static herr_t init_subfiling_context(subfiling_context_t *sf_context, const char *base_filename,
                                      uint64_t file_id, H5FD_subfiling_params_t *subfiling_config,
                                      sf_topology_t *app_topology, MPI_Comm file_comm);
@@ -886,7 +886,7 @@ init_subfiling(const char *base_filename, uint64_t file_id, H5FD_subfiling_param
      * Setup the application topology information, including the computed
      * number and distribution map of the set of I/O concentrators
      */
-    if (init_app_topology(subfiling_config, comm, node_comm, &app_topology) < 0)
+    if (init_app_topology(context_id, subfiling_config, comm, node_comm, &app_topology) < 0)
         H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL, "couldn't initialize application topology");

     new_context->sf_context_id = context_id;
@@ -938,8 +938,8 @@ done:
  *-------------------------------------------------------------------------
  */
 static herr_t
-init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_Comm node_comm,
-                  sf_topology_t **app_topology_out)
+init_app_topology(int64_t sf_context_id, H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm,
+                  MPI_Comm node_comm, sf_topology_t **app_topology_out)
 {
     H5FD_subfiling_ioc_select_t ioc_selection_type;
     sf_topology_t              *app_topology = NULL;
@@ -1142,7 +1142,7 @@ init_app_topology(H5FD_subfiling_params_t *subfiling_config, MPI_Comm comm, MPI_
          * Determine which ranks are I/O concentrator ranks, based on the
          * given IOC selection strategy and MPI information.
          */
-        if (identify_ioc_ranks(app_topology, rank_multiple) < 0)
+        if (identify_ioc_ranks(sf_context_id, app_topology, rank_multiple) < 0)
             H5_SUBFILING_GOTO_ERROR(H5E_VFL, H5E_CANTINIT, FAIL,
                                     "couldn't determine which MPI ranks are I/O concentrators");
     }
@@ -1600,7 +1600,7 @@ compare_layout_nodelocal(const void *layout1, const void *layout2)
  *-------------------------------------------------------------------------
  */
 static herr_t
-identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
+identify_ioc_ranks(int64_t sf_context_id, sf_topology_t *app_topology, int rank_stride)
 {
     app_layout_t *app_layout       = NULL;
     int          *io_concentrators = NULL;
@@ -1613,6 +1613,11 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
     assert(app_topology->app_layout);
     assert(app_topology->app_layout->layout);
     assert(app_topology->app_layout->node_count > 0);
     assert(app_topology->app_layout->node_count <= app_topology->app_layout->world_size);

+#ifndef H5_SUBFILING_DEBUG
+    (void)sf_context_id;
+#endif

     app_layout = app_topology->app_layout;
@@ -1629,36 +1634,52 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
         case SELECT_IOC_ONE_PER_NODE: {
             int total_ioc_count = 0;
             int iocs_per_node   = 1;
+            int last_lead_rank;

             if (app_topology->n_io_concentrators > app_layout->node_count)
                 iocs_per_node = app_topology->n_io_concentrators / app_layout->node_count;

-            assert(app_layout->node_ranks);
+            /*
+             * NOTE: The below code assumes that the app_layout->layout
+             * array was sorted according to the node_lead_rank field,
+             * such that entries for MPI ranks on the same node will occur
+             * together in the array.
+             */

-            for (size_t i = 0; i < (size_t)app_layout->node_count; i++) {
-                int node_index = app_layout->node_ranks[i];
-                int local_size = app_layout->layout[node_index].node_local_size;
+            last_lead_rank = app_layout->layout[0].node_lead_rank;
+            for (size_t i = 0, layout_idx = 0; i < (size_t)app_layout->node_count; i++) {
+                int local_size = app_layout->layout[layout_idx].node_local_size;

                 /* Assign first I/O concentrator from this node */
                 assert(total_ioc_count < app_topology->n_io_concentrators);
-                io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
+                io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank;

                 if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
                     assert(!app_topology->rank_is_ioc);

                     app_topology->ioc_idx     = total_ioc_count;
                     app_topology->rank_is_ioc = TRUE;
                 }

                 total_ioc_count++;

                 /* Assign any additional I/O concentrators from this node */
                 for (size_t j = 1; j < (size_t)iocs_per_node; j++) {
                     if (total_ioc_count >= max_iocs)
                         break;
                     if (j >= (size_t)local_size)
                         break;

+                    /* Sanity check to make sure this rank is on the same node */
+                    assert(app_layout->layout[layout_idx].node_lead_rank ==
+                           app_layout->layout[layout_idx - 1].node_lead_rank);
+
                     assert(total_ioc_count < app_topology->n_io_concentrators);
-                    io_concentrators[total_ioc_count] = app_layout->layout[node_index++].rank;
+                    io_concentrators[total_ioc_count] = app_layout->layout[layout_idx++].rank;

                     if (app_layout->world_rank == io_concentrators[total_ioc_count]) {
                         assert(!app_topology->rank_is_ioc);

                         app_topology->ioc_idx     = total_ioc_count;
                         app_topology->rank_is_ioc = TRUE;
                     }
@@ -1668,8 +1689,25 @@ identify_ioc_ranks(sf_topology_t *app_topology, int rank_stride)
                 if (total_ioc_count >= max_iocs)
                     break;

+                /* Find the block of layout structures for the next node */
+                while ((layout_idx < (size_t)app_layout->world_size) &&
+                       (last_lead_rank == app_layout->layout[layout_idx].node_lead_rank))
+                    layout_idx++;
+
+                if (layout_idx >= (size_t)app_layout->world_size)
+                    break;
+
+                last_lead_rank = app_layout->layout[layout_idx].node_lead_rank;
             }

+#ifdef H5_SUBFILING_DEBUG
+            if (app_topology->n_io_concentrators != total_ioc_count)
+                H5_subfiling_log(sf_context_id,
+                                 "%s: **WARN** Number of I/O concentrators adjusted from %d to %d", __func__,
+                                 app_topology->n_io_concentrators, total_ioc_count);
+#endif
+
             /* Set final number of I/O concentrators after adjustments */
             app_topology->n_io_concentrators = total_ioc_count;
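The new loop above no longer indexes into app_layout->node_ranks; it walks the layout array, which (per the NOTE comment in the patch) is sorted by the node_lead_rank field so that entries for ranks on the same node are contiguous, and advances layout_idx past the current node's block. A minimal standalone sketch of that scan pattern follows; the entry type and sample data are illustrative stand-ins, not the HDF5 structures:

#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-in for one sorted layout entry (not the HDF5 struct) */
typedef struct {
    int rank;           /* world rank of this entry               */
    int node_lead_rank; /* lowest world rank on this entry's node */
} entry_t;

int
main(void)
{
    /* Entries sorted so that ranks from the same node are contiguous */
    entry_t layout[]       = {{0, 0}, {2, 0}, {4, 0}, {1, 1}, {3, 1}, {5, 1}};
    size_t  world_size     = sizeof(layout) / sizeof(layout[0]);
    size_t  layout_idx     = 0;
    int     last_lead_rank = layout[0].node_lead_rank;

    while (layout_idx < world_size) {
        /* First entry of each node block is that node's candidate I/O concentrator */
        printf("node led by rank %d: first rank %d\n", last_lead_rank, layout[layout_idx].rank);

        /* Skip the remaining entries belonging to the current node */
        while (layout_idx < world_size && layout[layout_idx].node_lead_rank == last_lead_rank)
            layout_idx++;

        if (layout_idx < world_size)
            last_lead_rank = layout[layout_idx].node_lead_rank;
    }

    return 0;
}

Because the scan keys on node_lead_rank rather than assuming a fixed stride of ranks per node, it works for block, round-robin, or uneven rank placements alike.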
@@ -155,6 +155,25 @@ typedef enum io_ops {
  *     will be broadcast to all MPI ranks and will
  *     provide a basis for determining which MPI ranks
  *     will host an I/O concentrator.
+ *
+ * - rank
+ *     The MPI rank value for this processor
+ *
+ * - node_local_rank
+ *     The MPI rank value for this processor in an MPI
+ *     communicator that only involves MPI ranks on the
+ *     same node as this processor
+ *
+ * - node_local_size
+ *     The number of MPI ranks on the same node as this
+ *     processor, including this processor itself
+ *
+ * - node_lead_rank
+ *     The lowest MPI rank value for processors on the
+ *     same node as this processor (possibly the MPI
+ *     rank value for this processor); Denotes a "lead"
+ *     MPI rank for certain operations.
+ *
  */
 typedef struct {
     int rank;
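The node_local_rank, node_local_size, and node_lead_rank fields documented above describe a rank's placement within its node. As a rough illustration of how such values can be derived in MPI (a standalone sketch under that assumption, not the Subfiling VFD's actual code), one can split MPI_COMM_WORLD by shared-memory node and reduce with MPI_MIN to find the lead rank:

#include <mpi.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
    MPI_Comm node_comm;
    int      world_rank, node_local_rank, node_local_size, node_lead_rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    /* Group together the ranks that share a node (shared-memory domain) */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, world_rank, MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &node_local_rank);
    MPI_Comm_size(node_comm, &node_local_size);

    /* Lowest world rank on this node acts as the node's "lead" rank */
    MPI_Allreduce(&world_rank, &node_lead_rank, 1, MPI_INT, MPI_MIN, node_comm);

    printf("world rank %d: node-local rank %d of %d, node lead rank %d\n", world_rank, node_local_rank,
           node_local_size, node_lead_rank);

    MPI_Comm_free(&node_comm);
    MPI_Finalize();
    return 0;
}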