From c4a146efc40c66de7d06a387465c9f7ea9b2e280 Mon Sep 17 00:00:00 2001 From: jhendersonHDF Date: Thu, 19 Oct 2023 08:14:20 -0500 Subject: [PATCH] Fix issue with unmatched messages in ph5diff (#3719) --- release_docs/RELEASE.txt | 13 +++++++++++++ tools/lib/h5diff.c | 3 --- tools/src/h5diff/ph5diff_main.c | 8 ++++---- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/release_docs/RELEASE.txt b/release_docs/RELEASE.txt index ea774af456..83c20b07d3 100644 --- a/release_docs/RELEASE.txt +++ b/release_docs/RELEASE.txt @@ -791,6 +791,19 @@ Bug Fixes since HDF5-1.14.0 release Tools ----- + - Fixed an issue with unmatched MPI messages in ph5diff + + The "manager" MPI rank in ph5diff was unintentionally sending "program end" + messages to its workers twice, leading to an error from MPICH similar to the + following: + + Abort(810645519) on node 1 (rank 1 in comm 0): Fatal error in internal_Finalize: Other MPI error, error stack: + internal_Finalize(50)...........: MPI_Finalize failed + MPII_Finalize(394)..............: + MPIR_Comm_delete_internal(1224).: Communicator (handle=44000000) being freed has 1 unmatched message(s) + MPIR_Comm_release_always(1250)..: + MPIR_finalize_builtin_comms(154): + - Fixed an issue in h5repack for variable-length typed datasets When repacking datasets into a new file, h5repack tries to determine whether diff --git a/tools/lib/h5diff.c b/tools/lib/h5diff.c index 924f9f35de..15f2a1428b 100644 --- a/tools/lib/h5diff.c +++ b/tools/lib/h5diff.c @@ -1485,9 +1485,6 @@ diff_match(hid_t file1_id, const char *grp1, trav_info_t *info1, hid_t file2_id, } /* end else */ } /* end while */ - for (i = 1; (int)i < g_nTasks; i++) - MPI_Send(NULL, 0, MPI_BYTE, (int)i, MPI_TAG_END, MPI_COMM_WORLD); - /* Print any final data waiting in our queue */ print_incoming_data(); } /* end if */ diff --git a/tools/src/h5diff/ph5diff_main.c b/tools/src/h5diff/ph5diff_main.c index 0f43261078..f90bd484ac 100644 --- a/tools/src/h5diff/ph5diff_main.c +++ b/tools/src/h5diff/ph5diff_main.c @@ -127,7 +127,7 @@ ph5diff_worker(int nID) char filenames[2][MAX_FILENAME]; /* Retrieve filenames */ - MPI_Recv(filenames, MAX_FILENAME * 2, MPI_CHAR, 0, MPI_ANY_TAG, MPI_COMM_WORLD, &Status); + MPI_Recv(filenames, MAX_FILENAME * 2, MPI_CHAR, 0, MPI_TAG_PARALLEL, MPI_COMM_WORLD, &Status); /* disable error reporting */ H5E_BEGIN_TRY @@ -173,7 +173,7 @@ ph5diff_worker(int nID) /* When get token, send all of our output to the manager task and then return the token */ for (i = 0; i < outBuffOffset; i += PRINT_DATA_MAX_SIZE) - MPI_Send(outBuff + i, PRINT_DATA_MAX_SIZE, MPI_BYTE, 0, MPI_TAG_PRINT_DATA, + MPI_Send(outBuff + i, PRINT_DATA_MAX_SIZE, MPI_CHAR, 0, MPI_TAG_PRINT_DATA, MPI_COMM_WORLD); /* An overflow file exists, so we send it's output to the manager too and then delete it */ @@ -188,7 +188,7 @@ ph5diff_worker(int nID) while ((tmp = getc(overflow_file)) >= 0) { *(out_data + i++) = (char)tmp; if (i == PRINT_DATA_MAX_SIZE) { - MPI_Send(out_data, PRINT_DATA_MAX_SIZE, MPI_BYTE, 0, MPI_TAG_PRINT_DATA, + MPI_Send(out_data, PRINT_DATA_MAX_SIZE, MPI_CHAR, 0, MPI_TAG_PRINT_DATA, MPI_COMM_WORLD); i = 0; memset(out_data, 0, PRINT_DATA_MAX_SIZE); @@ -196,7 +196,7 @@ ph5diff_worker(int nID) } if (i > 0) - MPI_Send(out_data, PRINT_DATA_MAX_SIZE, MPI_BYTE, 0, MPI_TAG_PRINT_DATA, + MPI_Send(out_data, PRINT_DATA_MAX_SIZE, MPI_CHAR, 0, MPI_TAG_PRINT_DATA, MPI_COMM_WORLD); fclose(overflow_file);