hdf5/perform/pio_engine.c

/*
* Copyright (C) 2001, 2002
* National Center for Supercomputing Applications
* All rights reserved.
*
* Author: Albert Cheng of NCSA, Oct 24, 2001.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include "hdf5.h"
#ifdef H5_HAVE_PARALLEL
#include <mpi.h>
#ifndef MPI_FILE_NULL /*MPIO may be defined in mpi.h already */
# include <mpio.h>
#endif /* !MPI_FILE_NULL */
#ifdef H5_HAVE_GPFS
# include <gpfs_fcntl.h>
#endif /* H5_HAVE_GPFS */
#include "pio_perf.h"
#include "pio_timer.h"
/* Macro definitions */
/* sizes of various items. these sizes won't change during program execution */
/* The following three must have the same type */
#define ELMT_SIZE (sizeof(int)) /* we're doing ints */
#define ELMT_MPI_TYPE MPI_INT
#define ELMT_H5_TYPE H5T_NATIVE_INT
#define GOTOERROR(errcode) { ret_code = errcode; goto done; }
#define GOTODONE { goto done; }
#define ERRMSG(mesg) { \
fprintf(stderr, "Proc %d: ", pio_mpi_rank_g); \
fprintf(stderr, "*** Assertion failed (%s) at line %4d in %s\n", \
mesg, (int)__LINE__, __FILE__); \
}
#define MSG(mesg) { \
fprintf(stderr, "Proc %d: ", pio_mpi_rank_g); \
fprintf(stderr, "(%s) at line %4d in %s\n", \
mesg, (int)__LINE__, __FILE__); \
}
/* verify: if val is false (0), print mesg. */
#define VRFY(val, mesg) do { \
if (!(val)) { \
ERRMSG(mesg); \
GOTOERROR(FAIL); \
} \
} while(0)
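/* Example (hypothetical call): VRFY((fd >= 0), "open failed") prints
* the failure message and jumps to the enclosing function's "done:"
* label with ret_code set to FAIL whenever fd is negative. */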
/* POSIX I/O macros */
#define POSIXCREATE(fn) HDopen(fn, O_CREAT|O_TRUNC|O_RDWR, 0600)
#define POSIXOPEN(fn, F) HDopen(fn, F, 0600)
#define POSIXCLOSE(F) HDclose(F)
#define POSIXSEEK(F,L) HDlseek(F, L, SEEK_SET)
#define POSIXWRITE(F,B,S) HDwrite(F,B,S)
#define POSIXREAD(F,B,S) HDread(F,B,S)
enum {
PIO_CREATE = 1,
PIO_WRITE = 2,
PIO_READ = 4
};
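/* These flags may be OR'd together, e.g. (PIO_CREATE | PIO_WRITE)
* when a file is created for the write phase. */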
/* Global variables */
static int clean_file_g = -1; /*whether to cleanup temporary test */
/*files. -1 is not defined; */
/*0 is no cleanup; 1 is do cleanup */
/*
* On a parallel machine, the file system suitable for compiling is
* unlikely to be a parallel file system that is also suitable for
* parallel I/O. There is no standard pathname for a parallel file
* system; /tmp is about the best guess.
*/
#ifndef HDF5_PARAPREFIX
# ifdef __PUMAGON__
/* For the PFS of TFLOPS */
# define HDF5_PARAPREFIX "pfs:/pfs_grande/multi/tmp_1"
# else
# define HDF5_PARAPREFIX "/tmp"
# endif /* __PUMAGON__ */
#endif /* !HDF5_PARAPREFIX */
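/* The prefix may also be overridden at run time through the
* HDF5_PARAPREFIX environment variable, e.g. (hypothetical path)
* HDF5_PARAPREFIX=/scratch/tmp; see pio_create_filename() below. */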
#ifndef MIN
# define MIN(a,b) ((a) < (b) ? (a) : (b))
#endif /* !MIN */
/* the different types of file descriptors we can expect */
typedef union _file_descr {
int posixfd; /* POSIX file handle*/
MPI_File mpifd; /* MPI file */
hid_t h5fd; /* HDF5 file */
} file_descr;
/* local functions */
static char *pio_create_filename(iotype iot, const char *base_name,
char *fullname, size_t size);
static herr_t do_write(results *res, file_descr *fd, parameters *parms,
long ndsets, off_t nelmts, size_t blk_size, size_t buf_size, void *buffer);
static herr_t do_read(results *res, file_descr *fd, parameters *parms,
long ndsets, off_t nelmts, size_t blk_size, size_t buf_size, void *buffer /*out*/);
static herr_t do_fopen(parameters *param, char *fname, file_descr *fd /*out*/,
int flags);
static herr_t do_fclose(iotype iot, file_descr *fd);
static void do_cleanupfile(iotype iot, char *fname);
/* GPFS-specific functions */
static void access_range(int handle, off_t start, off_t length, int is_write);
static void free_range(int handle, off_t start, off_t length);
static void clear_file_cache(int handle);
static void cancel_hints(int handle);
static void start_data_shipping(int handle, int num_insts);
static void stop_data_shipping(int handle);
static void invalidate_file_cache(const char *filename);
/*
* Function: do_pio
* Purpose: PIO engine where the parallel I/O is executed.
* Return: results
* Programmer: Albert Cheng, Bill Wendling 2001/12/12
* Modifications:
*/
results
do_pio(parameters param)
{
/* return codes */
herr_t ret_code = 0; /*return code */
results res;
file_descr fd;
iotype iot;
char fname[FILENAME_MAX];
int maxprocs;
long nfiles, nf;
long ndsets;
off_t nelmts;
char *buffer = NULL; /*data buffer pointer */
size_t buf_size; /*data buffer size in bytes */
size_t blk_size; /*interleaved I/O block size */
/* HDF5 variables */
herr_t hrc; /*HDF5 return code */
/* Sanity check parameters */
/* IO type */
iot = param.io_type;
switch (iot) {
case MPIO:
fd.mpifd = MPI_FILE_NULL;
res.timers = pio_time_new(MPI_TIMER);
break;
case POSIXIO:
fd.posixfd = -1;
res.timers = pio_time_new(MPI_TIMER);
break;
case PHDF5:
fd.h5fd = -1;
res.timers = pio_time_new(MPI_TIMER);
break;
default:
/* unknown request */
fprintf(stderr, "Unknown IO type request (%d)\n", iot);
GOTOERROR(FAIL);
}
nfiles = param.num_files; /* number of files */
ndsets = param.num_dsets; /* number of datasets per file */
nelmts = param.num_elmts; /* number of elements per dataset */
maxprocs = param.num_procs; /* max number of mpi-processes to use */
buf_size = param.buf_size;
blk_size = param.block_size; /* interleaved IO block size */
if (nfiles < 0 ) {
fprintf(stderr,
"number of files must be >= 0 (%ld)\n",
nfiles);
GOTOERROR(FAIL);
}
if (ndsets < 0 ) {
fprintf(stderr,
"number of datasets per file must be >= 0 (%ld)\n",
ndsets);
GOTOERROR(FAIL);
}
if (maxprocs <= 0 ) {
fprintf(stderr,
"maximum number of process to use must be > 0 (%d)\n",
maxprocs);
GOTOERROR(FAIL);
}
/* allocate transfer buffer */
if(buf_size<=0) {
HDfprintf(stderr,
"Transfer buffer size (%Hd) must be > 0\n", (long_long)buf_size);
GOTOERROR(FAIL);
}else{
buffer = malloc(buf_size);
if (buffer == NULL){
HDfprintf(stderr, "malloc for transfer buffer size (%Hd) failed\n",
(long_long)buf_size);
GOTOERROR(FAIL);
}
}
/* Should only need blk_size <= buf_size. */
/* More restrictive condition for easier implementation for now. */
if (blk_size > 0 && (buf_size % blk_size)){
HDfprintf(stderr,
"Transfer buffer size (%Hd) must be a multiple of the "
"interleaved I/O block size (%Hd)\n",
(long_long)buf_size, (long_long)blk_size);
GOTOERROR(FAIL);
}
if (pio_debug_level >= 4) {
int myrank;
MPI_Comm_rank(pio_comm_g, &myrank);
/* output all of the times for all iterations */
if (myrank == 0)
fprintf(output, "Timer details:\n");
}
for (nf = 1; nf <= nfiles; nf++) {
/*
* Write performance measurement
*/
/* Open file for write */
char base_name[256];
MPI_Barrier(pio_comm_g);
sprintf(base_name, "#pio_tmp_%ld", nf);
pio_create_filename(iot, base_name, fname, sizeof(fname));
set_time(res.timers, HDF5_GROSS_WRITE_FIXED_DIMS, START);
hrc = do_fopen(&param, fname, &fd, PIO_CREATE | PIO_WRITE);
VRFY((hrc == SUCCESS), "do_fopen failed");
set_time(res.timers, HDF5_FINE_WRITE_FIXED_DIMS, START);
hrc = do_write(&res, &fd, &param, ndsets, nelmts, blk_size, buf_size, buffer);
set_time(res.timers, HDF5_FINE_WRITE_FIXED_DIMS, STOP);
VRFY((hrc == SUCCESS), "do_write failed");
/* Close file for write */
hrc = do_fclose(iot, &fd);
set_time(res.timers, HDF5_GROSS_WRITE_FIXED_DIMS, STOP);
VRFY((hrc == SUCCESS), "do_fclose failed");
if (!param.h5_write_only) {
/*
* Read performance measurement
*/
MPI_Barrier(pio_comm_g);
/* Open file for read */
set_time(res.timers, HDF5_GROSS_READ_FIXED_DIMS, START);
hrc = do_fopen(&param, fname, &fd, PIO_READ);
VRFY((hrc == SUCCESS), "do_fopen failed");
set_time(res.timers, HDF5_FINE_READ_FIXED_DIMS, START);
hrc = do_read(&res, &fd, &param, ndsets, nelmts, blk_size, buf_size, buffer);
set_time(res.timers, HDF5_FINE_READ_FIXED_DIMS, STOP);
VRFY((hrc == SUCCESS), "do_read failed");
/* Close file for read */
hrc = do_fclose(iot, &fd);
set_time(res.timers, HDF5_GROSS_READ_FIXED_DIMS, STOP);
VRFY((hrc == SUCCESS), "do_fclose failed");
}
MPI_Barrier(pio_comm_g);
do_cleanupfile(iot, fname);
}
done:
/* clean up */
/* release HDF5 objects */
/* close any opened files */
/* no remove(fname) because that should have happened normally. */
switch (iot) {
case POSIXIO:
if (fd.posixfd != -1)
hrc = do_fclose(iot, &fd);
break;
case MPIO:
if (fd.mpifd != MPI_FILE_NULL)
hrc = do_fclose(iot, &fd);
break;
case PHDF5:
if (fd.h5fd != -1)
hrc = do_fclose(iot, &fd);
break;
}
/* release generic resources */
if(buffer)
free(buffer);
res.ret_code = ret_code;
return res;
}
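/* Minimal calling sketch (hypothetical field values; the real driver
* lives in the rest of the pio_perf tool):
*
* parameters p;
* memset(&p, 0, sizeof(p));
* p.io_type = MPIO; // POSIXIO, MPIO, or PHDF5
* p.num_files = 1;
* p.num_dsets = 1;
* p.num_elmts = 1024; // ints per dataset
* p.num_procs = pio_mpi_nprocs_g;
* p.buf_size = 256 * ELMT_SIZE;
* p.block_size = 0; // 0 selects the contiguous pattern
* results r = do_pio(p);
* if (r.ret_code != SUCCESS)
* ... report the failure ...
*/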
/*
* Function: pio_create_filename
* Purpose: Create a new filename to write to. Determine the correct
* suffix to append to the filename by the type of I/O we're
* doing. Also, place it in the /tmp/{$USER,$LOGIN} directory if
* USER or LOGIN is specified in the environment.
* Return: Pointer to filename or NULL
* Programmer: Bill Wendling, 21. November 2001
* Modifications:
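* Example: with HDF5_PARAPREFIX=/tmp and USER=joe (hypothetical
* values), a base_name of "x" becomes "/tmp/joe/x.mpio" for
* the MPIO case.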
*/
static char *
pio_create_filename(iotype iot, const char *base_name, char *fullname, size_t size)
{
const char *prefix, *suffix="";
char *ptr, last = '\0';
size_t i, j;
if (!base_name || !fullname || size < 1)
return NULL;
memset(fullname, 0, size);
switch (iot) {
case POSIXIO:
suffix = ".posix";
break;
case MPIO:
suffix = ".mpio";
break;
case PHDF5:
suffix = ".h5";
break;
}
/* First use the environment variable and then try the constant */
prefix = getenv("HDF5_PARAPREFIX");
#ifdef HDF5_PARAPREFIX
if (!prefix)
prefix = HDF5_PARAPREFIX;
#endif /* HDF5_PARAPREFIX */
/* Prepend the prefix value to the base name */
if (prefix && *prefix) {
/* If the prefix specifies the HDF5_PARAPREFIX directory, then
* default to using the "/tmp/$USER" or "/tmp/$LOGIN"
* directory instead. */
register char *user, *login, *subdir;
user = getenv("USER");
login = getenv("LOGIN");
subdir = (user ? user : login);
if (subdir) {
for (i = 0; i < size && prefix[i]; i++)
fullname[i] = prefix[i];
fullname[i++] = '/';
for (j = 0; i < size && subdir[j]; i++, j++)
fullname[i] = subdir[j];
} else {
/* We didn't append the prefix yet */
strncpy(fullname, prefix, MIN(strlen(prefix), size));
}
if ((strlen(fullname) + strlen(base_name) + 1) < size) {
/* Append the base_name with a slash first. Multiple slashes are
* handled below. */
h5_stat_t buf;
if (HDstat(fullname, &buf) < 0)
/* The directory doesn't exist just yet */
if (mkdir(fullname, (mode_t)0755) < 0 && errno != EEXIST) {
/* We couldn't make the "/tmp/${USER,LOGIN}" subdirectory.
* Default to PREFIX's original prefix value. */
strcpy(fullname, prefix);
}
strcat(fullname, "/");
strcat(fullname, base_name);
} else {
/* Buffer is too small */
return NULL;
}
} else if (strlen(base_name) >= size) {
/* Buffer is too small */
return NULL;
} else {
strcpy(fullname, base_name);
}
/* Append a suffix */
if (suffix) {
if (strlen(fullname) + strlen(suffix) >= size)
return NULL;
strcat(fullname, suffix);
}
/* Remove any double slashes in the filename */
for (ptr = fullname, i = j = 0; ptr && i < size; i++, ptr++) {
if (*ptr != '/' || last != '/')
fullname[j++] = *ptr;
last = *ptr;
}
return fullname;
}
/*
* Function: do_write
* Purpose: Write the required amount of data to the file.
* Return: SUCCESS or FAIL
* Programmer: Albert Cheng, Bill Wendling, 2001/12/13
* Modifications:
*/
static herr_t
do_write(results *res, file_descr *fd, parameters *parms, long ndsets,
off_t nelmts, size_t blk_size, size_t buf_size, void *buffer)
{
int ret_code = SUCCESS;
int rc; /*routine return code */
int mrc; /*MPI return code */
MPI_Offset mpi_offset;
MPI_Status mpi_status;
long ndset;
off_t nelmts_xfer;
size_t nelmts_toxfer;
char dname[64];
off_t dset_offset=0; /*dataset offset in a file */
off_t file_offset; /*file offset of the next transfer */
off_t dset_size; /*one dataset size in bytes */
size_t nelmts_in_buf; /*how many elements the buffer holds */
size_t nelmts_in_blk=0; /*how many elements a block holds */
off_t elmts_begin; /*first elmt this process transfers */
off_t elmts_count; /*number of elmts this process transfers */
hid_t dcpl = -1; /* Dataset creation property list */
/* HDF5 variables */
herr_t hrc; /*HDF5 return code */
hsize_t h5dims[1]; /*dataset dim sizes */
hid_t h5dset_space_id = -1; /*dataset space ID */
hid_t h5mem_space_id = -1; /*memory dataspace ID */
hid_t h5ds_id = -1; /*dataset handle */
hsize_t h5mem_block[1]; /*memory space selection */
hsize_t h5mem_stride[1];
hsize_t h5mem_count[1];
hssize_t h5mem_start[1];
#if 0
/* for future implementation */
hsize_t h5dset_block[1]; /*dset space selection */
hsize_t h5dset_stride[1];
hsize_t h5dset_count[1];
hssize_t h5dset_start[1];
#endif
/* calculate dataset parameters. data type is always native C int */
dset_size = nelmts * (off_t)ELMT_SIZE;
nelmts_in_buf = buf_size/ELMT_SIZE;
/* hdf5 data space setup */
if (parms->io_type == PHDF5){
if(nelmts>0) {
/* define a contiguous dataset of nelmts native ints */
h5dims[0] = nelmts;
h5dset_space_id = H5Screate_simple(1, h5dims, NULL);
VRFY((h5dset_space_id >= 0), "H5Screate_simple");
} /* end if */
else {
h5dset_space_id = H5Screate(H5S_SCALAR);
VRFY((h5dset_space_id >= 0), "H5Screate");
} /* end else */
/* create the memory dataspace that corresponds to the xfer buffer */
if(nelmts_in_buf>0) {
h5dims[0] = nelmts_in_buf;
h5mem_space_id = H5Screate_simple(1, h5dims, NULL);
VRFY((h5mem_space_id >= 0), "H5Screate_simple");
} /* end if */
else {
h5mem_space_id = H5Screate(H5S_SCALAR);
VRFY((h5mem_space_id >= 0), "H5Screate");
} /* end else */
}
for (ndset = 1; ndset <= ndsets; ++ndset) {
/* Calculate dataset offset within a file */
/* create dataset */
switch (parms->io_type) {
case POSIXIO:
case MPIO:
/* both posix and mpi io just need dataset offset in file*/
dset_offset = (ndset - 1) * dset_size;
break;
case PHDF5:
dcpl = H5Pcreate(H5P_DATASET_CREATE);
if (dcpl < 0) {
fprintf(stderr, "HDF5 Property List Create failed\n");
GOTOERROR(FAIL);
}
/* Make the dataset chunked if asked */
if(parms->h5_use_chunks) {
/* Set the chunk size to be the same as the buffer size */
h5dims[0] = nelmts_in_buf;
hrc = H5Pset_chunk(dcpl, 1, h5dims);
if (hrc < 0) {
fprintf(stderr, "HDF5 Property List Set failed\n");
GOTOERROR(FAIL);
} /* end if */
} /* end if */
#ifdef H5_HAVE_NOFILL
/* Disable writing fill values if asked */
if(parms->h5_no_fill) {
hrc = H5Pset_fill_time(dcpl, H5D_FILL_TIME_NEVER);
if (hrc < 0) {
fprintf(stderr, "HDF5 Property List Set failed\n");
GOTOERROR(FAIL);
} /* end if */
} /* end if */
#endif
sprintf(dname, "Dataset_%ld", ndset);
h5ds_id = H5Dcreate(fd->h5fd, dname, ELMT_H5_TYPE,
h5dset_space_id, dcpl);
if (h5ds_id < 0) {
fprintf(stderr, "HDF5 Dataset Create failed\n");
GOTOERROR(FAIL);
}
hrc = H5Pclose(dcpl);
/* verifying the close of the dcpl */
if (hrc < 0) {
fprintf(stderr, "HDF5 Property List Close failed\n");
GOTOERROR(FAIL);
}
break;
}
/* There are two kinds of transfer patterns, contiguous and interleaved.
* Let 0,1,2,...,n be data accessed by process 0,1,2,...,n
* where n is rank of the last process.
* In contiguous pattern, data are accessed as
* 000...111...222...nnn...
* In interleaved pattern, data are accessed as
* 012...n012...n...
* These are all in the scope of one dataset.
*/
/* Calculate the total number of elements (elmts_count) to be
* transferred by this process. It may differ between transfer
* patterns due to rounding to integral values.
*/
if (blk_size==0){
/* Contiguous Pattern:
* Calculate the beginning element of this process and the next.
* elmts_count is the difference between these two beginnings.
* This way, it eliminates any rounding errors.
*/
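/* Worked example (hypothetical values): nelmts=10, nprocs=3 gives
* beginnings 0, 3, and 6, hence counts 3, 3, and 4; the counts
* always sum to nelmts, with nothing lost to rounding. */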
elmts_begin = (off_t)(((double)nelmts)/pio_mpi_nprocs_g*pio_mpi_rank_g);
/* Do not cast elmts_begin to other types, especially non-integral
* types, else it may introduce rounding discrepancies. */
if (pio_mpi_rank_g < (pio_mpi_nprocs_g - 1))
elmts_count = (off_t)(((double)nelmts) / pio_mpi_nprocs_g * (pio_mpi_rank_g + 1))
- elmts_begin;
else
/* last process. Take whatever are left */
elmts_count = nelmts - elmts_begin;
}else{
/* Interleaved Pattern:
* Each process takes one block of nelmts_in_blk elements in turn,
* starting with the first process, so the last process may get
* fewer elements, or even none.
* elmts_begin here marks only the beginning of the first
* block accessed by this process.
*/
/* Algorithm:
* First allocate equal blocks per process, i.e. one block to each
* process for every nelmts_in_blk*nprocs elements.
* If elements remain unallocated, give one block to each process
* in turn, starting at proc 0. The last process to receive any may
* get only a partial block.
*/
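/* Worked example (hypothetical values): nelmts=10, nelmts_in_blk=3,
* nprocs=2. Equal allocation: 10/(3*2)=1 round, so each process
* first gets 3 elements. Remainder 10%6=4: proc 0 (remain_begin=0)
* gets one full block (+3) and proc 1 (remain_begin=3) gets a
* partial block (+1), for totals of 6 and 4, which sum to nelmts. */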
off_t remain_nelmts, remain_begin; /* unallocated remaining*/
nelmts_in_blk = blk_size/ELMT_SIZE;
elmts_begin = (off_t)(nelmts_in_blk*pio_mpi_rank_g);
/* must use integer calculation next */
/* allocate equal blocks per process */
elmts_count = (nelmts / (off_t)(nelmts_in_blk*pio_mpi_nprocs_g)) *
(off_t)nelmts_in_blk;
remain_nelmts = nelmts % ((off_t)(nelmts_in_blk*pio_mpi_nprocs_g));
/* allocate any remaining */
remain_begin = (off_t)(nelmts_in_blk*pio_mpi_rank_g);
if (remain_nelmts > remain_begin){
/* it gets something */
if (remain_nelmts > (remain_begin+(off_t)nelmts_in_blk)){
/* one full block */
elmts_count += nelmts_in_blk;
}else{
/* only a partial block */
elmts_count += remain_nelmts - remain_begin;
}
}
}
/* debug */
if (pio_debug_level >= 4) {
HDprint_rank(output);
HDfprintf(output, "Debug(do_write): "
"nelmts_in_blk=%Hd, elmts_begin=%Hd, elmts_count=%Hd\n",
(long_long)nelmts_in_blk, (long_long)elmts_begin,
(long_long)elmts_count);
}
/* The task is to transfer elmts_count elements, starting at
* elmts_begin position, using a transfer buffer of buf_size bytes.
* If blk_size > 0, transfer blk_size bytes at a time, in round-robin
* fashion according to the number of processes. Otherwise, transfer
* all elmts_count elements contiguously.
*/
nelmts_xfer = 0;
/* Start "raw data" write timer */
set_time(res->timers, HDF5_RAW_WRITE_FIXED_DIMS, START);
while (nelmts_xfer < elmts_count){
/* transfer one buffer of data each round */
/* Note: because size_t is unsigned, avoid expressions that */
/* can be negative. */
if ((nelmts_xfer + (off_t)nelmts_in_buf) <= elmts_count) {
nelmts_toxfer = nelmts_in_buf;
} else {
/* last transfer of a partial buffer */
nelmts_toxfer = elmts_count - nelmts_xfer;
}
if (parms->verify) {
/*Prepare write data for verify later*/
int *intptr = (int *)buffer;
size_t i;
for (i = 0; i < nelmts_toxfer; ++i)
*intptr++ = pio_mpi_rank_g;
}
/* Write */
/* Calculate offset of write within a dataset/file */
switch (parms->io_type) {
case POSIXIO:
if (blk_size==0){
/* Contiguous pattern */
/* need to cast the elmts_begin expression to off_t because the */
/* operands may be of smaller integer types */
file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE;
/* only care if seek returns error */
rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
VRFY((rc==0), "POSIXSEEK");
/* check if all bytes are transferred */
rc = ((ssize_t)(nelmts_toxfer*ELMT_SIZE) ==
POSIXWRITE(fd->posixfd, buffer, nelmts_toxfer*ELMT_SIZE));
VRFY((rc != 0), "POSIXWRITE");
}else{
/* interleaved access pattern */
char *buf_p=buffer;
size_t xferred=0;
size_t toxfer=0;
file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE;
if (pio_debug_level >= 4) {
HDprint_rank(output);
HDfprintf(output,
"Debug(do_write): "
"nelmts_toxfer=%Hd, nelmts_xfer=%Hd\n"
,
(long_long)nelmts_toxfer, (long_long)nelmts_xfer);
}
while (xferred < nelmts_toxfer){
if ((nelmts_toxfer - xferred) >= nelmts_in_blk)
toxfer = nelmts_in_blk;
else
toxfer = nelmts_toxfer - xferred;
/* Skip offset over blocks of other processes */
file_offset = dset_offset +
(off_t)(elmts_begin + (nelmts_xfer+xferred)*pio_mpi_nprocs_g)*(off_t)ELMT_SIZE;
if (pio_debug_level >= 4) {
HDprint_rank(output);
HDfprintf(output,
"Debug(do_write): "
"nelmts_toxfer=%Hd, nelmts_xfer=%Hd"
", toxfer=%Hd, xferred=%Hd"
", file_offset=%Hd"
"\n",
(long_long)nelmts_toxfer, (long_long)nelmts_xfer,
(long_long)toxfer, (long_long)xferred,
(long_long)file_offset);
}
/* only care if seek returns error */
rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
VRFY((rc==0), "POSIXSEEK");
/* check if all bytes are written */
rc = ((ssize_t)(toxfer*ELMT_SIZE) ==
POSIXWRITE(fd->posixfd, buf_p, toxfer*ELMT_SIZE));
VRFY((rc != 0), "POSIXWRITE");
xferred += toxfer;
}
}
break;
case MPIO:
mpi_offset = dset_offset + (elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE;
mrc = MPI_File_write_at(fd->mpifd, mpi_offset, buffer,
(int)(nelmts_toxfer), ELMT_MPI_TYPE,
&mpi_status);
VRFY((mrc==MPI_SUCCESS), "MPIO_WRITE");
break;
case PHDF5:
/*set up the dset space id to select the segment to process */
{
h5mem_start[0] = elmts_begin + nelmts_xfer;
h5mem_stride[0] = h5mem_block[0] = nelmts_toxfer;
h5mem_count[0] = 1;
hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET,
h5mem_start, h5mem_stride, h5mem_count, h5mem_block);
VRFY((hrc >= 0), "H5Sset_hyperslab");
/*setup the memory space id too. Only start is different */
h5mem_start[0] = 0;
hrc = H5Sselect_hyperslab(h5mem_space_id, H5S_SELECT_SET,
h5mem_start, h5mem_stride, h5mem_count, h5mem_block);
VRFY((hrc >= 0), "H5Sset_hyperslab");
}
/* set write time here */
hrc = H5Dwrite(h5ds_id, ELMT_H5_TYPE, h5mem_space_id,
h5dset_space_id, H5P_DEFAULT, buffer);
VRFY((hrc >= 0), "H5Dwrite");
break;
}
nelmts_xfer += nelmts_toxfer;
}
/* Stop "raw data" write timer */
set_time(res->timers, HDF5_RAW_WRITE_FIXED_DIMS, STOP);
/* Calculate write time */
/* Close dataset. Only HDF5 needs to do an explicit close. */
if (parms->io_type == PHDF5){
hrc = H5Dclose(h5ds_id);
if (hrc < 0) {
fprintf(stderr, "HDF5 Dataset Close failed\n");
GOTOERROR(FAIL);
}
h5ds_id = -1;
}
}
done:
/* release HDF5 objects */
if (h5dset_space_id != -1) {
hrc = H5Sclose(h5dset_space_id);
if (hrc < 0){
fprintf(stderr, "HDF5 Dataset Space Close failed\n");
ret_code = FAIL;
} else {
h5dset_space_id = -1;
}
}
if (h5mem_space_id != -1) {
hrc = H5Sclose(h5mem_space_id);
if (hrc < 0) {
fprintf(stderr, "HDF5 Memory Space Close failed\n");
ret_code = FAIL;
} else {
h5mem_space_id = -1;
}
}
return ret_code;
}
/*
* Function: do_read
* Purpose: read the required amount of data from the file.
* Return: SUCCESS or FAIL
* Programmer: Albert Cheng 2001/12/13
* Modifications:
*/
static herr_t
do_read(results *res, file_descr *fd, parameters *parms, long ndsets,
off_t nelmts, size_t blk_size, size_t buf_size, void *buffer /*out*/)
{
int ret_code = SUCCESS;
int rc; /*routine return code */
int mrc; /*MPI return code */
MPI_Offset mpi_offset;
MPI_Status mpi_status;
long ndset;
off_t nelmts_xfer;
size_t nelmts_toxfer;
char dname[64];
off_t dset_offset=0; /*dataset offset in a file */
off_t file_offset; /*file offset of the next transfer */
off_t dset_size; /*one dataset size in bytes */
size_t nelmts_in_buf; /*how many elements the buffer holds */
size_t nelmts_in_blk=0; /*how many elements a block holds */
off_t elmts_begin; /*first elmt this process transfers */
off_t elmts_count; /*number of elmts this process transfers */
/* HDF5 variables */
herr_t hrc; /*HDF5 return code */
hsize_t h5dims[1]; /*dataset dim sizes */
hid_t h5dset_space_id = -1; /*dataset space ID */
hid_t h5mem_space_id = -1; /*memory dataspace ID */
hid_t h5ds_id = -1; /*dataset handle */
hsize_t h5mem_block[1]; /*memory space selection */
hsize_t h5mem_stride[1];
hsize_t h5mem_count[1];
hssize_t h5mem_start[1];
#if 0
/* for future implementation */
hsize_t h5dset_block[1]; /*dset space selection */
hsize_t h5dset_stride[1];
hsize_t h5dset_count[1];
hssize_t h5dset_start[1];
#endif
/* calculate dataset parameters. data type is always native C int */
dset_size = nelmts * (off_t)ELMT_SIZE;
nelmts_in_buf = buf_size/ELMT_SIZE;
/* hdf5 data space setup */
if (parms->io_type == PHDF5){
if(nelmts>0) {
/* define a contiguous dataset of nelmts native ints */
h5dims[0] = nelmts;
h5dset_space_id = H5Screate_simple(1, h5dims, NULL);
VRFY((h5dset_space_id >= 0), "H5Screate_simple");
} /* end if */
else {
h5dset_space_id = H5Screate(H5S_SCALAR);
VRFY((h5dset_space_id >= 0), "H5Screate");
} /* end else */
/* create the memory dataspace that corresponds to the xfer buffer */
if(nelmts_in_buf>0) {
h5dims[0] = nelmts_in_buf;
h5mem_space_id = H5Screate_simple(1, h5dims, NULL);
VRFY((h5mem_space_id >= 0), "H5Screate_simple");
} /* end if */
else {
h5mem_space_id = H5Screate(H5S_SCALAR);
VRFY((h5mem_space_id >= 0), "H5Screate");
} /* end else */
}
for (ndset = 1; ndset <= ndsets; ++ndset) {
/* Calculate dataset offset within a file */
/* create dataset */
switch (parms->io_type) {
case POSIXIO:
case MPIO:
/* both posix and mpi io just need dataset offset in file*/
dset_offset = (ndset - 1) * dset_size;
break;
case PHDF5:
sprintf(dname, "Dataset_%ld", ndset);
h5ds_id = H5Dopen(fd->h5fd, dname);
if (h5ds_id < 0) {
fprintf(stderr, "HDF5 Dataset open failed\n");
GOTOERROR(FAIL);
}
break;
}
/* There are two kinds of transfer patterns, contiguous and interleaved.
* Let 0,1,2,...,n be data accessed by process 0,1,2,...,n
* where n is rank of the last process.
* In contiguous pattern, data are accessed as
* 000...111...222...nnn...
* In interleaved pattern, data are accessed as
* 012...n012...n...
* These are all in the scope of one dataset.
*/
/* Calculate the total number of elements (elmts_count) to be
* transferred by this process. It may differ between transfer
* patterns due to rounding to integral values.
*/
if (blk_size==0){
/* Contiguous Pattern:
* Calculate the beginning element of this process and the next.
* elmts_count is the difference between these two beginnings.
* This way, it eliminates any rounding errors.
*/
elmts_begin = (off_t)(((double)nelmts)/pio_mpi_nprocs_g*pio_mpi_rank_g);
/* Do not cast elmts_begin to other types, especially non-integral
* types, else it may introduce rounding discrepancies. */
if (pio_mpi_rank_g < (pio_mpi_nprocs_g - 1))
elmts_count = (off_t)(((double)nelmts) / pio_mpi_nprocs_g * (pio_mpi_rank_g + 1))
- elmts_begin;
else
/* last process. Take whatever are left */
elmts_count = nelmts - elmts_begin;
}else{
/* Interleaved Pattern:
* Each process takes one block of nelmts_in_blk elements in turn,
* starting with the first process, so the last process may get
* fewer elements, or even none.
* elmts_begin here marks only the beginning of the first
* block accessed by this process.
*/
/* Algorithm:
* First allocate equal blocks per process, i.e. one block to each
* process for every nelmts_in_blk*nprocs elements.
* If elements remain unallocated, give one block to each process
* in turn, starting at proc 0. The last process to receive any may
* get only a partial block.
*/
off_t remain_nelmts, remain_begin; /* unallocated remaining*/
nelmts_in_blk = blk_size/ELMT_SIZE;
elmts_begin = (off_t)(nelmts_in_blk*pio_mpi_rank_g);
/* must use integer calculation next */
/* allocate equal blocks per process */
elmts_count = (nelmts / (off_t)(nelmts_in_blk*pio_mpi_nprocs_g)) *
(off_t)nelmts_in_blk;
remain_nelmts = nelmts % ((off_t)(nelmts_in_blk*pio_mpi_nprocs_g));
/* allocate any remaining */
remain_begin = (off_t)(nelmts_in_blk*pio_mpi_rank_g);
if (remain_nelmts > remain_begin){
/* it gets something */
if (remain_nelmts > (remain_begin+(off_t)nelmts_in_blk)){
/* one full block */
elmts_count += nelmts_in_blk;
}else{
/* only a partial block */
elmts_count += remain_nelmts - remain_begin;
}
}
}
/* debug */
if (pio_debug_level >= 4) {
HDprint_rank(output);
HDfprintf(output, "Debug(do_read): "
"nelmts_in_blk=%Hd, elmts_begin=%Hd, elmts_count=%Hd\n",
(long_long)nelmts_in_blk, (long_long)elmts_begin,
(long_long)elmts_count);
}
/* The task is to transfer elmts_count elements, starting at
* elmts_begin position, using a transfer buffer of buf_size bytes.
* If blk_size > 0, transfer blk_size bytes at a time, in round-robin
* fashion according to the number of processes. Otherwise, transfer
* all elmts_count elements contiguously.
*/
nelmts_xfer = 0;
/* Start "raw data" read timer */
set_time(res->timers, HDF5_RAW_READ_FIXED_DIMS, START);
while (nelmts_xfer < elmts_count){
/* transfer one buffer of data each round */
/* Note: because size_t is unsigned, avoid expressions that */
/* can be negative. */
if ((nelmts_xfer + (off_t)nelmts_in_buf) <= elmts_count) {
nelmts_toxfer = nelmts_in_buf;
} else {
/* last transfer of a partial buffer */
nelmts_toxfer = elmts_count - nelmts_xfer;
}
/* read */
/* Calculate offset of read within a dataset/file */
switch (parms->io_type){
case POSIXIO:
if (blk_size==0){
/* Contiguous pattern */
/* need to cast the elmts_begin expression to off_t because the */
/* operands may be of smaller integer types */
file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE;
/* only care if seek returns error */
rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
VRFY((rc==0), "POSIXSEEK");
/* check if all bytes are transferred */
rc = ((ssize_t)(nelmts_toxfer*ELMT_SIZE) ==
POSIXREAD(fd->posixfd, buffer, nelmts_toxfer*ELMT_SIZE));
VRFY((rc != 0), "POSIXREAD");
}else{
/* interleaved access pattern */
char *buf_p=buffer;
size_t xferred=0;
size_t toxfer=0;
file_offset = dset_offset + (off_t)(elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE;
if (pio_debug_level >= 4) {
HDprint_rank(output);
HDfprintf(output,
"Debug(do_read): "
"nelmts_toxfer=%Hd, nelmts_xfer=%Hd\n"
,
(long_long)nelmts_toxfer, (long_long)nelmts_xfer);
}
while (xferred < nelmts_toxfer){
if ((nelmts_toxfer - xferred) >= nelmts_in_blk)
toxfer = nelmts_in_blk;
else
toxfer = nelmts_toxfer - xferred;
/* Skip offset over blocks of other processes */
file_offset = dset_offset +
(off_t)(elmts_begin + (nelmts_xfer+xferred)*pio_mpi_nprocs_g)*(off_t)ELMT_SIZE;
if (pio_debug_level >= 4) {
HDprint_rank(output);
HDfprintf(output,
"Debug(do_read):"
"nelmts_toxfer=%Hd, nelmts_xfer=%Hd"
", toxfer=%Hd, xferred=%Hd"
", file_offset=%Hd"
"\n",
(long_long)nelmts_toxfer, (long_long)nelmts_xfer,
(long_long)toxfer, (long_long)xferred,
(long_long)file_offset);
}
/* only care if seek returns error */
rc = POSIXSEEK(fd->posixfd, file_offset) < 0 ? -1 : 0;
VRFY((rc==0), "POSIXSEEK");
/* check if all bytes are transferred */
rc = ((ssize_t)(toxfer*ELMT_SIZE) ==
POSIXREAD(fd->posixfd, buf_p, toxfer*ELMT_SIZE));
VRFY((rc != 0), "POSIXREAD");
xferred += toxfer;
}
}
break;
case MPIO:
mpi_offset = dset_offset + (elmts_begin + nelmts_xfer)*(off_t)ELMT_SIZE;
mrc = MPI_File_read_at(fd->mpifd, mpi_offset, buffer,
(int)(nelmts_toxfer), ELMT_MPI_TYPE,
&mpi_status);
VRFY((mrc==MPI_SUCCESS), "MPIO_READ");
break;
case PHDF5:
/*set up the dset space id to select the segment to process */
{
h5mem_start[0] = elmts_begin + nelmts_xfer;
h5mem_stride[0] = h5mem_block[0] = nelmts_toxfer;
h5mem_count[0] = 1;
hrc = H5Sselect_hyperslab(h5dset_space_id, H5S_SELECT_SET,
h5mem_start, h5mem_stride, h5mem_count, h5mem_block);
VRFY((hrc >= 0), "H5Sset_hyperslab");
/*setup the memory space id too. Only start is different */
h5mem_start[0] = 0;
hrc = H5Sselect_hyperslab(h5mem_space_id, H5S_SELECT_SET,
h5mem_start, h5mem_stride, h5mem_count, h5mem_block);
VRFY((hrc >= 0), "H5Sset_hyperslab");
}
/* set read time here */
hrc = H5Dread(h5ds_id, ELMT_H5_TYPE, h5mem_space_id,
h5dset_space_id, H5P_DEFAULT, buffer);
VRFY((hrc >= 0), "H5Dread");
break;
} /* switch (parms->io_type) */
if (parms->verify) {
/*verify read data*/
int *intptr = (int *)buffer;
size_t i;
int nerror=0;
for (i = 0; i < nelmts_toxfer; ++i){
if (*intptr++ != pio_mpi_rank_g){
if (++nerror < 20){
/* report at most 20 errors */
HDprint_rank(output);
HDfprintf(output, "read data error, expected (%Hd), "
"got (%Hd)\n",
(long_long)pio_mpi_rank_g,
(long_long)*(intptr-1));
}
}
}
if (nerror >= 20) {
HDprint_rank(output);
HDfprintf(output, "...");
HDfprintf(output, "total read data errors=%Hd\n",
nerror);
}
} /* if (parms->verify) */
nelmts_xfer += nelmts_toxfer;
}
/* Stop "raw data" read timer */
set_time(res->timers, HDF5_RAW_READ_FIXED_DIMS, STOP);
/* Calculate read time */
/* Close dataset. Only HDF5 needs to do an explicit close. */
if (parms->io_type == PHDF5){
hrc = H5Dclose(h5ds_id);
if (hrc < 0) {
fprintf(stderr, "HDF5 Dataset Close failed\n");
GOTOERROR(FAIL);
}
h5ds_id = -1;
}
}
done:
/* release HDF5 objects */
if (h5dset_space_id != -1) {
hrc = H5Sclose(h5dset_space_id);
if (hrc < 0){
fprintf(stderr, "HDF5 Dataset Space Close failed\n");
ret_code = FAIL;
} else {
h5dset_space_id = -1;
}
}
if (h5mem_space_id != -1) {
hrc = H5Sclose(h5mem_space_id);
if (hrc < 0) {
fprintf(stderr, "HDF5 Memory Space Close failed\n");
ret_code = FAIL;
} else {
h5mem_space_id = -1;
}
}
return ret_code;
}
/*
* Function: do_fopen
* Purpose: Open the specified file.
* Return: SUCCESS or FAIL
* Programmer: Albert Cheng, Bill Wendling, 2001/12/13
* Modifications:
*/
static herr_t
do_fopen(parameters *param, char *fname, file_descr *fd /*out*/, int flags)
{
int ret_code = SUCCESS, mrc;
herr_t hrc;
hid_t acc_tpl = -1; /* file access templates */
switch (param->io_type) {
case POSIXIO:
if (flags & (PIO_CREATE | PIO_WRITE))
fd->posixfd = POSIXCREATE(fname);
else
fd->posixfd = POSIXOPEN(fname, O_RDONLY);
if (fd->posixfd < 0 ) {
fprintf(stderr, "POSIX File Open failed(%s)\n", fname);
GOTOERROR(FAIL);
}
/* The perils of POSIX I/O in a parallel environment. The problem is:
*
* - Process n opens a file with truncation and then starts
* writing to the file.
* - Process m also opens the file with truncation, but after
* process n has already started to write to the file. Thus,
* all of the stuff process n wrote is now lost.
*/
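/* The barrier below keeps any process from starting I/O until
* every process has finished its open-with-truncate. */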
MPI_Barrier(pio_comm_g);
break;
case MPIO:
if (flags & (PIO_CREATE | PIO_WRITE)) {
MPI_File_delete(fname, h5_io_info_g);
mrc = MPI_File_open(pio_comm_g, fname, MPI_MODE_CREATE | MPI_MODE_RDWR,
h5_io_info_g, &fd->mpifd);
if (mrc != MPI_SUCCESS) {
fprintf(stderr, "MPI File Open failed(%s)\n", fname);
GOTOERROR(FAIL);
}
/* Since MPI_File_open with MPI_MODE_CREATE does not truncate the */
/* file size, set the size to 0 explicitly. */
mrc = MPI_File_set_size(fd->mpifd, (MPI_Offset)0);
if (mrc != MPI_SUCCESS) {
fprintf(stderr, "MPI_File_set_size failed\n");
GOTOERROR(FAIL);
}
} else {
mrc = MPI_File_open(pio_comm_g, fname, MPI_MODE_RDONLY,
h5_io_info_g, &fd->mpifd);
if (mrc != MPI_SUCCESS) {
fprintf(stderr, "MPI File Open failed(%s)\n", fname);
GOTOERROR(FAIL);
}
}
break;
case PHDF5:
acc_tpl = H5Pcreate(H5P_FILE_ACCESS);
if (acc_tpl < 0) {
fprintf(stderr, "HDF5 Property List Create failed\n");
GOTOERROR(FAIL);
}
/* Set the file driver to the MPI-I/O driver */
hrc = H5Pset_fapl_mpio(acc_tpl, pio_comm_g, h5_io_info_g);
if (hrc < 0) {
fprintf(stderr, "HDF5 Property List Set failed\n");
GOTOERROR(FAIL);
}
/* Set the alignment of objects in HDF5 file */
hrc = H5Pset_alignment(acc_tpl, param->h5_thresh, param->h5_align);
if (hrc < 0) {
fprintf(stderr, "HDF5 Property List Set failed\n");
GOTOERROR(FAIL);
}
/* create the parallel file */
if (flags & (PIO_CREATE | PIO_WRITE)) {
fd->h5fd = H5Fcreate(fname, H5F_ACC_TRUNC, H5P_DEFAULT, acc_tpl);
} else {
fd->h5fd = H5Fopen(fname, H5F_ACC_RDONLY, acc_tpl);
}
hrc = H5Pclose(acc_tpl);
if (fd->h5fd < 0) {
fprintf(stderr, "HDF5 File Create failed(%s)\n", fname);
GOTOERROR(FAIL);
}
/* verifying the close of the acc_tpl */
if (hrc < 0) {
fprintf(stderr, "HDF5 Property List Close failed\n");
GOTOERROR(FAIL);
}
break;
}
done:
return ret_code;
}
/*
* Function: do_fclose
* Purpose: Close the specified file descriptor.
* Return: SUCCESS or FAIL
* Programmer: Albert Cheng, Bill Wendling, 2001/12/13
* Modifications:
*/
static herr_t
do_fclose(iotype iot, file_descr *fd /*out*/)
{
herr_t ret_code = SUCCESS, hrc;
int mrc = 0, rc = 0;
switch (iot) {
case POSIXIO:
rc = POSIXCLOSE(fd->posixfd);
if (rc != 0){
fprintf(stderr, "POSIX File Close failed\n");
GOTOERROR(FAIL);
}
fd->posixfd = -1;
break;
case MPIO:
mrc = MPI_File_close(&fd->mpifd);
if (mrc != MPI_SUCCESS){
fprintf(stderr, "MPI File close failed\n");
GOTOERROR(FAIL);
}
fd->mpifd = MPI_FILE_NULL;
break;
case PHDF5:
hrc = H5Fclose(fd->h5fd);
if (hrc < 0) {
fprintf(stderr, "HDF5 File Close failed\n");
GOTOERROR(FAIL);
}
fd->h5fd = -1;
break;
}
done:
return ret_code;
}
/*
* Function: do_cleanupfile
* Purpose: Clean up the temporary file unless HDF5_NOCLEANUP is set.
* Only Proc 0 of the PIO communicator will do the cleanup.
* Other processes just return.
* Return: void
* Programmer: Albert Cheng 2001/12/12
* Modifications:
*/
static void
do_cleanupfile(iotype iot, char *fname)
{
if (pio_mpi_rank_g != 0)
return;
if (clean_file_g == -1)
clean_file_g = (getenv("HDF5_NOCLEANUP")==NULL) ? 1 : 0;
if (clean_file_g){
switch (iot){
case POSIXIO:
remove(fname);
break;
case MPIO:
case PHDF5:
MPI_File_delete(fname, h5_io_info_g);
break;
}
}
}
#ifdef H5_HAVE_GPFS
/* Descriptions here come from the IBM GPFS Manual */
/*
* Function: access_range
* Purpose: Declares an access range within a file for an
* application.
*
* The application will access file offsets within the given
* range, and will not access offsets outside the range.
* Violating this hint may produce worse performance than if
* no hint was specified.
*
* This hint is useful in situations where a file is
* partitioned coarsely among several nodes. If the ranges
* do not overlap, each node can specify which range of the
* file it will access, with a performance improvement in
* some cases, such as for sequential writing within a
* range.
*
* Subsequent GPFS_ACCESS_RANGE hints will replace a hint
* passed earlier.
*
* START - The start of the access range offset, in
* bytes, from the beginning of the file.
* LENGTH - Length of the access range. 0 indicates to
* the end of the file.
* Return: Nothing
* Programmer: Bill Wendling, 03. June 2002
* Modifications:
*/
static void
access_range(int handle, off_t start, off_t length, int is_write)
{
struct {
gpfsFcntlHeader_t hdr;
gpfsAccessRange_t access;
} access_range;
access_range.hdr.totalLength = sizeof(access_range);
access_range.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
access_range.hdr.fcntlReserved = 0;
access_range.access.structLen = sizeof(gpfsAccessRange_t);
access_range.access.structType = GPFS_ACCESS_RANGE;
access_range.access.start = start;
access_range.access.length = length;
access_range.access.isWrite = is_write;
if (gpfs_fcntl(handle, &access_range) != 0) {
fprintf(stderr,
"gpfs_fcntl access range directive failed. errno=%d errorOffset=%d\n",
errno, access_range.hdr.errorOffset);
exit(EXIT_FAILURE);
}
}
/*
* Function: free_range
* Purpose: Undeclares an access range within a file for an
* application.
*
* The application will no longer access file offsets within
* the given range. GPFS flushes the data at the file
* offsets and removes it from the cache.
*
* Multi-node applications that have finished one phase of
* their computation may wish to use this hint before the
* file is accessed in a conflicting mode from another node
* in a later phase. The potential performance benefit is
* that GPFS can avoid later synchronous cache consistency
* operations.
*
* START - The start of the access range offset, in
* bytes from the beginning of the file.
* LENGTH - Length of the access range. 0 indicates to
* the end of the file.
* Return: Nothing
* Programmer: Bill Wendling, 03. June 2002
* Modifications:
*/
static void
free_range(int handle, off_t start, off_t length)
{
struct {
gpfsFcntlHeader_t hdr;
gpfsFreeRange_t range;
} free_range;
/* Issue the invalidate hint */
free_range.hdr.totalLength = sizeof(free_range);
free_range.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
free_range.hdr.fcntlReserved = 0;
free_range.range.structLen = sizeof(gpfsFreeRange_t);
free_range.range.structType = GPFS_FREE_RANGE;
free_range.range.start = start;
free_range.range.length = length;
if (gpfs_fcntl(handle, &free_range) != 0) {
fprintf(stderr,
"gpfs_fcntl free range failed for range %ld:%ld. errno=%d errorOffset=%d\n",
(long)start, (long)length, errno, free_range.hdr.errorOffset);
exit(EXIT_FAILURE);
}
}
/*
* Function: clear_file_cache
* Purpose: Indicates file access in the near future is not expected.
*
* The application does not expect to make any further
* accesses to the file in the near future, so GPFS removes
* any data or metadata pertaining to the file from its
* cache.
*
* Multi-node applications that have finished one phase of
* their computation may wish to use this hint before the
* file is accessed in a conflicting mode from another node
* in a later phase. The potential performance benefit is
* that GPFS can avoid later synchronous cache consistency
* operations.
* Return: Nothing
* Programmer: Bill Wendling, 03. June 2002
* Modifications:
*/
static void
clear_file_cache(int handle)
{
struct {
gpfsFcntlHeader_t hdr;
gpfsClearFileCache_t clear;
} clear_cache;
clear_cache.hdr.totalLength = sizeof(clear_cache);
clear_cache.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
clear_cache.hdr.fcntlReserved = 0;
clear_cache.clear.structLen = sizeof(gpfsClearFileCache_t);
clear_cache.clear.structType = GPFS_CLEAR_FILE_CACHE;
if (gpfs_fcntl(handle, &clear_cache) != 0) {
fprintf(stderr,
"gpfs_fcntl clear file cache directive failed. errno=%d errorOffset=%d\n",
errno, clear_cache.hdr.errorOffset);
exit(EXIT_FAILURE);
}
}
/*
* Function: cancel_hints
* Purpose: Indicates to remove any hints against the open file
* handle.
*
* GPFS removes any hints that may have been issued against
* this open file handle:
*
* - The hint status of the file is restored to what it
* would have been immediately after being opened, but
* does not affect the contents of the GPFS file
* cache. Cancelling an earlier hint that resulted in
* data being removed from the GPFS file cache does
* not bring that data back into the cache; data
* re-enters the cache only upon access by the
* application or by user-driven or automatic
* prefetching.
* - Only the GPFS_MULTIPLE_ACCESS_RANGE hint has a
* state that might be removed by the
* GPFS_CANCEL_HINTS directive.
* Return: Nothing
* Programmer: Bill Wendling, 03. June 2002
* Modifications:
*/
static void
cancel_hints(int handle)
{
struct {
gpfsFcntlHeader_t hdr;
gpfsCancelHints_t cancel;
} cancel_hints;
cancel_hints.hdr.totalLength = sizeof(cancel_hints);
cancel_hints.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
cancel_hints.hdr.fcntlReserved = 0;
cancel_hints.cancel.structLen = sizeof(gpfsCancelHints_t);
cancel_hints.cancel.structType = GPFS_CANCEL_HINTS;
if (gpfs_fcntl(handle, &cancel_hints) != 0) {
fprintf(stderr,
"gpfs_fcntl cancel hints directive failed. errno=%d errorOffset=%d\n",
errno, cancel_hints.hdr.errorOffset);
exit(EXIT_FAILURE);
}
}
/*
* Function: start_data_shipping
* Purpose: Start up data shipping. The second parameter is the total
* number of open instances on all nodes that will be
* operating on the file. Must be called for every such
* instance with the same value of NUM_INSTS.
* Return: Nothing
* Programmer: Bill Wendling, 28. May 2002
* Modifications:
*/
static void
start_data_shipping(int handle, int num_insts)
{
struct {
gpfsFcntlHeader_t hdr;
gpfsDataShipStart_t start;
} ds_start;
ds_start.hdr.totalLength = sizeof(ds_start);
ds_start.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
ds_start.hdr.fcntlReserved = 0;
ds_start.start.structLen = sizeof(gpfsDataShipStart_t);
ds_start.start.structType = GPFS_DATA_SHIP_START;
ds_start.start.numInstances = num_insts;
ds_start.start.reserved = 0;
if (gpfs_fcntl(handle, &ds_start) != 0) {
fprintf(stderr,
"gpfs_fcntl DS start directive failed. errno=%d errorOffset=%d\n",
errno, ds_start.hdr.errorOffset);
exit(EXIT_FAILURE);
}
}
/*
* Function: stop_data_shipping
* Purpose: Shut down data shipping. Must be called for every handle
* for which start_data_shipping was called.
* Return: Nothing
* Programmer: Bill Wendling, 28. May 2002
* Modifications:
*/
static void
stop_data_shipping(int handle)
{
struct {
gpfsFcntlHeader_t hdr;
gpfsDataShipStop_t stop;
} ds_stop;
ds_stop.hdr.totalLength = sizeof(ds_stop);
ds_stop.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
ds_stop.hdr.fcntlReserved = 0;
ds_stop.stop.structLen = sizeof(ds_stop.stop);
ds_stop.stop.structType = GPFS_DATA_SHIP_STOP;
if (gpfs_fcntl(handle, &ds_stop) != 0)
fprintf(stderr,
"gpfs_fcntl DS stop directive failed. errno=%d errorOffset=%d\n",
errno, ds_stop.hdr.errorOffset);
}
/*
* Function: invalidate_file_cache
* Purpose: Invalidate all cached data held on behalf of a file on
* this node.
* Return: Nothing
* Programmer: Bill Wendling, 03. June 2002
* Modifications:
*/
static void
invalidate_file_cache(const char *filename)
{
int handle;
struct {
gpfsFcntlHeader_t hdr;
gpfsClearFileCache_t inv;
} inv_cache_hint;
/* Open the file. If the open fails, the file cannot be cached. */
handle = open(filename, O_RDONLY, 0);
if (handle == -1)
return;
/* Issue the invalidate hint */
inv_cache_hint.hdr.totalLength = sizeof(inv_cache_hint);
inv_cache_hint.hdr.fcntlVersion = GPFS_FCNTL_CURRENT_VERSION;
inv_cache_hint.hdr.fcntlReserved = 0;
inv_cache_hint.inv.structLen = sizeof(gpfsClearFileCache_t);
inv_cache_hint.inv.structType = GPFS_CLEAR_FILE_CACHE;
if (gpfs_fcntl(handle, &inv_cache_hint) != 0) {
fprintf(stderr,
"gpfs_fcntl clear cache hint failed for file '%s'.",
filename);
fprintf(stderr, " errno=%d errorOffset=%d\n",
errno, inv_cache_hint.hdr.errorOffset);
exit(1);
}
/* Close the file */
if (close(handle) == -1) {
fprintf(stderr,
"could not close file '%s' after flushing file cache,",
filename);
fprintf(stderr, "errno=%d\n", errno);
exit(1);
}
}
#else
/* H5_HAVE_GPFS isn't defined...stub functions */
static void
access_range(int UNUSED handle, off_t UNUSED start, off_t UNUSED length, int UNUSED is_write)
{
return;
}
static void
free_range(int UNUSED handle, off_t UNUSED start, off_t UNUSED length)
{
return;
}
static void
clear_file_cache(int UNUSED handle)
{
return;
}
static void
cancel_hints(int UNUSED handle)
{
return;
}
static void
start_data_shipping(int UNUSED handle, int UNUSED num_insts)
{
return;
}
static void
stop_data_shipping(int UNUSED handle)
{
return;
}
static void
invalidate_file_cache(const char UNUSED *filename)
{
return;
}
#endif /* H5_HAVE_GPFS */
#ifdef TIME_MPI
/* instrument the MPI_File_write_xxx and MPI_File_read_xxx calls to
* measure the pure time spent in MPI_File code.
*/
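/* These wrappers use the standard MPI profiling interface: each one
* overrides the library's MPI_File_* entry point and forwards to the
* name-shifted PMPI_File_* routine, bracketed by the timer calls. */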
int MPI_File_read_at(MPI_File fh, MPI_Offset offset, void *buf,
int count, MPI_Datatype datatype, MPI_Status *status)
{
int err;
set_time(timer_g, HDF5_MPI_READ, START);
err=PMPI_File_read_at(fh, offset, buf, count, datatype, status);
set_time(timer_g, HDF5_MPI_READ, STOP);
return err;
}
int MPI_File_read_at_all(MPI_File fh, MPI_Offset offset, void *buf,
int count, MPI_Datatype datatype, MPI_Status *status)
{
int err;
set_time(timer_g, HDF5_MPI_READ, START);
err=PMPI_File_read_at_all(fh, offset, buf, count, datatype, status);
set_time(timer_g, HDF5_MPI_READ, STOP);
return err;
}
int MPI_File_write_at(MPI_File fh, MPI_Offset offset, void *buf,
int count, MPI_Datatype datatype, MPI_Status *status)
{
int err;
set_time(timer_g, HDF5_MPI_WRITE, START);
err=PMPI_File_write_at(fh, offset, buf, count, datatype, status);
set_time(timer_g, HDF5_MPI_WRITE, STOP);
return err;
}
int MPI_File_write_at_all(MPI_File fh, MPI_Offset offset, void *buf,
int count, MPI_Datatype datatype, MPI_Status *status)
{
int err;
set_time(timer_g, HDF5_MPI_WRITE, START);
err=PMPI_File_write_at_all(fh, offset, buf, count, datatype, status);
set_time(timer_g, HDF5_MPI_WRITE, STOP);
return err;
}
#endif /* TIME_MPI */
#endif /* H5_HAVE_PARALLEL */