Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use are documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superseded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
The inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
/**
|
|
|
|
* @file
|
|
|
|
*
|
|
|
|
* Infer as much as possible from the omode + path.
|
|
|
|
* Possibly rewrite the path.
|
|
|
|
*
|
|
|
|
* Copyright 2018 University Corporation for Atmospheric
|
|
|
|
* Research/Unidata. See COPYRIGHT file for more info.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include "config.h"
|
|
|
|
#include <stdlib.h>
|
|
|
|
#ifdef HAVE_UNISTD_H
|
|
|
|
#include <unistd.h>
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include "ncdispatch.h"
|
|
|
|
#include "ncwinpath.h"
|
|
|
|
#include "netcdf_mem.h"
|
|
|
|
#include "fbits.h"
|
|
|
|
#include "ncbytes.h"
|
|
|
|
#include "nclist.h"
|
|
|
|
#include "nclog.h"
|
|
|
|
#ifdef ENABLE_HTTP
|
|
|
|
#include "nchttp.h"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#undef DEBUG
|
|
|
|
|
|
|
|
/**
|
|
|
|
Sort info for open/read/close of
|
|
|
|
file when searching for magic numbers
|
|
|
|
*/
|
|
|
|
struct MagicFile {
|
|
|
|
const char* path;
|
|
|
|
struct NCURI* uri;
|
|
|
|
NCmodel* model;
|
|
|
|
fileoffset_t filelen;
|
|
|
|
int use_parallel;
|
|
|
|
void* parameters; /* !NULL if inmemory && !diskless */
|
|
|
|
FILE* fp;
|
|
|
|
#ifdef USE_PARALLEL
|
|
|
|
MPI_File fh;
|
|
|
|
#endif
|
|
|
|
#ifdef ENABLE_HTTP
|
|
|
|
void* curl; /* avoid need to include curl.h */
|
|
|
|
char* curlurl; /* url to use with CURLOPT_SET_URL */
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
/** @internal Magic number for HDF5 files. To be consistent with
|
|
|
|
* H5Fis_hdf5, use the complete HDF5 magic number */
|
|
|
|
static char HDF5_SIGNATURE[MAGIC_NUMBER_LEN] = "\211HDF\r\n\032\n";
|
|
|
|
|
|
|
|
#ifdef DEBUG

/* Flush both standard output streams */
static void dbgflush(void)
{
    fflush(stdout);
    fflush(stderr);
}

/* Does nothing with err; presumably a convenient place for a debugger breakpoint */
static void
fail(int err)
{
    return;
}

/* Pass err through unchanged, routing every non-NC_NOERR value via fail() */
static int
check(int err)
{
    if(err == NC_NOERR)
        return err;
    fail(err);
    return err;
}
#else
/* Outside of DEBUG builds, check() is the identity */
#define check(err) (err)
#endif
|
|
|
|
|
|
|
|
/* True (1) once the format, iosp and impl fields of the model are all non-zero */
#define modelcomplete(model) \
    ((model)->format && (model)->iosp && (model)->impl)
|
|
|
|
|
|
|
|
/* Identifies which model field a conflict report refers to */
enum mfield {
    MF,     /* reported as "format" */
    MI,     /* reported as "impl" */
    MIO,    /* reported as "iosp" */
    MV      /* reported as "version" */
};
|
|
|
|
|
|
|
|
/*
 * Guarded assignment of a model field.
 * The store happens when dst is still zero or already equals src.
 * Otherwise the conflict is reported through conflictfail(), its
 * result is placed in the enclosing function's `stat`, and control
 * jumps to that function's `done` label.
 */
#define conflictset(f,dst,src) \
    do { \
        if((dst) == 0 || (src) == (dst)) { \
            (dst) = (src); \
        } else { \
            stat = conflictfail(f,(dst),(src)); \
            goto done; \
        } \
    } while(0)
|
|
|
|
|
|
|
|
/*
|
|
|
|
Define a table of iosp string values for "mode=".
|
|
|
|
Includes cases where the impl or format implies the
|
|
|
|
iosp. Does not includes cases where NC_IOSP_FILE is
|
|
|
|
the inferred iosp.
|
|
|
|
*/
|
|
|
|
static struct IOSPS {
|
|
|
|
const char* tag;
|
|
|
|
const int iosp; /* NC_IOSP_XXX value */
|
|
|
|
} iosps[] = {
|
|
|
|
{"dap2",NC_IOSP_DAP2},
|
|
|
|
{"dap4",NC_IOSP_DAP4},
|
|
|
|
{"bytes",NC_IOSP_HTTP},
|
|
|
|
{NULL,0}
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
Define a table of "mode=" string values.
|
|
|
|
Note that only cases that can currently
|
|
|
|
take URLs are included.
|
|
|
|
*/
|
|
|
|
static struct FORMATMODES {
|
|
|
|
const char* tag;
|
|
|
|
const int format; /* NC_FORMAT_XXX value */
|
|
|
|
const int impl; /* NC_FORMATX_XXX value */
|
|
|
|
} formatmodes[] = {
|
|
|
|
{"dap2",NC_FORMAT_CLASSIC,NC_FORMATX_DAP2},
|
|
|
|
{"dap4",NC_FORMAT_NETCDF4,NC_FORMATX_DAP4},
|
|
|
|
{"netcdf-3",NC_FORMAT_CLASSIC,NC_FORMATX_NC3},
|
|
|
|
{"classic",NC_FORMAT_CLASSIC,NC_FORMATX_NC3},
|
|
|
|
{"netcdf-4",NC_FORMAT_NETCDF4,NC_FORMATX_NC4},
|
|
|
|
{"enhanced",NC_FORMAT_NETCDF4,NC_FORMATX_NC4},
|
|
|
|
{"64bitoffset",NC_FORMAT_64BIT_OFFSET,0},
|
|
|
|
{"64bitdata",NC_FORMAT_64BIT_DATA,0},
|
|
|
|
{"cdf5",NC_FORMAT_64BIT_DATA,0}, /*alias*/
|
|
|
|
#if 0
|
|
|
|
{"hdf4",NC_FORMAT_HDF4,NC_FORMATX_NC4},
|
|
|
|
#endif
|
|
|
|
{NULL,0,0},
|
|
|
|
};
|
|
|
|
|
|
|
|
/* The legal singleton mode tags; the list is NULL terminated */
static const char* modesingles[] = {
    "dap2",
    "dap4",
    "bytes",
    "zarr",
    NULL,
};
|
|
|
|
|
|
|
|
/* Map IOSP to readability to get magic number */
|
|
|
|
static struct IospRead {
|
|
|
|
int iosp;
|
|
|
|
int readable;
|
|
|
|
} readable[] = {
|
|
|
|
{NC_IOSP_FILE,1},
|
|
|
|
{NC_IOSP_MEMORY,1},
|
|
|
|
{NC_IOSP_UDF,0},
|
|
|
|
{NC_IOSP_HTTP,1},
|
|
|
|
{0,0},
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * The known URL protocols and how each is to be interpreted.
 * Entries with a NULL substitute and mode carry no rewrite information.
 */
static struct NCPROTOCOLLIST {
    const char* protocol;
    const char* substitute;
    const char* mode;
} ncprotolist[] = {
    {"http",  NULL,   NULL},
    {"https", NULL,   NULL},
    {"file",  NULL,   NULL},
    {"dods",  "http", "dap2"},
    {"dap4",  "http", "dap4"},
    {NULL,NULL,NULL} /* Terminate search */
};
|
|
|
|
|
|
|
|
/* Forward */
|
|
|
|
static int NC_omodeinfer(int omode, NCmodel*);
|
|
|
|
static int NC_implinfer(int useparallel, NCmodel* model);
|
|
|
|
static int NC_dapinfer(NClist*, NCmodel* model);
|
|
|
|
static int check_file_type(const char *path, int flags, int use_parallel, void *parameters, NCmodel* model, NCURI* uri);
|
|
|
|
static int processuri(const char* path, NCURI** urip, char** newpathp, NClist* modeargs);
|
|
|
|
static int extractiosp(NClist* modeargs, int mode, NCmodel* model);
|
|
|
|
|
|
|
|
static int openmagic(struct MagicFile* file);
|
|
|
|
static int readmagic(struct MagicFile* file, long pos, char* magic);
|
|
|
|
static int closemagic(struct MagicFile* file);
|
|
|
|
static int NC_interpret_magic_number(char* magic, NCmodel* model);
|
|
|
|
#ifdef DEBUG
|
|
|
|
static void printmagic(const char* tag, char* magic,struct MagicFile*);
|
|
|
|
#endif
|
|
|
|
static int isreadable(int iosp);
|
|
|
|
|
|
|
|
/* Report a conflicting model field assignment;
|
|
|
|
see the conflictset macro above */
|
|
|
|
static int
|
|
|
|
conflictfail(enum mfield f, int dst, int src)
|
|
|
|
{
|
|
|
|
const char* sf = NULL;
|
|
|
|
switch (f) {
|
|
|
|
case MF: sf = "format"; break;
|
|
|
|
case MI: sf = "impl"; break;
|
|
|
|
case MIO: sf = "iosp"; break;
|
|
|
|
case MV: sf = "version"; break;
|
|
|
|
default: sf = "?"; break;
|
|
|
|
}
|
|
|
|
nclog(NCLOGERR,"Model inference conflict: field=%s dst=%d src=%d", sf,dst,src);
|
|
|
|
return NC_EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Parse a mode string at the commas and convert to envv form */
|
|
|
|
static int
|
|
|
|
parseurlmode(const char* modestr, NClist* list)
|
|
|
|
{
|
|
|
|
int stat = NC_NOERR;
|
|
|
|
const char* p = NULL;
|
|
|
|
const char* endp = NULL;
|
|
|
|
|
|
|
|
if(modestr == NULL || *modestr == '\0') goto done;
|
|
|
|
|
|
|
|
/* Split modestr at the commas or EOL */
|
|
|
|
p = modestr;
|
|
|
|
for(;;) {
|
|
|
|
char* s;
|
|
|
|
ptrdiff_t slen;
|
|
|
|
endp = strchr(p,',');
|
|
|
|
if(endp == NULL) endp = p + strlen(p);
|
|
|
|
slen = (endp - p);
|
|
|
|
if((s = malloc(slen+1)) == NULL) {stat = NC_ENOMEM; goto done;}
|
|
|
|
memcpy(s,p,slen);
|
|
|
|
s[slen] = '\0';
|
|
|
|
nclistpush(list,s);
|
|
|
|
if(*endp == '\0') break;
|
|
|
|
p = endp+1;
|
2019-01-29 05:05:19 +08:00
|
|
|
}
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
|
|
|
|
done:
|
|
|
|
return check(stat);
|
|
|
|
}
|
|
|
|
|
2019-01-29 05:05:19 +08:00
|
|
|
/* Given a mode= argument, and the mode flags,
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
infer the iosp part of the model */
|
|
|
|
static int
|
|
|
|
extractiosp(NClist* modeargs, int cmode, NCmodel* model)
|
|
|
|
{
|
|
|
|
int stat = NC_NOERR;
|
|
|
|
struct IOSPS* io = iosps;
|
|
|
|
|
|
|
|
assert(model->iosp == 0);
|
|
|
|
for(;io->tag;io++) {
|
|
|
|
int i;
|
|
|
|
for(i=0;i<nclistlength(modeargs);i++) {
|
|
|
|
const char* p = nclistget(modeargs,i);
|
|
|
|
if(strcmp(p,io->tag)==0) {
|
|
|
|
conflictset(MIO,model->iosp,io->iosp);
|
|
|
|
goto done;
|
2019-01-29 05:05:19 +08:00
|
|
|
}
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
done:
|
|
|
|
if(model->iosp == 0)
|
|
|
|
model->iosp = (fIsSet(cmode,NC_INMEMORY) ? NC_IOSP_MEMORY:NC_IOSP_FILE);
|
|
|
|
return stat;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Given a mode= argument, fill in the matching part of the model; except IOSP */
|
|
|
|
static int
|
|
|
|
processmodearg(const char* arg, NCmodel* model)
|
|
|
|
{
|
|
|
|
int stat = NC_NOERR;
|
|
|
|
struct FORMATMODES* format = formatmodes;
|
|
|
|
for(;format->tag;format++) {
|
|
|
|
if(strcmp(format->tag,arg)==0) {
|
|
|
|
conflictset(MF,model->format,format->format);
|
|
|
|
conflictset(MI,model->impl,format->impl);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
done:
|
|
|
|
return check(stat);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Search singleton list */
|
|
|
|
static int
|
|
|
|
issingleton(const char* tag)
|
|
|
|
{
|
|
|
|
const char** p;
|
|
|
|
for(p=modesingles;*p;p++) {
|
|
|
|
if(strcmp(*p,tag)==0) return 1;
|
|
|
|
}
|
2019-01-29 05:05:19 +08:00
|
|
|
return 0;
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* If we have a url, see if we can determine DAP */
|
|
|
|
static int
|
|
|
|
NC_dapinfer(NClist* modeargs, NCmodel* model)
|
|
|
|
{
|
|
|
|
int stat = NC_NOERR;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* 1. search modeargs for indicators */
|
|
|
|
for(i=0;i<nclistlength(modeargs);i++) {
|
|
|
|
const char* arg = nclistget(modeargs,i);
|
|
|
|
if(strcasecmp(arg,"bytes")==0
|
2019-01-29 05:05:19 +08:00
|
|
|
|| strcasecmp(arg,"zarr")==0) {
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
/* Ok, we know this is not DAP, so give up */
|
|
|
|
return stat;
|
|
|
|
}
|
2019-01-29 05:05:19 +08:00
|
|
|
if(strcasecmp(arg,"dap2")==0) {
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
model->format = NC_FORMAT_NC3;
|
|
|
|
model->iosp = NC_IOSP_DAP2;
|
|
|
|
model->impl = NC_FORMATX_DAP2;
|
2019-01-29 05:05:19 +08:00
|
|
|
} else if(strcasecmp(arg,"dap4")==0) {
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
model->format = NC_FORMAT_NETCDF4;
|
|
|
|
model->iosp = NC_IOSP_DAP4;
|
|
|
|
model->impl = NC_FORMATX_DAP4;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Ok, we have a URL, but no tags to tell us what it is, so assume DAP2 */
|
|
|
|
if(model->impl == 0) {
|
|
|
|
model->format = NC_FORMAT_NC3;
|
|
|
|
model->iosp = NC_IOSP_DAP2;
|
|
|
|
model->impl = NC_FORMATX_DAP2;
|
|
|
|
}
|
|
|
|
return stat;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
Infer from the mode
|
|
|
|
only call if iscreate or file is not easily readable.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
NC_omodeinfer(int cmode, NCmodel* model)
|
|
|
|
{
|
|
|
|
int stat = NC_NOERR;
|
|
|
|
|
|
|
|
/* If no format flags are set, then use default */
|
|
|
|
if(!fIsSet(cmode,NC_FORMAT_ALL))
|
|
|
|
conflictset(MF,model->format,nc_get_default_format());
|
|
|
|
|
|
|
|
/* Process the cmode; may override some already set flags */
|
|
|
|
if(fIsSet(cmode,NC_64BIT_OFFSET)) {
|
|
|
|
conflictset(MF,model->format,NC_FORMAT_64BIT_OFFSET);
|
|
|
|
}
|
|
|
|
if(fIsSet(cmode,NC_64BIT_DATA)) {
|
|
|
|
conflictset(MF,model->format,NC_FORMAT_64BIT_DATA);
|
|
|
|
}
|
|
|
|
if(fIsSet(cmode,NC_NETCDF4)) {
|
|
|
|
conflictset(MF,model->format,NC_FORMAT_NETCDF4);
|
|
|
|
}
|
|
|
|
if(fIsSet(cmode,(NC_UDF0|NC_UDF1))) {
|
|
|
|
conflictset(MF,model->format,NC_FORMAT_NETCDF4);
|
|
|
|
/* For user formats, we must back out some previous decisions */
|
|
|
|
model->iosp = NC_IOSP_UDF; /* Do not know anything about this */
|
|
|
|
if(fIsSet(cmode,NC_UDF0)) {
|
|
|
|
conflictset(MI,model->impl,NC_FORMATX_UDF0);
|
|
|
|
} else {
|
|
|
|
conflictset(MI,model->impl,NC_FORMATX_UDF1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* Ignore following flags for now */
|
2019-01-03 05:37:23 +08:00
|
|
|
#if 0 /* keep lgtm happy */
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
if(fIsSet(cmode,NC_CLASSIC_MODEL)) {}
|
|
|
|
if(fIsSet(cmode,NC_DISKLESS)) {}
|
2019-01-02 10:34:12 +08:00
|
|
|
#endif
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
|
|
|
|
done:
|
|
|
|
return check(stat);
|
2019-01-29 05:05:19 +08:00
|
|
|
}
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use are documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superseded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
The inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
|
|
|
|
/* Infer the implementation/dispatcher from format*/
|
|
|
|
static int
|
|
|
|
NC_implinfer(int useparallel, NCmodel* model)
|
|
|
|
{
|
|
|
|
int stat = NC_NOERR;
|
|
|
|
|
|
|
|
/* If we do not have a format, then use default format */
|
|
|
|
if(model->format == 0)
|
|
|
|
conflictset(MF,model->format,nc_get_default_format());
|
|
|
|
|
|
|
|
/* Try to infer impl based on format; may modify mode flags */
|
|
|
|
if(model->impl == 0) {
|
|
|
|
switch (model->format) {
|
|
|
|
case NC_FORMAT_NETCDF4:
|
|
|
|
conflictset(MI,model->impl,NC_FORMATX_NC4);
|
|
|
|
break;
|
|
|
|
case NC_FORMAT_NETCDF4_CLASSIC:
|
|
|
|
conflictset(MI,model->impl,NC_FORMATX_NC4);
|
|
|
|
break;
|
|
|
|
case NC_FORMAT_CDF5:
|
|
|
|
conflictset(MI,model->impl,NC_FORMATX_NC3);
|
|
|
|
break;
|
|
|
|
case NC_FORMAT_64BIT_OFFSET:
|
|
|
|
conflictset(MI,model->impl,NC_FORMATX_NC3);
|
|
|
|
break;
|
|
|
|
case NC_FORMAT_CLASSIC:
|
|
|
|
conflictset(MI,model->impl,NC_FORMATX_NC3);
|
|
|
|
break;
|
|
|
|
default: break;
|
|
|
|
}
|
|
|
|
/* default dispatcher if above did not infer an implementation */
|
|
|
|
if (model->impl == 0)
|
|
|
|
conflictset(MI,model->impl,NC_FORMATX_NC3); /* Final choice */
|
|
|
|
/* Check for using PNETCDF */
|
|
|
|
if (model->impl== NC_FORMATX_NC3
|
|
|
|
&& useparallel
|
|
|
|
&& model->iosp == NC_IOSP_FILE)
|
|
|
|
model->impl = NC_FORMATX_PNETCDF; /* Use this instead */
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(model->impl != 0);
|
|
|
|
done:
|
|
|
|
return check(stat);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
processuri(const char* path, NCURI** urip, char** newpathp, NClist* modeargs)
|
|
|
|
{
|
|
|
|
int stat = NC_NOERR;
|
|
|
|
int found = 0;
|
|
|
|
const char** fragp = NULL;
|
|
|
|
struct NCPROTOCOLLIST* protolist;
|
|
|
|
NCURI* uri = NULL;
|
|
|
|
size_t pathlen = strlen(path);
|
|
|
|
|
|
|
|
if(path == NULL || pathlen == 0) {stat = NC_EURL; goto done;}
|
|
|
|
|
|
|
|
/* Defaults */
|
|
|
|
if(newpathp) *newpathp = NULL;
|
|
|
|
if(urip) *urip = NULL;
|
|
|
|
|
|
|
|
if(ncuriparse(path,&uri) != NCU_OK) goto done; /* not url */
|
|
|
|
|
|
|
|
/* Look up the protocol */
|
|
|
|
for(found=0,protolist=ncprotolist;protolist->protocol;protolist++) {
|
|
|
|
if(strcmp(uri->protocol,protolist->protocol) == 0) {
|
|
|
|
found = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if(!found)
|
|
|
|
{stat = NC_EINVAL; goto done;} /* unrecognized URL form */
|
|
|
|
|
|
|
|
/* process the corresponding mode arg */
|
|
|
|
if(protolist->mode != NULL)
|
|
|
|
nclistpush(modeargs,strdup(protolist->mode));
|
|
|
|
|
|
|
|
/* Substitute the protocol in any case */
|
|
|
|
if(protolist->substitute) ncurisetprotocol(uri,protolist->substitute);
|
|
|
|
|
|
|
|
/* Iterate over the url fragment parameters */
|
|
|
|
for(fragp=ncurifragmentparams(uri);fragp && *fragp;fragp+=2) {
|
|
|
|
const char* name = fragp[0];
|
|
|
|
const char* value = fragp[1];
|
|
|
|
if(strcmp(name,"protocol")==0) {
|
|
|
|
nclistpush(modeargs,strdup(value));
|
|
|
|
} else
|
|
|
|
if(strcmp(name,"mode")==0) {
|
2019-01-29 05:05:19 +08:00
|
|
|
if((stat = parseurlmode(value,modeargs))) goto done;
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
} else
|
|
|
|
if(issingleton(name) && (value == NULL || strlen(value)==0)) {
|
|
|
|
nclistpush(modeargs,strdup(name));
|
|
|
|
} /*else ignore*/
|
|
|
|
}
|
|
|
|
|
|
|
|
/* At this point modeargs should contain all mode args from the URL */
|
|
|
|
|
|
|
|
/* Rebuild the path (including fragment)*/
|
|
|
|
if(newpathp)
|
|
|
|
*newpathp = ncuribuild(uri,NULL,NULL,NCURIALL);
|
|
|
|
if(urip) {
|
|
|
|
*urip = uri;
|
|
|
|
uri = NULL;
|
|
|
|
}
|
|
|
|
done:
|
|
|
|
if(uri != NULL) ncurifree(uri);
|
|
|
|
return check(stat);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**************************************************/
|
|
|
|
/*
|
|
|
|
Infer model for this dataset using some
|
|
|
|
combination of cmode, path, and reading the dataset.
|
|
|
|
|
|
|
|
The precedence order is:
|
|
|
|
1. file contents -- highest precedence
|
|
|
|
2. path
|
|
|
|
2. isurl -- check for DAP
|
|
|
|
3. mode
|
|
|
|
4. default format -- lowest precedence
|
|
|
|
|
|
|
|
@param path
|
|
|
|
@param omode
|
|
|
|
@param iscreate
|
|
|
|
@param useparallel
|
|
|
|
@param params
|
|
|
|
@param model
|
|
|
|
@param newpathp
|
|
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
int
|
|
|
|
NC_infermodel(const char* path, int* omodep, int iscreate, int useparallel, void* params, NCmodel* model, char** newpathp)
|
|
|
|
{
|
|
|
|
int stat = NC_NOERR;
|
|
|
|
char* newpath = NULL;
|
|
|
|
NCURI* uri = NULL;
|
|
|
|
int omode = *omodep;
|
|
|
|
int isuri = 0;
|
|
|
|
NClist* modeargs = nclistnew();
|
|
|
|
|
|
|
|
if((stat = processuri(path, &uri, &newpath, modeargs))) goto done;
|
|
|
|
isuri = (uri != NULL);
|
|
|
|
|
|
|
|
/* Phase 1: compute the IOSP */
|
|
|
|
if((stat = extractiosp(modeargs,omode,model))) goto done;
|
|
|
|
assert(model->iosp != 0);
|
|
|
|
|
|
|
|
/* Phase 2: Process the non-iosp mode arguments */
|
2019-02-02 05:31:44 +08:00
|
|
|
if(!modelcomplete(model) && isuri) {
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
int i;
|
|
|
|
for(i=0;i<nclistlength(modeargs);i++) {
|
|
|
|
const char* arg = nclistget(modeargs,i);
|
|
|
|
if((stat=processmodearg(arg,model))) goto done;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Phase 3: See if we can infer DAP */
|
2019-02-02 05:31:44 +08:00
|
|
|
if(!modelcomplete(model) && isuri) {
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
if((stat = NC_dapinfer(modeargs,model))) goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Phase 4: mode inference */
|
|
|
|
if(!modelcomplete(model)) {
|
|
|
|
if((stat = NC_omodeinfer(omode,model))) goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Phase 5: Infer from file content, if possible;
|
|
|
|
this has highest precedence, so it may override
|
|
|
|
previous decisions.
|
|
|
|
*/
|
|
|
|
if(!iscreate && isreadable(model->iosp)) {
|
|
|
|
/* Ok, we need to try to read the file */
|
|
|
|
if((stat = check_file_type(path, omode, useparallel, params, model, uri))) goto done;
|
2019-01-29 05:05:19 +08:00
|
|
|
}
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
|
|
|
|
/* Phase 6: Infer impl from format */
|
|
|
|
if(!modelcomplete(model)) {
|
|
|
|
if((stat = NC_implinfer(useparallel, model))) goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(modelcomplete(model));
|
|
|
|
|
|
|
|
/* Force flag consistency */
|
|
|
|
switch (model->impl) {
|
|
|
|
case NC_FORMATX_NC4:
|
|
|
|
case NC_FORMATX_NC_HDF4:
|
|
|
|
case NC_FORMATX_DAP4:
|
|
|
|
case NC_FORMATX_UDF0:
|
|
|
|
case NC_FORMATX_UDF1:
|
|
|
|
omode |= NC_NETCDF4;
|
|
|
|
if(model->format == NC_FORMAT_NETCDF4_CLASSIC)
|
|
|
|
omode |= NC_CLASSIC_MODEL;
|
|
|
|
break;
|
|
|
|
case NC_FORMATX_DAP2:
|
|
|
|
omode &= ~(NC_NETCDF4|NC_64BIT_OFFSET|NC_64BIT_DATA);
|
|
|
|
break;
|
|
|
|
case NC_FORMATX_NC3:
|
|
|
|
omode &= ~NC_NETCDF4; /* must be netcdf-3 (CDF-1, CDF-2, CDF-5) */
|
|
|
|
if(model->format == NC_FORMAT_64BIT_OFFSET) omode |= NC_64BIT_OFFSET;
|
|
|
|
else if(model->format == NC_FORMAT_64BIT_DATA) omode |= NC_64BIT_DATA;
|
|
|
|
break;
|
2019-02-02 05:31:44 +08:00
|
|
|
case NC_FORMATX_PNETCDF:
|
|
|
|
omode &= ~NC_NETCDF4; /* must be netcdf-3 (CDF-1, CDF-2, CDF-5) */
|
|
|
|
if(model->format == NC_FORMAT_64BIT_OFFSET) omode |= NC_64BIT_OFFSET;
|
|
|
|
else if(model->format == NC_FORMAT_64BIT_DATA) omode |= NC_64BIT_DATA;
|
|
|
|
break;
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
default:
|
|
|
|
{stat = NC_ENOTNC; goto done;}
|
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
if(uri) ncurifree(uri);
|
2019-02-02 05:31:44 +08:00
|
|
|
nclistfreeall(modeargs);
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
if(stat == NC_NOERR && newpathp) {*newpathp = newpath; newpath = NULL;}
|
|
|
|
nullfree(newpath);
|
|
|
|
*omodep = omode; /* in/out */
|
|
|
|
return check(stat);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
isreadable(int iosp)
|
|
|
|
{
|
|
|
|
struct IospRead* r;
|
|
|
|
/* Look up the protocol */
|
|
|
|
for(r=readable;r->iosp;r++) {
|
|
|
|
if(iosp == r->iosp) return r->readable;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**************************************************/
|
|
|
|
#if 0
/* Disabled code: return 1 if path parses as a url whose protocol is
   listed in ncprotolist; 0 otherwise */
int
NC_testurl(const char* path)
{
    int isurl = 0;
    NCURI* tmpurl = NULL;
    struct NCPROTOCOLLIST* protolist;

    if(path == NULL) return 0;

    /* Ok, try to parse as a url */
    if(ncuriparse(path,&tmpurl) != NCU_OK)
        return 0;

    /* Do some extra testing to make sure this really is a url:
       look for a known/accepted protocol */
    for(protolist=ncprotolist;protolist->protocol;protolist++) {
        if(strcmp(tmpurl->protocol,protolist->protocol) == 0) {
            isurl = 1;
            break;
        }
    }
    ncurifree(tmpurl);
    return isurl;
}
#endif
|
|
|
|
|
|
|
|
/**************************************************/
|
|
|
|
/**
|
|
|
|
* Provide a hidden interface to allow utilities
|
|
|
|
* to check if a given path name is really a url.
|
|
|
|
* If not, put null in basenamep, else put basename of the url
|
|
|
|
* minus any extension into basenamep; caller frees.
|
|
|
|
* Return 1 if it looks like a url, 0 otherwise.
|
|
|
|
*/
|
|
|
|
|
|
|
|
int
|
|
|
|
nc__testurl(const char* path, char** basenamep)
|
|
|
|
{
|
|
|
|
NCURI* uri;
|
|
|
|
int ok = 0;
|
|
|
|
if(ncuriparse(path,&uri) == NCU_OK) {
|
|
|
|
char* slash = (uri->path == NULL ? NULL : strrchr(uri->path, '/'));
|
|
|
|
char* dot;
|
|
|
|
if(slash == NULL) slash = (char*)path; else slash++;
|
|
|
|
slash = nulldup(slash);
|
|
|
|
if(slash == NULL)
|
|
|
|
dot = NULL;
|
|
|
|
else
|
|
|
|
dot = strrchr(slash, '.');
|
|
|
|
if(dot != NULL && dot != slash) *dot = '\0';
|
|
|
|
if(basenamep)
|
|
|
|
*basenamep=slash;
|
|
|
|
else if(slash)
|
|
|
|
free(slash);
|
|
|
|
ncurifree(uri);
|
|
|
|
ok = 1;
|
|
|
|
}
|
|
|
|
return ok;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**************************************************/
|
|
|
|
/**
|
|
|
|
* @internal Given an existing file, figure out its format and return
|
|
|
|
* that format value (NC_FORMATX_XXX) in model arg. Assume any path
|
|
|
|
* conversion was already performed at a higher level.
|
|
|
|
*
|
|
|
|
* @param path File name.
|
|
|
|
* @param flags
|
|
|
|
* @param use_parallel
|
|
|
|
* @param parameters
|
|
|
|
* @param model Pointer that gets the model to use for the dispatch table.
|
|
|
|
* @param version Pointer that gets version of the file.
|
|
|
|
*
|
|
|
|
* @return ::NC_NOERR No error.
|
|
|
|
* @author Dennis Heimbigner
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
check_file_type(const char *path, int flags, int use_parallel,
|
|
|
|
void *parameters, NCmodel* model, NCURI* uri)
|
|
|
|
{
|
|
|
|
char magic[NC_MAX_MAGIC_NUMBER_LEN];
|
|
|
|
int status = NC_NOERR;
|
|
|
|
struct MagicFile magicinfo;
|
|
|
|
|
|
|
|
memset((void*)&magicinfo,0,sizeof(magicinfo));
|
|
|
|
magicinfo.path = path; /* do not free */
|
|
|
|
magicinfo.uri = uri; /* do not free */
|
|
|
|
magicinfo.model = model; /* do not free */
|
|
|
|
magicinfo.parameters = parameters; /* do not free */
|
|
|
|
magicinfo.use_parallel = use_parallel;
|
|
|
|
|
|
|
|
if((status = openmagic(&magicinfo))) goto done;
|
|
|
|
|
|
|
|
/* Verify we have a large enough file */
|
|
|
|
if(magicinfo.filelen < MAGIC_NUMBER_LEN)
|
|
|
|
{status = NC_ENOTNC; goto done;}
|
|
|
|
if((status = readmagic(&magicinfo,0L,magic)) != NC_NOERR) {
|
|
|
|
status = NC_ENOTNC;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Look at the magic number */
|
|
|
|
if(NC_interpret_magic_number(magic,model) == NC_NOERR
|
|
|
|
&& model->format != 0) {
|
|
|
|
if (model->format == NC_FORMAT_NC3 && use_parallel)
|
|
|
|
/* this is called from nc_open_par() and file is classic */
|
|
|
|
model->impl = NC_FORMATX_PNETCDF;
|
|
|
|
goto done; /* found something */
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Remaining case when implementation is an HDF5 file;
|
|
|
|
search forward at starting at 512
|
|
|
|
and doubling to see if we have HDF5 magic number */
|
|
|
|
{
|
|
|
|
long pos = 512L;
|
|
|
|
for(;;) {
|
|
|
|
if((pos+MAGIC_NUMBER_LEN) > magicinfo.filelen)
|
|
|
|
{status = NC_ENOTNC; goto done;}
|
|
|
|
if((status = readmagic(&magicinfo,pos,magic)) != NC_NOERR)
|
|
|
|
{status = NC_ENOTNC; goto done; }
|
|
|
|
NC_interpret_magic_number(magic,model);
|
|
|
|
if(model->impl == NC_FORMATX_NC4) break;
|
|
|
|
/* double and try again */
|
|
|
|
pos = 2*pos;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
done:
|
|
|
|
closemagic(&magicinfo);
|
|
|
|
return check(status);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
\internal
|
|
|
|
\ingroup datasets
|
|
|
|
Provide open, read and close for use when searching for magic numbers
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
openmagic(struct MagicFile* file)
|
|
|
|
{
|
|
|
|
int status = NC_NOERR;
|
|
|
|
|
|
|
|
switch (file->model->iosp) {
|
|
|
|
case NC_IOSP_MEMORY: {
|
|
|
|
/* Get its length */
|
|
|
|
NC_memio* meminfo = (NC_memio*)file->parameters;
|
|
|
|
assert(meminfo != NULL);
|
|
|
|
file->filelen = (long long)meminfo->size;
|
|
|
|
} break;
|
|
|
|
case NC_IOSP_FILE: {
|
|
|
|
#ifdef USE_PARALLEL
|
|
|
|
if (file->use_parallel) {
|
|
|
|
int retval;
|
|
|
|
MPI_Offset size;
|
|
|
|
assert(file->parameters != NULL);
|
|
|
|
if((retval = MPI_File_open(((NC_MPI_INFO*)file->parameters)->comm,
|
|
|
|
(char*)file->path,MPI_MODE_RDONLY,
|
|
|
|
((NC_MPI_INFO*)file->parameters)->info,
|
|
|
|
&file->fh)) != MPI_SUCCESS) {
|
|
|
|
#ifdef MPI_ERR_NO_SUCH_FILE
|
|
|
|
int errorclass;
|
|
|
|
MPI_Error_class(retval, &errorclass);
|
|
|
|
if (errorclass == MPI_ERR_NO_SUCH_FILE)
|
|
|
|
#ifdef NC_ENOENT
|
|
|
|
status = NC_ENOENT;
|
|
|
|
#else
|
|
|
|
status = errno;
|
|
|
|
#endif
|
|
|
|
else
|
|
|
|
#endif
|
|
|
|
status = NC_EPARINIT;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
/* Get its length */
|
|
|
|
if((retval=MPI_File_get_size(file->fh, &size)) != MPI_SUCCESS)
|
|
|
|
{status = NC_EPARINIT; goto done;}
|
|
|
|
file->filelen = (long long)size;
|
|
|
|
} else
|
|
|
|
#endif /* USE_PARALLEL */
|
|
|
|
{
|
|
|
|
if(file->path == NULL || strlen(file->path)==0)
|
|
|
|
{status = NC_EINVAL; goto done;}
|
|
|
|
#ifdef _WIN32
|
|
|
|
file->fp = fopen(file->path, "rb");
|
|
|
|
#else
|
|
|
|
file->fp = fopen(file->path, "r");
|
|
|
|
#endif
|
|
|
|
if(file->fp == NULL)
|
|
|
|
{status = errno; goto done;}
|
|
|
|
/* Get its length */
|
|
|
|
{
|
|
|
|
int fd = fileno(file->fp);
|
|
|
|
#ifdef _WIN32
|
|
|
|
__int64 len64 = _filelengthi64(fd);
|
|
|
|
if(len64 < 0)
|
|
|
|
{status = errno; goto done;}
|
|
|
|
file->filelen = (long long)len64;
|
|
|
|
#else
|
|
|
|
off_t size;
|
|
|
|
size = lseek(fd, 0, SEEK_END);
|
|
|
|
if(size == -1)
|
|
|
|
{status = errno; goto done;}
|
|
|
|
file->filelen = (long long)size;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
rewind(file->fp);
|
|
|
|
}
|
|
|
|
} break;
|
|
|
|
|
|
|
|
#ifdef ENABLE_HTTP
|
|
|
|
case NC_IOSP_HTTP: {
|
|
|
|
/* Construct a URL minus any fragment */
|
|
|
|
file->curlurl = ncuribuild(file->uri,NULL,NULL,NCURISVC);
|
|
|
|
/* Open the curl handle */
|
|
|
|
if((status=nc_http_open(file->curlurl,&file->curl,&file->filelen))) goto done;
|
|
|
|
} break;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
default: assert(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
return check(status);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
readmagic(struct MagicFile* file, long pos, char* magic)
|
|
|
|
{
|
|
|
|
int status = NC_NOERR;
|
|
|
|
memset(magic,0,MAGIC_NUMBER_LEN);
|
|
|
|
switch (file->model->iosp) {
|
|
|
|
case NC_IOSP_MEMORY: {
|
|
|
|
char* mempos;
|
|
|
|
NC_memio* meminfo = (NC_memio*)file->parameters;
|
|
|
|
if((pos + MAGIC_NUMBER_LEN) > meminfo->size)
|
|
|
|
{status = NC_EINMEMORY; goto done;}
|
|
|
|
mempos = ((char*)meminfo->memory) + pos;
|
|
|
|
memcpy((void*)magic,mempos,MAGIC_NUMBER_LEN);
|
|
|
|
#ifdef DEBUG
|
|
|
|
printmagic("XXX: readmagic",magic,file);
|
|
|
|
#endif
|
|
|
|
} break;
|
|
|
|
|
|
|
|
case NC_IOSP_FILE:
|
|
|
|
#ifdef USE_PARALLEL
|
|
|
|
if (file->use_parallel) {
|
|
|
|
MPI_Status mstatus;
|
|
|
|
int retval;
|
|
|
|
if((retval = MPI_File_read_at_all(file->fh, pos, magic,
|
|
|
|
MAGIC_NUMBER_LEN, MPI_CHAR, &mstatus)) != MPI_SUCCESS)
|
|
|
|
{status = NC_EPARINIT; goto done;}
|
|
|
|
} else
|
|
|
|
#endif /* USE_PARALLEL */
|
|
|
|
{
|
|
|
|
int count;
|
|
|
|
int i = fseek(file->fp,pos,SEEK_SET);
|
|
|
|
if(i < 0)
|
|
|
|
{status = errno; goto done;}
|
|
|
|
for(i=0;i<MAGIC_NUMBER_LEN;) {/* make sure to read proper # of bytes */
|
|
|
|
count=fread(&magic[i],1,(size_t)(MAGIC_NUMBER_LEN-i),file->fp);
|
|
|
|
if(count == 0 || ferror(file->fp))
|
|
|
|
{status = errno; goto done;}
|
|
|
|
i += count;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
#ifdef ENABLE_HTTP
|
|
|
|
case NC_IOSP_HTTP: {
|
|
|
|
NCbytes* buf = ncbytesnew();
|
|
|
|
fileoffset_t start = (size_t)pos;
|
|
|
|
fileoffset_t count = MAGIC_NUMBER_LEN;
|
|
|
|
status = nc_http_read(file->curl,file->curlurl,start,count,buf);
|
|
|
|
if(status == NC_NOERR) {
|
|
|
|
if(ncbyteslength(buf) != count)
|
|
|
|
status = NC_EINVAL;
|
|
|
|
else
|
|
|
|
memcpy(magic,ncbytescontents(buf),count);
|
|
|
|
}
|
|
|
|
ncbytesfree(buf);
|
|
|
|
} break;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
default: assert(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
done:
|
|
|
|
if(file && file->fp) clearerr(file->fp);
|
|
|
|
return check(status);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Close the file opened to check for magic number.
|
|
|
|
*
|
|
|
|
* @param file pointer to the MagicFile struct for this open file.
|
|
|
|
* @returns NC_NOERR for success
|
|
|
|
* @returns NC_EPARINIT if there was a problem closing file with MPI
|
|
|
|
* (parallel builds only).
|
|
|
|
* @author Dennis Heimbigner
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
closemagic(struct MagicFile* file)
|
|
|
|
{
|
|
|
|
int status = NC_NOERR;
|
|
|
|
switch (file->model->iosp) {
|
|
|
|
case NC_IOSP_MEMORY:
|
|
|
|
break; /* noop */
|
|
|
|
|
|
|
|
case NC_IOSP_FILE:
|
|
|
|
#ifdef USE_PARALLEL
|
|
|
|
if (file->use_parallel) {
|
|
|
|
int retval;
|
|
|
|
if((retval = MPI_File_close(&file->fh)) != MPI_SUCCESS)
|
2019-01-29 05:05:19 +08:00
|
|
|
{status = NC_EPARINIT; return status;}
|
Provide byte-range reading of remote datasets
re: issue https://github.com/Unidata/netcdf-c/issues/1251
Assume that you have the URL to a remote dataset
which is a normal netcdf-3 or netcdf-4 file.
This PR allows the netcdf-c to read that dataset's
contents as a netcdf file using HTTP byte ranges
if the remote server supports byte-range access.
Originally, this PR was set up to access Amazon S3 objects,
but it can also access other remote datasets such as those
provided by a Thredds server via the HTTPServer access protocol.
It may also work for other kinds of servers.
Note that this is not intended as a true production
capability because, as is known, this kind of access to
can be quite slow. In addition, the byte-range IO drivers
do not currently do any sort of optimization or caching.
An additional goal here is to gain some experience with
the Amazon S3 REST protocol.
This architecture and its use documented in
the file docs/byterange.dox.
There are currently two test cases:
1. nc_test/tst_s3raw.c - this does a simple open, check format, close cycle
for a remote netcdf-3 file and a remote netcdf-4 file.
2. nc_test/test_s3raw.sh - this uses ncdump to investigate some remote
datasets.
This PR also incorporates significantly changed model inference code
(see the superceded PR https://github.com/Unidata/netcdf-c/pull/1259).
1. It centralizes the code that infers the dispatcher.
2. It adds support for byte-range URLs
Other changes:
1. NC_HDF5_finalize was not being properly called by nc_finalize().
2. Fix minor bug in ncgen3.l
3. fix memory leak in nc4info.c
4. add code to walk the .daprc triples and to replace protocol=
fragment tag with a more general mode= tag.
Final Note:
Th inference code is still way too complicated. We need to move
to the validfile() model used by netcdf Java, where each
dispatcher is asked if it can process the file. This decentralizes
the inference code. This will be done after all the major new
dispatchers (PIO, Zarr, etc) have been implemented.
2019-01-02 09:27:36 +08:00
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
|
|
|
if(file->fp) fclose(file->fp);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
#ifdef ENABLE_HTTP
|
|
|
|
case NC_IOSP_HTTP:
|
|
|
|
status = nc_http_close(file->curl);
|
|
|
|
nullfree(file->curlurl);
|
|
|
|
break;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
default: assert(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Interpret the magic number found in the header of a netCDF file.
|
|
|
|
This function interprets the magic number/string contained in the header of a netCDF file and sets the appropriate NC_FORMATX flags.
|
|
|
|
|
|
|
|
@param[in] magic Pointer to a character array with the magic number block.
|
|
|
|
@param[out] model Pointer to an integer to hold the corresponding netCDF type.
|
|
|
|
@param[out] version Pointer to an integer to hold the corresponding netCDF version.
|
|
|
|
@returns NC_NOERR if a legitimate file type found
|
|
|
|
@returns NC_ENOTNC otherwise
|
|
|
|
|
|
|
|
\internal
|
|
|
|
\ingroup datasets
|
|
|
|
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
NC_interpret_magic_number(char* magic, NCmodel* model)
|
|
|
|
{
|
|
|
|
int status = NC_NOERR;
|
|
|
|
/* Look at the magic number */
|
|
|
|
#ifdef USE_NETCDF4
|
|
|
|
if (strlen(UDF0_magic_number) && !strncmp(UDF0_magic_number, magic,
|
|
|
|
strlen(UDF0_magic_number)))
|
|
|
|
{
|
|
|
|
model->impl = NC_FORMATX_UDF0;
|
|
|
|
model->format = NC_FORMAT_NETCDF4;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
if (strlen(UDF1_magic_number) && !strncmp(UDF1_magic_number, magic,
|
|
|
|
strlen(UDF1_magic_number)))
|
|
|
|
{
|
|
|
|
model->impl = NC_FORMATX_UDF1;
|
|
|
|
model->format = NC_FORMAT_NETCDF4;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
#endif /* USE_NETCDF4 */
|
|
|
|
|
|
|
|
/* Use the complete magic number string for HDF5 */
|
|
|
|
if(memcmp(magic,HDF5_SIGNATURE,sizeof(HDF5_SIGNATURE))==0) {
|
|
|
|
model->impl = NC_FORMATX_NC4;
|
|
|
|
model->format = NC_FORMAT_NETCDF4;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
if(magic[0] == '\016' && magic[1] == '\003'
|
|
|
|
&& magic[2] == '\023' && magic[3] == '\001') {
|
|
|
|
model->impl = NC_FORMATX_NC_HDF4;
|
|
|
|
model->format = NC_FORMAT_NETCDF4;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
if(magic[0] == 'C' && magic[1] == 'D' && magic[2] == 'F') {
|
|
|
|
if(magic[3] == '\001') {
|
|
|
|
model->impl = NC_FORMATX_NC3;
|
|
|
|
model->format = NC_FORMAT_CLASSIC;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
if(magic[3] == '\002') {
|
|
|
|
model->impl = NC_FORMATX_NC3;
|
|
|
|
model->format = NC_FORMAT_64BIT_OFFSET;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
if(magic[3] == '\005') {
|
|
|
|
model->impl = NC_FORMATX_NC3;
|
|
|
|
model->format = NC_FORMAT_64BIT_DATA;
|
|
|
|
goto done;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* No match */
|
|
|
|
status = NC_ENOTNC;
|
|
|
|
goto done;
|
|
|
|
|
|
|
|
done:
|
|
|
|
return check(status);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef DEBUG
/*
 * Debug helper: write the magic number block to stderr on a single line.
 *
 * The line starts with the caller-supplied tag and the file's use_parallel
 * flag, followed by one " 0xHH/'c'" entry per byte for MAGIC_NUMBER_LEN
 * bytes. Newline and carriage return are shown as escapes; every other
 * byte outside the printable range ' '..'~' is shown as '?', so that
 * control characters and high-bit bytes (such as the leading byte of the
 * HDF5 signature) are never written raw to the terminal.
 *
 * tag   - label printed at the start of the line
 * magic - buffer holding at least MAGIC_NUMBER_LEN bytes
 * f     - file state; only use_parallel is read
 */
static void
printmagic(const char* tag, char* magic, struct MagicFile* f)
{
    int i;
    fprintf(stderr,"%s: ispar=%d magic=",tag,f->use_parallel);
    for(i=0;i<MAGIC_NUMBER_LEN;i++) {
        /* Go through unsigned char so a negative char never sign-extends */
        unsigned int c = (unsigned int)(unsigned char)magic[i];
        if(c == '\n')
            fprintf(stderr," 0x%02x/'\\n'",c);
        else if(c == '\r')
            fprintf(stderr," 0x%02x/'\\r'",c);
        else if(c < ' ' || c > '~')
            /* control characters, DEL and bytes >= 0x80 */
            fprintf(stderr," 0x%02x/'?'",c);
        else
            fprintf(stderr," 0x%02x/'%c'",c,(int)c);
    }
    fprintf(stderr,"\n");
    fflush(stderr);
}
#endif
|