netcdf-c/libdispatch/ds3util.c
Dennis Heimbigner f6e25b695e Fix additional S3 support issues
re: https://github.com/Unidata/netcdf-c/issues/2117
re: https://github.com/Unidata/netcdf-c/issues/2119

* Modify libsrc to allow byte-range reading of netcdf-3 files in private S3 buckets; this required using the AWS SDK. Also add a test case.
* The AWS SDK can sometimes cause problems if the Aws::ShutdownAPI function is not called, so add optional atexit() support to ensure it is called (see the sketch after this list). This is disabled for Windows.
* Add documentation to nczarr.md on how to build and use the AWS SDK under Windows. Currently it builds, but testing fails.
* Switch testing from stratus to the Unidata bucket on S3.
* Improve support for the s3: URL protocol.
* Add an S3-specific utility code file: ds3util.c.
* Modify NC_infermodel to attempt to read the magic number of byte-ranged files in S3.
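
The atexit() wiring described above is sketched below. This is a minimal illustration, not the library's actual code; the function names are assumptions, and the real cleanup lives in the C++ SDK wrapper that calls Aws::ShutdownAPI.

```c
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for the wrapper that calls Aws::ShutdownAPI; the real netcdf-c
   entry point lives in the C++ SDK glue and is named differently. */
static void
s3sdk_finalize(void)
{
    /* In the real code this would invoke Aws::ShutdownAPI(options). */
    fprintf(stderr, "S3 SDK shut down\n");
}

/* Register the shutdown hook; per the notes above this is optional and
   disabled for Windows. */
static void
s3sdk_register_atexit(void)
{
#ifndef _WIN32
    atexit(s3sdk_finalize);
#endif
}

int
main(void)
{
    s3sdk_register_atexit();
    /* ... normal work using the SDK ...; s3sdk_finalize runs at exit */
    return 0;
}
```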

## Misc.

* Move and rename the core S3 SDK wrapper code (libnczarr/zs3sdk.cpp) to libdispatch since it is now used in libsrc as well as libnczarr.
* Add calls to nc_finalize in the utilities in case atexit is disabled (a minimal sketch follows this list).
* Add a header-only JSON parser to the distribution rather than as a built source.
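
The nc_finalize teardown mentioned above follows the usual utility pattern; a minimal sketch, assuming only the public netcdf.h API:

```c
#include <stdio.h>
#include <netcdf.h>

int
main(void)
{
    /* ... normal utility work via the netCDF API ... */

    /* Explicit teardown for the case where atexit()-based cleanup is
       disabled; nc_finalize releases global library state. */
    nc_finalize();
    return 0;
}
```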
2021-10-29 20:06:37 -06:00


/*********************************************************************
* Copyright 2018, UCAR/Unidata
* See netcdf/COPYRIGHT file for copying and redistribution conditions.
*********************************************************************/
#include "config.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef _MSC_VER
#include <io.h>
#endif
#include "netcdf.h"
#include "ncuri.h"
#include "ncrc.h"
#undef AWSDEBUG
#define AWSHOST ".amazonaws.com"
enum URLFORMAT {UF_NONE=0, UF_VIRTUAL=1, UF_PATH=2, UF_S3=3, UF_OTHER=4};
/* Forward */
static int endswith(const char* s, const char* suffix);
/**************************************************/
/* Generic S3 Utilities */
/*
Rebuild an S3 url into a canonical path-style url.
If the region cannot be extracted from the host, then use the
default region (us-east-1 if no default is configured).
@param url (in) the current url
@param newurlp (out) the rebuilt path-style url
@param bucketp (out) the bucket extracted from the url
@param outregionp (out) the region extracted from the url, or the default
*/
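
/*
Illustrative example (not taken from the library's tests): a virtual-host
style url such as
    https://mybucket.s3.us-east-1.amazonaws.com/data/file.nc
is rebuilt to the path-style form
    https://s3.us-east-1.amazonaws.com/mybucket/data/file.nc
with *bucketp set to "mybucket" and *outregionp set to "us-east-1";
an "s3://mybucket/data/file.nc" url yields the same result using the
default region.
*/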
int
NC_s3urlrebuild(NCURI* url, NCURI** newurlp, char** bucketp, char** outregionp)
{
    int i,stat = NC_NOERR;
    NClist* hostsegments = NULL;
    NClist* pathsegments = NULL;
    NCbytes* buf = ncbytesnew();
    NCURI* newurl = NULL;
    char* bucket = NULL;
    char* host = NULL;
    char* path = NULL;
    char* region = NULL;

    if(url == NULL)
        {stat = NC_EURL; goto done;}

    /* Parse the hostname */
    hostsegments = nclistnew();
    /* split the hostname by "." */
    if((stat = NC_split_delim(url->host,'.',hostsegments))) goto done;

    /* Parse the path */
    pathsegments = nclistnew();
    /* split the path by "/" */
    if((stat = NC_split_delim(url->path,'/',pathsegments))) goto done;

    /* Distinguish path-style from virtual-host style from s3: and from other.
       Virtual: https://bucket-name.s3.Region.amazonaws.com/<path>
       Path: https://s3.Region.amazonaws.com/bucket-name/<path>
       S3: s3://bucket-name/<path>
       Other: https://<host>/bucketname/<path>
    */
    if(url->host == NULL || strlen(url->host) == 0)
        {stat = NC_EURL; goto done;}
    if(strcmp(url->protocol,"s3")==0 && nclistlength(hostsegments)==1) {
        bucket = strdup(url->host);
        region = NULL; /* unknown at this point */
    } else if(endswith(url->host,AWSHOST)) { /* Virtual or path */
        switch (nclistlength(hostsegments)) {
        default: stat = NC_EURL; goto done;
        case 4:
            if(strcasecmp(nclistget(hostsegments,0),"s3")!=0)
                {stat = NC_EURL; goto done;}
            region = strdup(nclistget(hostsegments,1));
            if(nclistlength(pathsegments) > 0)
                bucket = nclistremove(pathsegments,0);
            break;
        case 5:
            if(strcasecmp(nclistget(hostsegments,1),"s3")!=0)
                {stat = NC_EURL; goto done;}
            region = strdup(nclistget(hostsegments,2));
            bucket = strdup(nclistget(hostsegments,0));
            break;
        }
    } else {
        if((host = strdup(url->host))==NULL)
            {stat = NC_ENOMEM; goto done;}
        /* region is unknown */
        region = NULL;
        /* bucket is assumed to be start of the path */
        if(nclistlength(pathsegments) > 0)
            bucket = nclistremove(pathsegments,0);
    }
    /* If region is null, use default */
    if(region == NULL) {
        const char* region0 = NULL;
        /* Get default region */
        if((stat = NC_getdefaults3region(url,&region0))) goto done;
        region = strdup(region0);
    }
    /* Construct the revised host */
    ncbytescat(buf,"s3.");
    ncbytescat(buf,region);
    ncbytescat(buf,AWSHOST);
    nullfree(host); /* drop any host strdup'd above before overwriting it */
    host = ncbytesextract(buf);

    /* Construct the revised path */
    ncbytesclear(buf);
    ncbytescat(buf,"/");
    if(bucket == NULL)
        {stat = NC_EURL; goto done;}
    ncbytescat(buf,bucket);
    for(i=0;i<nclistlength(pathsegments);i++) {
        ncbytescat(buf,"/");
        ncbytescat(buf,nclistget(pathsegments,i));
    }
    path = ncbytesextract(buf);

    /* complete the new url */
    if((newurl=ncuriclone(url))==NULL) {stat = NC_ENOMEM; goto done;}
    ncurisetprotocol(newurl,"https");
    ncurisethost(newurl,host);
    ncurisetpath(newurl,path);

    /* return various items */
#ifdef AWSDEBUG
    {
        char* s = ncuribuild(newurl,NULL,NULL,NCURIALL);
        fprintf(stderr,">>> NC_s3urlrebuild: final=%s bucket=%s region=%s\n",s,bucket,region);
        nullfree(s);
    }
#endif
    if(newurlp) {*newurlp = newurl; newurl = NULL;}
    if(bucketp) {*bucketp = bucket; bucket = NULL;}
    if(outregionp) {*outregionp = region; region = NULL;}

done:
    nullfree(region);
    nullfree(bucket);
    nullfree(host);
    nullfree(path);
    ncurifree(newurl);
    ncbytesfree(buf);
    nclistfreeall(hostsegments);
    nclistfreeall(pathsegments);
    return stat;
}

static int
endswith(const char* s, const char* suffix)
{
    ssize_t ls, lsf, delta;
    if(s == NULL || suffix == NULL) return 0;
    ls = strlen(s);
    lsf = strlen(suffix);
    delta = (ls - lsf);
    if(delta < 0) return 0;
    if(memcmp(s+delta,suffix,lsf)!=0) return 0;
    return 1;
}

/**************************************************/
/* S3 utilities */
EXTERNL int
NC_s3urlprocess(NCURI* url, NCS3INFO* s3)
{
    int stat = NC_NOERR;
    NCURI* url2 = NULL;
    NClist* pathsegments = NULL;
    const char* profile0 = NULL;

    if(url == NULL || s3 == NULL)
        {stat = NC_EURL; goto done;}

    /* Get current profile */
    if((stat = NC_getactives3profile(url,&profile0))) goto done;
    if(profile0 == NULL) profile0 = "none";
    s3->profile = strdup(profile0);

    /* Rebuild the URL to path format and get a usable region */
    if((stat = NC_s3urlrebuild(url,&url2,&s3->bucket,&s3->region))) goto done;
    s3->host = strdup(url2->host);

    /* construct the rootkey minus the leading bucket */
    pathsegments = nclistnew();
    if((stat = NC_split_delim(url2->path,'/',pathsegments))) goto done;
    if(nclistlength(pathsegments) > 0) {
        char* seg = nclistremove(pathsegments,0);
        nullfree(seg);
    }
    if((stat = NC_join(pathsegments,&s3->rootkey))) goto done;

done:
    ncurifree(url2);
    nclistfreeall(pathsegments);
    return stat;
}
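
/*
Illustrative result (assumed values, not actual test output): for the url
    https://mybucket.s3.us-east-1.amazonaws.com/dir1/dir2/file.nc
NC_s3urlprocess fills the NCS3INFO roughly as follows:
    host    = "s3.us-east-1.amazonaws.com"
    region  = "us-east-1"
    bucket  = "mybucket"
    rootkey = the key below the bucket ("dir1/dir2/file.nc", possibly
              with a leading '/' depending on NC_join)
    profile = the active profile, or "none"
*/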

int
NC_s3clear(NCS3INFO* s3)
{
    if(s3) {
        nullfree(s3->host); s3->host = NULL;
        nullfree(s3->region); s3->region = NULL;
        nullfree(s3->bucket); s3->bucket = NULL;
        nullfree(s3->rootkey); s3->rootkey = NULL;
        nullfree(s3->profile); s3->profile = NULL;
    }
    return NC_NOERR;
}

/*
Check if a url has indicators that signal an S3 url.
*/
int
NC_iss3(NCURI* uri)
{
    int iss3 = 0;

    if(uri == NULL) goto done; /* not a uri */
    /* is the protocol "s3"? */
    if(strcasecmp(uri->protocol,"s3")==0) {iss3 = 1; goto done;}
    /* Is "s3" in the mode list? */
    if(NC_testmode(uri,"s3")) {iss3 = 1; goto done;}
    /* Last chance; see if host looks s3'y */
    if(endswith(uri->host,AWSHOST)) {iss3 = 1; goto done;}
done:
    return iss3;
}
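
/*
Minimal caller-side usage sketch (not part of this file); assumes the
NCS3INFO has been zero-initialized and that ncuriparse/ncurifree from
ncuri.h are available:

    NCURI* uri = NULL;
    NCS3INFO info;
    memset(&info,0,sizeof(info));
    ncuriparse("s3://mybucket/data/file.nc",&uri);
    if(uri != NULL && NC_iss3(uri)) {
        if(NC_s3urlprocess(uri,&info) == NC_NOERR) {
            ... use info.bucket, info.region, info.rootkey ...
        }
    }
    NC_s3clear(&info);
    ncurifree(uri);
*/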