hdf5/src/H5FDs3comms.h
Scot Breitenfeld f859cb732b
Fixed Spelling Errors (#1166)
* fixed missed closing of a dataset

* fixed missed closing of a dataset

* fixed typo in error return

* Committing clang-format changes

* minor edits

* code format

* Committing clang-format changes

* code format

* minor edit

* switched from using MPI_count, to actual bytes written for H5FD_mpio_debug rw debugging

* Committing clang-format changes

* changed size_i in printf to reflect the I/O.

* Committing clang-format changes

* Fixed seg fault with xlf on BE with -qintsize=8

* fixed error function string

* spelling corrections via codespell, added new spell check github actions

* Committing clang-format changes

* misc

* misc

* misc

* misc

* misc

* misc

* misc

* misc

* misc

* misc

* misc

* misc

* misc

* misc

* Committing clang-format changes

* misc

* misc

* misc

* misc

* misc

* misc

* Committing clang-format changes

* misc

* work around for https://github.com/codespell-project/codespell/issues/2137

* misc

* added missing file

* misc

* misc.

* misc

* switch to using Codespell with GitHub Actions

* misc.

* misc.

* fixed more sp errors

* Fix new typos found by codespell.

* fixed proceed with precede

* fixed variable in fortran test

* fixed minnum

* updated spelling list

Co-authored-by: github-actions <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Larry Knox <lrknox@hdfgroup.org>
2021-12-07 08:27:29 -06:00

563 lines
18 KiB
C

/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
* Copyright by The HDF Group. *
* All rights reserved. *
* *
* This file is part of HDF5. The full HDF5 copyright notice, including *
* terms governing use, modification, and redistribution, is contained in *
* the COPYING file, which can be found at the root of the source code *
* distribution tree, or in https://www.hdfgroup.org/licenses. *
* If you do not have access to either file, you may request a copy from *
* help@hdfgroup.org. *
* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
/*****************************************************************************
* Read-Only S3 Virtual File Driver (VFD)
*
* This is the header for the S3 Communications module
*
* ***NOT A FILE DRIVER***
*
* Purpose:
*
* - Provide structures and functions related to communicating with
* Amazon S3 (Simple Storage Service).
* - Abstract away the REST API (HTTP,
* networked communications) behind a series of uniform function calls.
* - Handle AWS4 authentication, if appropriate.
* - Fail predictably in event of errors.
* - Eventually, support more S3 operations, such as creating, writing to,
* and removing Objects remotely.
*
* translates:
* `read(some_file, bytes_offset, bytes_length, &dest_buffer);`
* to:
* ```
* GET myfile HTTP/1.1
* Host: somewhere.me
* Range: bytes=4096-5115
* ```
* and places received bytes from HTTP response...
* ```
* HTTP/1.1 206 Partial-Content
* Content-Range: 4096-5115/63239
*
* <bytes>
* ```
* ...in destination buffer.
*
* TODO: put documentation in a consistent place and point to it from here.
*
* Programmer: Jacob Smith
* 2017-11-30
*
*****************************************************************************/
#include "H5private.h" /* Generic Functions */
#ifdef H5_HAVE_ROS3_VFD
/* Necessary S3 headers */
#include <curl/curl.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>
#include <openssl/sha.h>
/*****************
* PUBLIC MACROS *
*****************/
/* hexadecimal string of pre-computed sha256 checksum of the empty string
* hex(sha256sum(""))
*/
#define EMPTY_SHA256 "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
/* string length (plus null terminator)
* example ISO8601-format string: "20170713T145903Z" (YYYYmmdd'T'HHMMSS'_')
*/
#define ISO8601_SIZE 17
/* string length (plus null terminator)
* example RFC7231-format string: "Fri, 30 Jun 2017 20:41:55 GMT"
*/
#define RFC7231_SIZE 30
/*---------------------------------------------------------------------------
*
* Macro: ISO8601NOW()
*
* Purpose:
*
* write "YYYYmmdd'T'HHMMSS'Z'" (less single-quotes) to dest
* e.g., "20170630T204155Z"
*
* wrapper for strftime()
*
* It is left to the programmer to check return value of
* ISO8601NOW (should equal ISO8601_SIZE - 1).
*
*---------------------------------------------------------------------------
*/
#define ISO8601NOW(dest, now_gm) strftime((dest), ISO8601_SIZE, "%Y%m%dT%H%M%SZ", (now_gm))
/*---------------------------------------------------------------------------
*
* Macro: RFC7231NOW()
*
* Purpose:
*
* write "Day, dd Mmm YYYY HH:MM:SS GMT" to dest
* e.g., "Fri, 30 Jun 2017 20:41:55 GMT"
*
* wrapper for strftime()
*
* It is left to the programmer to check return value of
* RFC7231NOW (should equal RFC7231_SIZE - 1).
*
*---------------------------------------------------------------------------
*/
#define RFC7231NOW(dest, now_gm) strftime((dest), RFC7231_SIZE, "%a, %d %b %Y %H:%M:%S GMT", (now_gm))
/* Reasonable maximum length of a credential string.
* Provided for error-checking S3COMMS_FORMAT_CREDENTIAL (below).
* 17 <- "////aws4_request\0"
* 2 < "s3" (service)
* 8 <- "YYYYmmdd" (date)
* 128 <- (access_id)
* 155 :: sum
*/
#define S3COMMS_MAX_CREDENTIAL_SIZE 155
/*---------------------------------------------------------------------------
*
* Macro: H5FD_S3COMMS_FORMAT_CREDENTIAL()
*
* Purpose:
*
* Format "S3 Credential" string from inputs, for AWS4.
*
* Wrapper for HDsnprintf().
*
* _HAS NO ERROR-CHECKING FACILITIES_
* It is left to programmer to ensure that return value confers success.
* e.g.,
* ```
* assert( S3COMMS_MAX_CREDENTIAL_SIZE >=
* S3COMMS_FORMAT_CREDENTIAL(...) );
* ```
*
* "<access-id>/<date>/<aws-region>/<aws-service>/aws4_request"
* assuming that `dest` has adequate space.
*
* ALL inputs must be null-terminated strings.
*
* `access` should be the user's access key ID.
* `date` must be of format "YYYYmmdd".
* `region` should be relevant AWS region, i.e. "us-east-1".
* `service` should be "s3".
*
*---------------------------------------------------------------------------
*/
#define S3COMMS_FORMAT_CREDENTIAL(dest, access, iso8601_date, region, service) \
HDsnprintf((dest), S3COMMS_MAX_CREDENTIAL_SIZE, "%s/%s/%s/%s/aws4_request", (access), (iso8601_date), \
(region), (service))
/*********************
* PUBLIC STRUCTURES *
*********************/
/*----------------------------------------------------------------------------
*
* Structure: hrb_node_t
*
* HTTP Header Field Node
*
*
*
* Maintain a ordered (linked) list of HTTP Header fields.
*
* Provides efficient access and manipulation of a logical sequence of
* HTTP header fields, of particular use when composing an
* "S3 Canonical Request" for authentication.
*
* - The creation of a Canonical Request involves:
* - convert field names to lower case
* - sort by this lower-case name
* - convert ": " name-value separator in HTTP string to ":"
* - get sorted lowercase names without field or separator
*
* As HTTP headers allow headers in any order (excepting the case of multiple
* headers with the same name), the list ordering can be optimized for Canonical
* Request creation, suggesting alphabtical order. For more expedient insertion
* and removal of elements in the list, linked list seems preferable to a
* dynamically-expanding array. The usually-smaller number of entries (5 or
* fewer) makes performance overhead of traversing the list trivial.
*
* The above requirements of creating at Canonical Request suggests a reasonable
* trade-off of speed for space with the option to compute elements as needed
* or to have the various elements prepared and stored in the structure
* (e.g. name, value, lowername, concatenated name:value)
* The structure currently is implemented to pre-compute.
*
* At all times, the "first" node of the list should be the least,
* alphabetically. For all nodes, the `next` node should be either NULL or
* of greater alphabetical value.
*
* Each node contains its own header field information, plus a pointer to the
* next node.
*
* It is not allowed to have multiple nodes with the same _lowercase_ `name`s
* in the same list
* (i.e., name is case-insensitive for access and modification.)
*
* All data (`name`, `value`, `lowername`, and `cat`) are null-terminated
* strings allocated specifically for their node.
*
*
*
* `magic` (unsigned long)
*
* "unique" idenfier number for the structure type
*
* `name` (char *)
*
* Case-meaningful name of the HTTP field.
* Given case is how it is supplied to networking code.
* e.g., "Range"
*
* `lowername` (char *)
*
* Lowercase copy of name.
* e.g., "range"
*
* `value` (char *)
*
* Case-meaningful value of HTTP field.
* e.g., "bytes=0-9"
*
* `cat` (char *)
*
* Concatenated, null-terminated string of HTTP header line,
* as the field would appear in an HTTP request.
* e.g., "Range: bytes=0-9"
*
* `next` (hrb_node_t *)
*
* Pointers to next node in the list, or NULL sentinel as end of list.
* Next node must have a greater `lowername` as determined by strcmp().
*
*----------------------------------------------------------------------------
*/
typedef struct hrb_node_t {
unsigned long magic;
char * name;
char * value;
char * cat;
char * lowername;
struct hrb_node_t *next;
} hrb_node_t;
#define S3COMMS_HRB_NODE_MAGIC 0x7F5757UL
/*----------------------------------------------------------------------------
*
* Structure: hrb_t
*
* HTTP Request Buffer structure
*
*
*
* Logically represent an HTTP request
*
* GET /myplace/myfile.h5 HTTP/1.1
* Host: over.rainbow.oz
* Date: Fri, 01 Dec 2017 12:35:04 CST
*
* <body>
*
* ...with fast, efficient access to and modification of primary and field
* elements.
*
* Structure for building HTTP requests while hiding much of the string
* processing required "under the hood."
*
* Information about the request target -- the first line -- and the body text,
* if any, are managed directly with this structure. All header fields, e.g.,
* "Host" and "Date" above, are created with a linked list of `hrb_node_t` and
* included in the request by a pointer to the head of the list.
*
*
*
* `magic` (unsigned long)
*
* "Magic" number confirming that this is an hrb_t structure and
* what operations are valid for it.
*
* Must be S3COMMS_HRB_MAGIC to be valid.
*
* `body` (char *) :
*
* Pointer to start of HTTP body.
*
* Can be NULL, in which case it is treated as the empty string, "".
*
* `body_len` (size_t) :
*
* Number of bytes (characters) in `body`. 0 if empty or NULL `body`.
*
* `first_header` (hrb_node_t *) :
*
* Pointer to first SORTED header node, if any.
* It is left to the programmer to ensure that this node and associated
* list is destroyed when done.
*
* `resource` (char *) :
*
* Pointer to resource URL string, e.g., "/folder/page.xhtml".
*
* `verb` (char *) :
*
* Pointer to HTTP verb string, e.g., "GET".
*
* `version` (char *) :
*
* Pointer to HTTP version string, e.g., "HTTP/1.1".
*
*----------------------------------------------------------------------------
*/
typedef struct {
unsigned long magic;
char * body;
size_t body_len;
hrb_node_t * first_header;
char * resource;
char * verb;
char * version;
} hrb_t;
#define S3COMMS_HRB_MAGIC 0x6DCC84UL
/*----------------------------------------------------------------------------
*
* Structure: parsed_url_t
*
*
* Represent a URL with easily-accessed pointers to logical elements within.
* These elements (components) are stored as null-terminated strings (or just
* NULLs). These components should be allocated for the structure, making the
* data as safe as possible from modification. If a component is NULL, it is
* either implicit in or absent from the URL.
*
* "http://mybucket.s3.amazonaws.com:8080/somefile.h5?param=value&arg=value"
* ^--^ ^-----------------------^ ^--^ ^---------^ ^-------------------^
* Scheme Host Port Resource Query/-ies
*
*
*
* `magic` (unsigned long)
*
* Structure identification and validation identifier.
* Identifies as `parsed_url_t` type.
*
* `scheme` (char *)
*
* String representing which protocol is to be expected.
* _Must_ be present.
* "http", "https", "ftp", e.g.
*
* `host` (char *)
*
* String of host, either domain name, IPv4, or IPv6 format.
* _Must_ be present.
* "over.rainbow.oz", "192.168.0.1", "[0000:0000:0000:0001]"
*
* `port` (char *)
*
* String representation of specified port. Must resolve to a valid unsigned
* integer.
* "9000", "80"
*
* `path` (char *)
*
* Path to resource on host. If not specified, assumes root "/".
* "lollipop_guild.wav", "characters/witches/white.dat"
*
* `query` (char *)
*
* Single string of all query parameters in url (if any).
* "arg1=value1&arg2=value2"
*
*----------------------------------------------------------------------------
*/
typedef struct {
unsigned long magic;
char * scheme; /* required */
char * host; /* required */
char * port;
char * path;
char * query;
} parsed_url_t;
#define S3COMMS_PARSED_URL_MAGIC 0x21D0DFUL
/*----------------------------------------------------------------------------
*
* Structure: s3r_t
*
*
*
* S3 request structure "handle".
*
* Holds persistent information for Amazon S3 requests.
*
* Instantiated through `H5FD_s3comms_s3r_open()`, copies data into self.
*
* Intended to be re-used for operations on a remote object.
*
* Cleaned up through `H5FD_s3comms_s3r_close()`.
*
* _DO NOT_ share handle between threads: curl easy handle `curlhandle` has
* undefined behavior if called to perform in multiple threads.
*
*
*
* `magic` (unsigned long)
*
* "magic" number identifying this structure as unique type.
* MUST equal `S3R_MAGIC` to be valid.
*
* `curlhandle` (CURL)
*
* Pointer to the curl_easy handle generated for the request.
*
* `httpverb` (char *)
*
* Pointer to NULL-terminated string. HTTP verb,
* e.g. "GET", "HEAD", "PUT", etc.
*
* Default is NULL, resulting in a "GET" request.
*
* `purl` (parsed_url_t *)
*
* Pointer to structure holding the elements of URL for file open.
*
* e.g., "http://bucket.aws.com:8080/myfile.dat?q1=v1&q2=v2"
* parsed into...
* { scheme: "http"
* host: "bucket.aws.com"
* port: "8080"
* path: "myfile.dat"
* query: "q1=v1&q2=v2"
* }
*
* Cannot be NULL.
*
* `region` (char *)
*
* Pointer to NULL-terminated string, specifying S3 "region",
* e.g., "us-east-1".
*
* Required to authenticate.
*
* `secret_id` (char *)
*
* Pointer to NULL-terminated string for "secret" access id to S3 resource.
*
* Required to authenticate.
*
* `signing_key` (unsigned char *)
*
* Pointer to `SHA256_DIGEST_LENGTH`-long string for "re-usable" signing
* key, generated via
* `HMAC-SHA256(HMAC-SHA256(HMAC-SHA256(HMAC-SHA256("AWS4<secret_key>",
* "<yyyyMMDD"), "<aws-region>"), "<aws-service>"), "aws4_request")`
* which may be re-used for several (up to seven (7)) days from creation?
* Computed once upon file open.
*
* Required to authenticate.
*
*----------------------------------------------------------------------------
*/
typedef struct {
unsigned long magic;
CURL * curlhandle;
size_t filesize;
char * httpverb;
parsed_url_t * purl;
char * region;
char * secret_id;
unsigned char *signing_key;
} s3r_t;
#define S3COMMS_S3R_MAGIC 0x44d8d79
#ifdef __cplusplus
extern "C" {
#endif
/*******************************************
* DECLARATION OF HTTP FIELD LIST ROUTINES *
*******************************************/
H5_DLL herr_t H5FD_s3comms_hrb_node_set(hrb_node_t **L, const char *name, const char *value);
/***********************************************
* DECLARATION OF HTTP REQUEST BUFFER ROUTINES *
***********************************************/
H5_DLL herr_t H5FD_s3comms_hrb_destroy(hrb_t **buf);
H5_DLL hrb_t *H5FD_s3comms_hrb_init_request(const char *verb, const char *resource, const char *host);
/*************************************
* DECLARATION OF S3REQUEST ROUTINES *
*************************************/
H5_DLL herr_t H5FD_s3comms_s3r_close(s3r_t *handle);
H5_DLL size_t H5FD_s3comms_s3r_get_filesize(s3r_t *handle);
H5_DLL s3r_t *H5FD_s3comms_s3r_open(const char url[], const char region[], const char id[],
const unsigned char signing_key[]);
H5_DLL herr_t H5FD_s3comms_s3r_read(s3r_t *handle, haddr_t offset, size_t len, void *dest);
/*********************************
* DECLARATION OF OTHER ROUTINES *
*********************************/
H5_DLL struct tm *gmnow(void);
H5_DLL herr_t H5FD_s3comms_aws_canonical_request(char *canonical_request_dest, int cr_size,
char *signed_headers_dest, int sh_size, hrb_t *http_request);
H5_DLL herr_t H5FD_s3comms_bytes_to_hex(char *dest, const unsigned char *msg, size_t msg_len,
hbool_t lowercase);
H5_DLL herr_t H5FD_s3comms_free_purl(parsed_url_t *purl);
H5_DLL herr_t H5FD_s3comms_HMAC_SHA256(const unsigned char *key, size_t key_len, const char *msg,
size_t msg_len, char *dest);
H5_DLL herr_t H5FD_s3comms_load_aws_profile(const char *name, char *key_id_out, char *secret_access_key_out,
char *aws_region_out);
H5_DLL herr_t H5FD_s3comms_nlowercase(char *dest, const char *s, size_t len);
H5_DLL herr_t H5FD_s3comms_parse_url(const char *str, parsed_url_t **purl);
H5_DLL herr_t H5FD_s3comms_percent_encode_char(char *repr, const unsigned char c, size_t *repr_len);
H5_DLL herr_t H5FD_s3comms_signing_key(unsigned char *md, const char *secret, const char *region,
const char *iso8601now);
H5_DLL herr_t H5FD_s3comms_tostringtosign(char *dest, const char *req_str, const char *now,
const char *region);
H5_DLL herr_t H5FD_s3comms_trim(char *dest, char *s, size_t s_len, size_t *n_written);
H5_DLL herr_t H5FD_s3comms_uriencode(char *dest, const char *s, size_t s_len, hbool_t encode_slash,
size_t *n_written);
#ifdef __cplusplus
}
#endif
#endif /* H5_HAVE_ROS3_VFD */