netcdf-c/libnczarr/zmap_zip.c

793 lines
22 KiB
C
Raw Normal View History

/*
* Copyright 2018, University Corporation for Atmospheric Research
* See netcdf/COPYRIGHT file for copying and redistribution conditions.
*/
#undef DEBUG
/* Not sure this has any effect */
#define _LARGEFILE_SOURCE 1
#define _LARGEFILE64_SOURCE 1
#include "zincludes.h"
#include <errno.h>
#include <zip.h>
#include "fbits.h"
#include "ncpathmgr.h"
#undef CACHESEARCH
#define VERIFY
/*Mnemonic*/
#define FLAG_ISDIR 1
#define FLAG_CREATE 1
#define SKIPLAST 1
#define WHOLEPATH 0
#define NCZM_ZIP_V1 1
Mitigate S3 test interference + Unlimited Dimensions in NCZarr This PR started as an attempt to add unlimited dimensions to NCZarr. It did that, but this exposed significant problems with test interference. So this PR is mostly about fixing -- well mitigating anyway -- test interference. The problem of test interference is now documented in the document docs/internal.md. The solutions implemented here are also described in that document. The solution is somewhat fragile but multiple cleanup mechanisms are provided. Note that this feature requires that the AWS command line utility must be installed. ## Unlimited Dimensions. The existing NCZarr extensions to Zarr are modified to support unlimited dimensions. NCZarr extends the Zarr meta-data for the ".zgroup" object to include netcdf-4 model extensions. This information is stored in ".zgroup" as a dictionary named "_nczarr_group". Inside "_nczarr_group", there is a key named "dims" that stores information about netcdf-4 named dimensions. The value of "dims" is a dictionary whose keys are the named dimensions. The value associated with each dimension name has one of two forms. Form 1 is a special case of form 2, and is kept for backward compatibility. Whenever a new file is written, it uses format 1 if possible, otherwise format 2. * Form 1: An integer representing the size of the dimension, which is used for simple named dimensions. * Form 2: A dictionary with the following keys and values: - "size" with an integer value representing the (current) size of the dimension. - "unlimited" with a value of either "1" or "0" to indicate if this dimension is an unlimited dimension. For Unlimited dimensions, the size is initially zero, and as variables extend the length of that dimension, the size value for the dimension increases. That dimension size is shared by all arrays referencing that dimension, so if one array extends an unlimited dimension, it is implicitly extended for all other arrays that reference that dimension. 
This is the standard semantics for unlimited dimensions. Adding unlimited dimensions required a number of other changes to the NCZarr code-base. These included the following. * Did a partial refactor of the slice handling code in zwalk.c to clean it up. * Added a number of tests for unlimited dimensions derived from the same test in nc_test4. * Added several NCZarr specific unlimited tests; more are needed. * Add test of endianness. ## Misc. Other Changes * Modify libdispatch/ncs3sdk_aws.cpp to optionally support use of the AWS Transfer Utility mechanism. This is controlled by the ```#define TRANSFER``` command in that file. It defaults to being disabled. * Parameterize both the standard Unidata S3 bucket (S3TESTBUCKET) and the netcdf-c test data prefix (S3TESTSUBTREE). * Fixed an obscure memory leak in ncdump. * Removed some obsolete unit testing code and test cases. * Uncovered a bug in the netcdf-c handling of big-endian floats and doubles. Have not fixed yet. See tst_h5_endians.c. * Renamed some nczarr_tests testcases to avoid name conflicts with nc_test4. * Modify the semantics of zmap\#ncsmap_write to only allow total rewrite of objects. * Modify the semantics of zodom to properly handle stride > 1. * Add a truncate operation to the libnczarr zmap code.
2023-09-27 06:56:48 +08:00
#define ZIP_PROPERTIES (NCZM_WRITEONCE)
/*
Do a simple mapping of our simplified map model
to a zip-file
Every dataset is assumed to be rooted at some directory in the
zip file tree. So, its location is defined by some path to a
zip file representing the dataset.
For the object API, the mapping is as follows:
1. Every content-bearing object (e.g. .zgroup or .zarray) is mapped to a zip entry.
This means that if a key points to a content-bearing object then
no other key can have that content-bearing key as a prefix.
2. The meta data containing files are assumed to contain
UTF-8 character data.
3. The chunk containing files are assumed to contain raw unsigned 8-bit byte data.
4. The objects may or may not be compressed; this implementation writes uncompressed objects.
*/
/* define the var name containing an objects content */
#define ZCONTENT "data"
/* Define the "subclass" of NCZMAP */
typedef struct ZZMAP {
NCZMAP map; /* generic map state; must stay first so NCZMAP* and ZZMAP* can be cast to each other */
char* root; /* local, absolute path of the zip file; this is what is passed to zip_open */
char* dataset; /* prefix for all keys in zip file */
zip_t* archive; /* libzip handle for the open archive; NULL when not open */
char** searchcache; /* released via freesearchcache() on close; NOTE(review): presumably a cached list of entry names -- confirm */
} ZZMAP;
/* Index of an entry within the zip archive (negative values denote "no entry") */
typedef zip_int64_t ZINDEX;
/* Forward */
/* API table for this map implementation; stored into map.api by zipcreate/zipopen */
static NCZMAP_API zapi;
/* Close the map and free its state; if delete is non-zero, discard changes and remove the zip file */
static int zipclose(NCZMAP* map, int delete);
static int zzcreategroup(ZZMAP*, const char* key, int nskip);
/* Look up key; callers distinguish NC_NOERR, NC_ENOOBJECT and NC_EEMPTY results */
static int zzlookupobj(ZZMAP*, const char* key, ZINDEX* fd);
/* Store the length of the entry at zindex into *lenp */
static int zzlen(ZZMAP* zzmap, ZINDEX zindex, size64_t* lenp);
/* Produce a status code after an operation on zzmap->archive has failed */
static int zipmaperr(ZZMAP* zzmap);
static int ziperr(zip_error_t* zerror);
/* Produce a status code from a libzip error number (e.g. as reported by zip_open) */
static int ziperrno(int zerror);
static void freesearchcache(char** cache);
/* Set to 1 once zipinitialize() has run */
static int zzinitialized = 0;
/* One-time module initialization; later calls are no-ops. */
static void
zipinitialize(void)
{
    if(zzinitialized)
        return; /* already done */
    ZTRACE(7,NULL);
    zzinitialized = 1;
    (void)ZUNTRACE(NC_NOERR);
}
/* Define the Dataset level API */
/*
Create a new zip-file backed map.
@param path URL (must use the file: protocol) whose path names the zip file;
       the path might be relative and is made absolute here.
@param mode the netcdf-c mode flags; NC_NOCLOBBER makes creation fail if the
       file already exists, otherwise an existing file is truncated.
@param flags extra flags
@param parameters extra parameters (unused)
@param mapp return the map object in this
@return NC_NOERR on success, otherwise an NC_Exxx error code
*/
static int
zipcreate(const char *path, int mode, size64_t flags, void* parameters, NCZMAP** mapp)
{
    int stat = NC_NOERR;
    ZZMAP* zzmap = NULL;
    NCURI* url = NULL;
    zip_flags_t zipflags = 0;
    int zerrno = ZIP_ER_OK;
    char* abspath = NULL;

    NC_UNUSED(parameters);
    ZTRACE(6,"path=%s mode=%d flag=%llu",path,mode,flags);

    if(!zzinitialized) zipinitialize();

    /* Fixup mode flags */
    mode = (NC_NETCDF4 | NC_WRITE | mode);

    /* path must be a url with file: protocol*/
    ncuriparse(path,&url);
    if(url == NULL)
        {stat = NC_EURL; goto done;}
    if(strcasecmp(url->protocol,"file") != 0)
        {stat = NC_EURL; goto done;}

    /* Build the zz state */
    if((zzmap = calloc(1,sizeof(ZZMAP))) == NULL)
        {stat = NC_ENOMEM; goto done;}

    zzmap->map.format = NCZM_ZIP;
    if((zzmap->map.url = ncuribuild(url,NULL,NULL,NCURIALL)) == NULL)
        {stat = NC_ENOMEM; goto done;}
    zzmap->map.flags = flags;
    /* create => NC_WRITE */
    zzmap->map.mode = mode;
    zzmap->map.api = &zapi;

    /* Since root is in canonical form, we need to convert to local form */
    if((zzmap->root = NCpathcvt(url->path))==NULL)
        {stat = NC_ENOMEM; goto done;}

    /* Make the root path be absolute */
    if((abspath = NCpathabsolute(zzmap->root)) == NULL)
        {stat = NC_EURL; goto done;}
    nullfree(zzmap->root);
    zzmap->root = abspath;
    abspath = NULL;

    /* Extract the dataset name */
    if((stat = nczm_basename(url->path,&zzmap->dataset))) goto done;

    /* Set zip openflags */
    zipflags |= ZIP_CREATE;
    if(fIsSet(mode,NC_NOCLOBBER))
        zipflags |= ZIP_EXCL;
    else
        zipflags |= ZIP_TRUNCATE;
#ifdef VERIFY
    zipflags |= ZIP_CHECKCONS;
#endif
    if((zzmap->archive = zip_open(zzmap->root,zipflags,&zerrno))==NULL)
        {stat = ziperrno(zerrno); goto done;}

    /* Tell it about the dataset as a dir */
    if(zip_dir_add(zzmap->archive, zzmap->dataset, ZIP_FL_ENC_UTF_8) < 0)
        {stat = zipmaperr(zzmap); goto done;}

    /* Dataset superblock will be written by higher layer */

    if(mapp) {*mapp = (NCZMAP*)zzmap; zzmap = NULL;}

done:
    nullfree(abspath);
    ncurifree(url);
    if(zzmap) {
        /* Failure (or mapp == NULL): tear the map down again. Only ask for the
           file to be removed when the archive was actually opened by us;
           otherwise a failed NC_NOCLOBBER create (file already exists) would
           delete the pre-existing file, and an early failure would try to
           remove a NULL path. */
        (void)zipclose((NCZMAP*)zzmap,(zzmap->archive != NULL));
    }
    return ZUNTRACE(stat);
}
/*
Open an existing zip-file backed map.
@param path URL (must use the file: protocol) whose path names the zip file;
       the path might be relative and is made absolute here.
@param mode the netcdf-c mode flags; without NC_WRITE the archive is opened read-only
@param flags extra flags
@param parameters extra parameters (unused)
@param mapp return the map object in this
@return NC_NOERR on success, otherwise an NC_Exxx error code
*/
static int
zipopen(const char *path, int mode, size64_t flags, void* parameters, NCZMAP** mapp)
{
    int stat = NC_NOERR;
    ZZMAP* zzmap = NULL;
    NCURI* url = NULL;
    zip_flags_t zipflags = 0;
    int zerrno = ZIP_ER_OK;
    char* abspath = NULL;

    NC_UNUSED(parameters);
    ZTRACE(6,"path=%s mode=%d flags=%llu",path,mode,flags);

    if(!zzinitialized) zipinitialize();

    /* Fixup mode flags */
    mode = (NC_NETCDF4 | mode);

    /* path must be a url with file: protocol*/
    ncuriparse(path,&url);
    if(url == NULL)
        {stat = NC_EURL; goto done;}
    if(strcasecmp(url->protocol,"file") != 0)
        {stat = NC_EURL; goto done;}

    /* Build the zz state */
    if((zzmap = calloc(1,sizeof(ZZMAP))) == NULL)
        {stat = NC_ENOMEM; goto done;}

    zzmap->map.format = NCZM_ZIP;
    zzmap->map.url = ncuribuild(url,NULL,NULL,NCURIALL);
    zzmap->map.flags = flags;
    zzmap->map.mode = mode;
    zzmap->map.api = (NCZMAP_API*)&zapi;

    /* Since root is in canonical form, we need to convert to local form */
    if((zzmap->root = NCpathcvt(url->path))==NULL)
        {stat = NC_ENOMEM; goto done;}

    /* Make the root path be absolute */
    if((abspath = NCpathabsolute(zzmap->root)) == NULL)
        {stat = NC_EURL; goto done;}
    nullfree(zzmap->root);
    zzmap->root = abspath;
    abspath = NULL;

    /* Set zip open flags */
    if(!fIsSet(mode,NC_WRITE))
        zipflags |= ZIP_RDONLY;
#ifdef VERIFY
    /* Consistency checking is requested only under VERIFY, as in zipcreate */
    zipflags |= ZIP_CHECKCONS;
#endif

    /* Open the file */
    if((zzmap->archive = zip_open(zzmap->root,zipflags,&zerrno))==NULL)
        {stat = ziperrno(zerrno); goto done;}

    /* Use entry 0 to obtain the dataset name */
    {
        const char* name;
        zip_int64_t num_entries;

        num_entries = zip_get_num_entries(zzmap->archive, (zip_flags_t)0);
        if(num_entries <= 0) {stat = NC_EEMPTY; goto done;}
        /* get 0'th entry name */
        if((name = zip_get_name(zzmap->archive, 0, (zip_flags_t)0))==NULL)
            {stat = zipmaperr(zzmap); goto done;}
        if(name[0] == '\0' || name[0] == '/')
            {stat = NC_EBADID; goto done;}
        /* Extract the first segment as the dataset name; the status must be
           captured, otherwise a failure here is reported as success while
           *mapp is left unset */
        if((stat = nczm_segment1(name,&zzmap->dataset))) goto done;
    }

    /* Dataset superblock will be read by higher layer */

    if(mapp) {*mapp = (NCZMAP*)zzmap; zzmap = NULL;}

done:
    nullfree(abspath);
    ncurifree(url);
    /* On failure (or mapp == NULL) release the map without deleting the file */
    if(zzmap) zipclose((NCZMAP*)zzmap,0);
    return ZUNTRACE(stat);
}
Mitigate S3 test interference + Unlimited Dimensions in NCZarr This PR started as an attempt to add unlimited dimensions to NCZarr. It did that, but this exposed significant problems with test interference. So this PR is mostly about fixing -- well mitigating anyway -- test interference. The problem of test interference is now documented in the document docs/internal.md. The solutions implemented here are also described in that document. The solution is somewhat fragile but multiple cleanup mechanisms are provided. Note that this feature requires that the AWS command line utility must be installed. ## Unlimited Dimensions. The existing NCZarr extensions to Zarr are modified to support unlimited dimensions. NCZarr extends the Zarr meta-data for the ".zgroup" object to include netcdf-4 model extensions. This information is stored in ".zgroup" as a dictionary named "_nczarr_group". Inside "_nczarr_group", there is a key named "dims" that stores information about netcdf-4 named dimensions. The value of "dims" is a dictionary whose keys are the named dimensions. The value associated with each dimension name has one of two forms. Form 1 is a special case of form 2, and is kept for backward compatibility. Whenever a new file is written, it uses format 1 if possible, otherwise format 2. * Form 1: An integer representing the size of the dimension, which is used for simple named dimensions. * Form 2: A dictionary with the following keys and values: - "size" with an integer value representing the (current) size of the dimension. - "unlimited" with a value of either "1" or "0" to indicate if this dimension is an unlimited dimension. For Unlimited dimensions, the size is initially zero, and as variables extend the length of that dimension, the size value for the dimension increases. That dimension size is shared by all arrays referencing that dimension, so if one array extends an unlimited dimension, it is implicitly extended for all other arrays that reference that dimension. 
This is the standard semantics for unlimited dimensions. Adding unlimited dimensions required a number of other changes to the NCZarr code-base. These included the following. * Did a partial refactor of the slice handling code in zwalk.c to clean it up. * Added a number of tests for unlimited dimensions derived from the same test in nc_test4. * Added several NCZarr specific unlimited tests; more are needed. * Add test of endianness. ## Misc. Other Changes * Modify libdispatch/ncs3sdk_aws.cpp to optionally support use of the AWS Transfer Utility mechanism. This is controlled by the ```#define TRANSFER``` command in that file. It defaults to being disabled. * Parameterize both the standard Unidata S3 bucket (S3TESTBUCKET) and the netcdf-c test data prefix (S3TESTSUBTREE). * Fixed an obscure memory leak in ncdump. * Removed some obsolete unit testing code and test cases. * Uncovered a bug in the netcdf-c handling of big-endian floats and doubles. Have not fixed yet. See tst_h5_endians.c. * Renamed some nczarr_tests testcases to avoid name conflicts with nc_test4. * Modify the semantics of zmap\#ncsmap_write to only allow total rewrite of objects. * Modify the semantics of zodom to properly handle stride > 1. * Add a truncate operation to the libnczarr zmap code.
2023-09-27 06:56:48 +08:00
static int
ziptruncate(const char* surl)
{
int stat = NC_NOERR;
NCURI* url = NULL;
int errorp = 0;
zip_t *zip = NULL;
ZTRACE(6,"url=%s",surl);
ncuriparse(surl,&url);
if(url == NULL) {stat = NC_EURL; goto done;}
zip = zip_open(url->path, ZIP_CREATE | ZIP_TRUNCATE, &errorp);
zip_close(zip);
done:
ncurifree(url);
return stat;
}
/**************************************************/
/* Map API */
/**
 * Close a zip-backed map and release all memory owned by it.
 *
 * @param map    the map to close; NULL is accepted and is a no-op
 * @param delete if non-zero, pending changes are discarded and the
 *               file named by the map's root is removed
 * @return NC_NOERR on success
 * @return a mapped libzip error if writing the archive fails
 *
 * The map is always freed, even when an error is returned.
 */
static int
zipclose(NCZMAP* map, int delete)
{
    int stat = NC_NOERR;
    ZZMAP* zzmap = (ZZMAP*)map;

    if(zzmap == NULL) return NC_NOERR;
    ZTRACE(6,"map=%s delete=%d",map->url,delete);

    /* Close the zip */
    if(delete) {
        zip_discard(zzmap->archive);
        NCremove(zzmap->root);
    } else if(zip_close(zzmap->archive) != 0) {
        /* zip_close returns -1 on failure; the real code lives in the archive,
           which stays allocated and must be released explicitly */
        stat = zipmaperr(zzmap);
        if(stat == NC_NOERR) stat = NC_ENCZARR;
        zip_discard(zzmap->archive);
    }
    zzmap->archive = NULL;

    /* Release everything else hanging off the map */
    nczm_clear(map);
    nullfree(zzmap->root);
    nullfree(zzmap->dataset);
    zzmap->root = NULL;
    freesearchcache(zzmap->searchcache);
    free(zzmap);
    return ZUNTRACE(stat);
}
/**************************************************/
/* Object API */
/**
 * Test whether a key names a content-bearing object in the archive.
 *
 * @return NC_NOERR if the key is found as a file
 * @return NC_EEMPTY if the key is a directory or is not found at all
 * @return any other status from zzlookupobj unchanged
 */
static int
zipexists(NCZMAP* map, const char* key)
{
    ZZMAP* zzmap = (ZZMAP*)map;
    ZINDEX ignored = -1; /* index is not needed, only the status */
    int stat = NC_NOERR;

    ZTRACE(6,"map=%s key=%s",map->url,key);
    stat = zzlookupobj(zzmap,key,&ignored);
    /* A missing object is reported the same way as a directory */
    if(stat == NC_ENOOBJECT)
        stat = NC_EEMPTY;
    return ZUNTRACE(stat);
}
/**
 * Report the (uncompressed) length of the object named by a key.
 *
 * @param lenp if non-NULL, receives the length; set to 0 for a
 *             directory or a missing object
 * @return NC_NOERR if the key is a file and its size was obtained
 * @return NC_EEMPTY if the key is a directory or does not exist
 * @return any other lookup/stat error; *lenp is then left untouched
 */
static int
ziplen(NCZMAP* map, const char* key, size64_t* lenp)
{
    ZZMAP* zzmap = (ZZMAP*)map;
    ZINDEX entry = -1;
    size64_t nbytes = 0;
    int stat = NC_NOERR;

    ZTRACE(6,"map=%s key=%s",map->url,key);

    stat = zzlookupobj(zzmap,key,&entry);
    if(stat == NC_NOERR) {
        if((stat = zzlen(zzmap,entry,&nbytes))) goto done;
    } else if(stat == NC_ENOOBJECT) {
        stat = NC_EEMPTY; /* missing is reported like a directory */
        nbytes = 0;
    } else if(stat == NC_EEMPTY) {
        nbytes = 0; /* |dir|==0 */
    } else {
        goto done;
    }
    if(lenp) *lenp = nbytes;

done:
    return ZUNTRACEX(stat,"len=%llu",(lenp?*lenp:777777777777));
}
/**
 * Read count bytes, beginning at offset start, from the object named by key.
 *
 * @param map     the zip map
 * @param key     object key; assumed to begin with '/'
 * @param start   byte offset of the first byte wanted
 * @param count   number of bytes wanted
 * @param content caller-supplied buffer that receives count bytes
 * @return NC_NOERR on success
 * @return NC_EEMPTY if the key is a directory or does not exist
 * @return NC_EINVAL if start+count overflows
 * @return NC_ENOMEM if the scratch buffer cannot be allocated
 * @return NC_EINTERNAL if fewer bytes than requested were available
 * @return a mapped libzip error otherwise
 */
static int
zipread(NCZMAP* map, const char* key, size64_t start, size64_t count, void* content)
{
    int stat = NC_NOERR;
    ZZMAP* zzmap = (ZZMAP*)map; /* cast to true type */
    zip_file_t* zfile = NULL;
    ZINDEX zindex = -1;
    zip_flags_t zipflags = 0;
    int zerrno;
    size64_t endpoint;
    size64_t toread;
    void* target = NULL;
    char* buffer = NULL;
    char* truekey = NULL;
    zip_int64_t red = 0;

    ZTRACE(6,"map=%s key=%s start=%llu count=%llu",map->url,key,start,count);

    switch(stat = zzlookupobj(zzmap,key,&zindex)) {
    case NC_NOERR: break;
    case NC_ENOOBJECT: stat = NC_EEMPTY; /* fall thru */
    case NC_EEMPTY: /* its a dir; fall thru*/
    default: goto done;
    }

    /* Note, assume key[0] == '/' */
    if((stat = nczm_appendn(&truekey,2,zzmap->dataset,key)))
        goto done;

    zfile = zip_fopen(zzmap->archive, truekey, zipflags);
    if(zfile == NULL)
        {stat = (zipmaperr(zzmap)); goto done;}

    /* Ideally, we would like to seek to the start point,
       but that will fail if the file is compressed, so
       we need to read whole thing and extract what we need
    */
    if(start == 0) {
        /* optimize to read directly into content */
        target = content;
        toread = count;
    } else {
        endpoint = start + count;
        if(endpoint < start) /* unsigned wrap-around */
            {stat = NC_EINVAL; goto done;}
        if((size64_t)(size_t)endpoint != endpoint) /* does not fit in size_t */
            {stat = NC_ENOMEM; goto done;}
        if((buffer = malloc((size_t)endpoint))==NULL) /* consider caching this */
            {stat = NC_ENOMEM; goto done;}
        target = buffer;
        toread = endpoint;
    }

    /* read data starting at zero */
    if((red = zip_fread(zfile, target, (zip_uint64_t)toread)) < 0) {
        /* zip_fread records its error in the file handle, not the archive */
        stat = ziperr(zip_file_get_error(zfile));
        if(stat == NC_NOERR) stat = NC_EIO; /* a failed read is never a success */
        goto done;
    }
    if((size64_t)red < toread) {stat = NC_EINTERNAL; goto done;}

    /* Extract what we need */
    if(buffer != NULL)
        memcpy(content,buffer+start,(size_t)count);

done:
    nullfree(truekey);
    nullfree(buffer);
    if(zfile != NULL && (zerrno=zip_fclose(zfile)) != 0) {
        /* keep the first error; a close failure only matters otherwise */
        if(stat == NC_NOERR) stat = ziperrno(zerrno);
    }
    return ZUNTRACE(stat);
}
static int
Mitigate S3 test interference + Unlimited Dimensions in NCZarr This PR started as an attempt to add unlimited dimensions to NCZarr. It did that, but this exposed significant problems with test interference. So this PR is mostly about fixing -- well mitigating anyway -- test interference. The problem of test interference is now documented in the document docs/internal.md. The solutions implemented here are also describe in that document. The solution is somewhat fragile but multiple cleanup mechanisms are provided. Note that this feature requires that the AWS command line utility must be installed. ## Unlimited Dimensions. The existing NCZarr extensions to Zarr are modified to support unlimited dimensions. NCzarr extends the Zarr meta-data for the ".zgroup" object to include netcdf-4 model extensions. This information is stored in ".zgroup" as dictionary named "_nczarr_group". Inside "_nczarr_group", there is a key named "dims" that stores information about netcdf-4 named dimensions. The value of "dims" is a dictionary whose keys are the named dimensions. The value associated with each dimension name has one of two forms Form 1 is a special case of form 2, and is kept for backward compatibility. Whenever a new file is written, it uses format 1 if possible, otherwise format 2. * Form 1: An integer representing the size of the dimension, which is used for simple named dimensions. * Form 2: A dictionary with the following keys and values" - "size" with an integer value representing the (current) size of the dimension. - "unlimited" with a value of either "1" or "0" to indicate if this dimension is an unlimited dimension. For Unlimited dimensions, the size is initially zero, and as variables extend the length of that dimension, the size value for the dimension increases. That dimension size is shared by all arrays referencing that dimension, so if one array extends an unlimited dimension, it is implicitly extended for all other arrays that reference that dimension. 
This is the standard semantics for unlimited dimensions. Adding unlimited dimensions required a number of other changes to the NCZarr code-base. These included the following. * Did a partial refactor of the slice handling code in zwalk.c to clean it up. * Added a number of tests for unlimited dimensions derived from the same test in nc_test4. * Added several NCZarr specific unlimited tests; more are needed. * Add test of endianness. ## Misc. Other Changes * Modify libdispatch/ncs3sdk_aws.cpp to optionally support use of the AWS Transfer Utility mechanism. This is controlled by the ```#define TRANSFER```` command in that file. It defaults to being disabled. * Parameterize both the standard Unidata S3 bucket (S3TESTBUCKET) and the netcdf-c test data prefix (S3TESTSUBTREE). * Fixed an obscure memory leak in ncdump. * Removed some obsolete unit testing code and test cases. * Uncovered a bug in the netcdf-c handling of big-endian floats and doubles. Have not fixed yet. See tst_h5_endians.c. * Renamed some nczarr_tests testcases to avoid name conflicts with nc_test4. * Modify the semantics of zmap\#ncsmap_write to only allow total rewrite of objects. * Modify the semantics of zodom to properly handle stride > 1. * Add a truncate operation to the libnczarr zmap code.
2023-09-27 06:56:48 +08:00
zipwrite(NCZMAP* map, const char* key, size64_t count, const void* content)
{
int stat = NC_NOERR;
ZZMAP* zzmap = (ZZMAP*)map; /* cast to true type */
char* truekey = NULL;
zip_flags_t zflags = 0;
zip_source_t* zs = NULL;
ZINDEX zindex = -1;
zip_int32_t compression = 0;
zip_error_t zerror;
void* localbuffer = NULL;
Upgrade the nczarr code to match Zarr V2 Re: https://github.com/zarr-developers/zarr-python/pull/716 The Zarr version 2 spec has been extended to include the ability to choose the dimension separator in chunk name keys. The legal separators has been extended from {'.'} to {'.' '/'}. So now it is possible to use a key like "0/1/2/0" for chunk names. This PR implements this for NCZarr. The V2 spec now says that this separator can be set on a per-variable basis. For now, I have chosen to allow this be set only globally by adding a key named "ZARR.DIMENSION_SEPARATOR=<char>" in the .daprc/.dodsrc/ncrc file. Currently, the only legal separator characters are '.' (the default) and '/'. On writing, this key will only be written if its value is different than the default. This change caused problems because supporting a separator of '/' is difficult to parse when keys/paths use '/' as the path separator. A test case was added for this. Additionally, make nczarr be enabled default by default. This required some additional changes so that if zip and/or AWS S3 sdk are unavailable, then they are disabled for NCZarr. In addition the following unrelated changes were made. 1. Tested that pure-zarr mode could read an nczarr formatted store. 1. The .rc file handling now merges all known .rc files (.ncrc,.daprc, and .dodsrc) in that order and using those in HOME first, then in current directory. For duplicate entries, the later ones override the earlier ones. This change is to remove some of the conflicts inherent in the current .rc file load process. A set of test cases was also added. 1. Re-order tests in configure.ac and CMakeLists.txt so that if libcurl is not found then the other options that depend upon it properly are disabled. 1. I decided that xarray support should be enabled by default for pure zarr. In order to allow disabling, I added a new mode flag "noxarray". 1. Certain test in nczarr_test depend on use of .dodsrc. 
In order for these to work when testing in parallel, some inter-test dependencies needed to be added. 1. Improved authorization testing to use changes in thredds.ucar.edu
2021-04-25 09:48:15 +08:00
Mitigate S3 test interference + Unlimited Dimensions in NCZarr This PR started as an attempt to add unlimited dimensions to NCZarr. It did that, but this exposed significant problems with test interference. So this PR is mostly about fixing -- well mitigating anyway -- test interference. The problem of test interference is now documented in the document docs/internal.md. The solutions implemented here are also describe in that document. The solution is somewhat fragile but multiple cleanup mechanisms are provided. Note that this feature requires that the AWS command line utility must be installed. ## Unlimited Dimensions. The existing NCZarr extensions to Zarr are modified to support unlimited dimensions. NCzarr extends the Zarr meta-data for the ".zgroup" object to include netcdf-4 model extensions. This information is stored in ".zgroup" as dictionary named "_nczarr_group". Inside "_nczarr_group", there is a key named "dims" that stores information about netcdf-4 named dimensions. The value of "dims" is a dictionary whose keys are the named dimensions. The value associated with each dimension name has one of two forms Form 1 is a special case of form 2, and is kept for backward compatibility. Whenever a new file is written, it uses format 1 if possible, otherwise format 2. * Form 1: An integer representing the size of the dimension, which is used for simple named dimensions. * Form 2: A dictionary with the following keys and values" - "size" with an integer value representing the (current) size of the dimension. - "unlimited" with a value of either "1" or "0" to indicate if this dimension is an unlimited dimension. For Unlimited dimensions, the size is initially zero, and as variables extend the length of that dimension, the size value for the dimension increases. That dimension size is shared by all arrays referencing that dimension, so if one array extends an unlimited dimension, it is implicitly extended for all other arrays that reference that dimension. 
This is the standard semantics for unlimited dimensions. Adding unlimited dimensions required a number of other changes to the NCZarr code-base. These included the following. * Did a partial refactor of the slice handling code in zwalk.c to clean it up. * Added a number of tests for unlimited dimensions derived from the same test in nc_test4. * Added several NCZarr specific unlimited tests; more are needed. * Add test of endianness. ## Misc. Other Changes * Modify libdispatch/ncs3sdk_aws.cpp to optionally support use of the AWS Transfer Utility mechanism. This is controlled by the ```#define TRANSFER```` command in that file. It defaults to being disabled. * Parameterize both the standard Unidata S3 bucket (S3TESTBUCKET) and the netcdf-c test data prefix (S3TESTSUBTREE). * Fixed an obscure memory leak in ncdump. * Removed some obsolete unit testing code and test cases. * Uncovered a bug in the netcdf-c handling of big-endian floats and doubles. Have not fixed yet. See tst_h5_endians.c. * Renamed some nczarr_tests testcases to avoid name conflicts with nc_test4. * Modify the semantics of zmap\#ncsmap_write to only allow total rewrite of objects. * Modify the semantics of zodom to properly handle stride > 1. * Add a truncate operation to the libnczarr zmap code.
2023-09-27 06:56:48 +08:00
ZTRACE(6,"map=%s key=%s count=%llu",map->url,key,count);
zip_error_init(&zerror);
/* Create directories */
if((stat = zzcreategroup(zzmap,key,SKIPLAST))) goto done;
switch(stat = zzlookupobj(zzmap,key,&zindex)) {
case NC_NOERR:
stat = NC_EOBJECT; //goto done; /* Zip files are write once */
zflags |= ZIP_FL_OVERWRITE;
break;
case NC_ENOOBJECT: stat = NC_NOERR; break;
case NC_EEMPTY: /* its a dir; fall thru */
default: goto done;
}
zflags |= ZIP_FL_ENC_UTF_8;
compression = ZIP_CM_STORE;
/* prepend the dataset to get truekey */
/* Note, assume key[0] == '/' */
if((stat = nczm_appendn(&truekey,2,zzmap->dataset,key)))
goto done;
if(count > 0) {
/* Apparently, the buffer to be written needs to be around at zip_close
so we need to make a local copy that will be freed by libzip after it is
no longer needed */
/* Duplicate the buffer */
if((localbuffer = malloc((size_t)count))==NULL)
{stat = NC_ENOMEM; goto done;}
memcpy(localbuffer,content,count);
}
if((zs = zip_source_buffer(zzmap->archive, localbuffer, (zip_uint64_t)count, 1)) == NULL)
{stat = zipmaperr(zzmap); goto done;}
if((zindex = zip_file_add(zzmap->archive, truekey, zs, zflags))<0)
{stat = zipmaperr(zzmap); goto done;}
zs = NULL; localbuffer = NULL;
if(zip_set_file_compression(zzmap->archive, zindex, compression, 0) < 0)
{stat = zipmaperr(zzmap); goto done;}
freesearchcache(zzmap->searchcache); zzmap->searchcache = NULL;
done:
if(zs) zip_source_free(zs);
nullfree(localbuffer);
zip_error_fini(&zerror);
nullfree(truekey);
return ZUNTRACE(stat);
}
/*
Collect the names found immediately under a specified prefix key.
For every archive entry that lies strictly below the prefix, the part
following the prefix is reduced by nczm_segment1 and the result is
appended to matches unless an equal string is already present there.
In theory, the returned list should be sorted in lexical order,
but it possible that it is not.

Note that for zip, it is not possible to get just the keys of length n+1,
so, this search must get all keys and process them one by one.

@param map the zip map
@param prefix0 NULL, "", "/", or a key with a leading '/'
@param matches list receiving newly allocated strings
@return NC_NOERR if success, even if no keys returned.
@return NC_EINVAL if prefix0 does not begin with '/'
@return NC_ENOMEM if an allocation fails
@return NC_EXXX return true error
*/
int
zipsearch(NCZMAP* map, const char* prefix0, NClist* matches)
{
    int stat = NC_NOERR;
    ZZMAP* zzmap = (ZZMAP*)map;
    char* trueprefix = NULL;
    size_t truelen;
    zip_int64_t num_entries, i;
    char** cache = NULL;
    size_t prefixlen;
    NClist* tmp = NULL;

    ZTRACE(6,"map=%s prefix0=%s",map->url,prefix0);

    /* prefix constraints:
       1. prefix is "/"
       2. or prefix has leading '/' and no trailing '/'
    */

    /* Fix up the prefix; including adding the dataset name to the front */
    if(prefix0 == NULL || strlen(prefix0)==0)
        prefix0 = "/";
    /* make sure that prefix0 has leading '/' */
    if(prefix0[0] != '/')
        {stat = NC_EINVAL; goto done;}
    prefixlen = strlen(prefix0);
    truelen = prefixlen+strlen(zzmap->dataset)+1; /* possible trailing '/'*/
    if((trueprefix = (char*)malloc(truelen+1+1))==NULL) /* nul term */
        {stat = NC_ENOMEM; goto done;}
    /* Build the true prefix */
    trueprefix[0] = '\0';
    strlcat(trueprefix,zzmap->dataset,truelen+1);
    strlcat(trueprefix,prefix0,truelen+1); /* recall prefix starts with '/' */
    /* If the prefix did not end in '/', then add it */
    if(prefixlen > 1 && prefix0[prefixlen-1] != '/')
        strlcat(trueprefix,"/",truelen+1);
    truelen = strlen(trueprefix);

    /* Get number of entries */
    num_entries = zip_get_num_entries(zzmap->archive, (zip_flags_t)0);
#ifdef CACHESEARCH
    if(num_entries > 0 && zzmap->searchcache == NULL) {
        /* Release the current cache */
        freesearchcache(zzmap->searchcache);
        zzmap->searchcache = NULL;
        /* Re-build the searchcache */
        if((cache = calloc(sizeof(char*),num_entries+1))==NULL)
            {stat = NC_ENOMEM; goto done;}
        for(i=0;i < num_entries; i++) {
            const char *name = NULL;
            /* get ith entry */
            name = zip_get_name(zzmap->archive, i, (zip_flags_t)0);
            /* zip_get_name returns NULL on error; cache such entries as
               empty names so indices stay aligned and they never match */
            if(name == NULL) name = "";
            /* Add to cache */
            if((cache[i] = strdup(name))==NULL)
                {stat = NC_ENOMEM; goto done;}
        }
        cache[num_entries] = NULL;
        zzmap->searchcache = cache; cache = NULL;
    }
#endif
#ifdef CACHESEARCH
    if(zzmap->searchcache != NULL)
#endif
    {
        const char *key = NULL;
        size_t keylen = 0;
        char* match = NULL;
        const char* p;
        size_t k;

        tmp = nclistnew();
        /* Walk cache looking for names with prefix plus exactly one other segment */
        for(i=0;i < num_entries; i++) {
            /* get ith entry */
#ifdef CACHESEARCH
            key = zzmap->searchcache[i];
#else
            key = zip_get_name(zzmap->archive, i, (zip_flags_t)0);
#endif
            if(key == NULL) continue; /* zip_get_name returns NULL on error */
            keylen = strlen(key);
            /* Does this name begin with trueprefix and extend beyond it?
               (this also rejects empty names, so key+truelen stays in bounds) */
            if(keylen <= truelen || strncmp(key,trueprefix,truelen) != 0)
                continue; /* no match */
            /* skip trueprefix and extract first segment */
            p = (key+truelen);
            /* get seg 1 */
            if((stat = nczm_segment1(p,&match))) goto done;
            nclistpush(tmp,match); match = NULL;
        }
        /* Now remove duplicates */
        for(k=0;k<nclistlength(tmp);k++) {
            size_t j;
            int duplicate = 0;
            char* copy = NULL;
            const char* is = nclistget(tmp,k);
            for(j=0;j<nclistlength(matches);j++) {
                const char* js = nclistget(matches,j);
                if(strcmp(js,is)==0) {duplicate = 1; break;} /* duplicate */
            }
            if(duplicate) continue;
            /* tmp keeps ownership of its strings, so hand out a copy */
            if((copy = strdup(is))==NULL)
                {stat = NC_ENOMEM; goto done;}
            nclistpush(matches,copy);
        }
    }

done:
    nclistfreeall(tmp);
    if(cache != NULL) freesearchcache(cache);
    nullfree(trueprefix);
    return ZUNTRACEX(stat,"|matches|=%d",(int)nclistlength(matches));
}
/**************************************************/
/* Utilities */
/* Guarantee existence of a group.
   The key is split into segments and, starting from the dataset name,
   a directory entry is added for each successive segment, leaving off
   the last nskip segments. A directory that already exists is not an error.
   @return NC_NOERR on success
   @return the nczm_split status or a mapped libzip error otherwise
*/
static int
zzcreategroup(ZZMAP* zzmap, const char* key, int nskip)
{
    int stat = NC_NOERR;
    int idx, ndirs;
    NCbytes* dirpath = ncbytesnew();
    NClist* parts = nclistnew();
    zip_flags_t addflags = ZIP_FL_ENC_UTF_8;

    ZTRACE(7,"map=%s key=%s nskip=%d",zzmap->map.url,key,nskip);

    stat = nczm_split(key,parts);
    if(stat) goto done;

    ndirs = nclistlength(parts);
    ndirs -= nskip; /* leave off last nskip segments */

    /* Start with the dataset */
    ncbytescat(dirpath,zzmap->dataset);
    for(idx=0;idx<ndirs;idx++) {
        const char* part = nclistget(parts,idx);
        /* extend the path by one more segment */
        ncbytescat(dirpath,"/");
        ncbytescat(dirpath,part);
        /* open and/or create the directory */
        if(zip_dir_add(zzmap->archive, ncbytescontents(dirpath), addflags) >= 0)
            continue;
        stat = zipmaperr(zzmap);
        if(stat != NC_EOBJECT)
            goto done;
        stat = NC_NOERR; /* already present: ok */
    }

done:
    ncbytesfree(dirpath);
    nclistfreeall(parts);
    return ZUNTRACE(stat);
}
/* Lookup a key
   The key is first searched as a file entry and, failing that,
   as a directory entry (the same name with a trailing '/').
   On return *zindex holds the result of the last locate call made.
@return NC_NOERR if found and is a content-bearing object
@return NC_ENOOBJECT if not found
@return NC_EEMPTY if a dir
@return NC_EINVAL if key is NULL
*/
static int
zzlookupobj(ZZMAP* zzmap, const char* key, ZINDEX* zindex)
{
    int stat = NC_NOERR;
    char* filename = NULL;
    char* dirname = NULL;

    ZTRACE(7,"map=%s key=%s",zzmap->map.url,key);

    if(key == NULL)
        {stat = NC_EINVAL; goto done;}

    /* Note, assume key[0] == '/' */
    stat = nczm_appendn(&filename,2,zzmap->dataset,key);
    if(stat) goto done;

    /* See if exists as a file */
    *zindex = zip_name_locate(zzmap->archive, filename, 0);
    if(*zindex >= 0)
        goto done; /* found as a file; stat is NC_NOERR */

    /* do a second check to see if the name exists as a dir */
    stat = nczm_appendn(&dirname,2,filename,"/");
    if(stat) goto done;
    *zindex = zip_name_locate(zzmap->archive, dirname, 0);
    if(*zindex < 0)
        stat = zipmaperr(zzmap);
    else
        stat = NC_EEMPTY; /* signal a directory */

done:
    nullfree(filename);
    nullfree(dirname);
    return ZUNTRACE(stat);
}
/* Get length given the index
   @param zzmap the zip map
   @param zindex index of the archive entry; must be >= 0
   @param lenp if non-NULL, receives the uncompressed size
   @return NC_NOERR on success
   @return a mapped libzip error if the entry cannot be stat'ed
*/
static int
zzlen(ZZMAP* zzmap, ZINDEX zindex, size64_t* lenp)
{
    int stat = NC_NOERR;
    size64_t len = 0;
    zip_stat_t statbuf;
    zip_flags_t zipflags = 0;

    assert(zindex >= 0);
    /* Trace the map's url (a string), not the map struct itself */
    ZTRACE(6,"zzmap=%s index=%llu",zzmap->map.url,(unsigned long long)zindex);
    zip_stat_init(&statbuf);
    if(zip_stat_index(zzmap->archive,zindex,zipflags,&statbuf) < 0)
        {stat = (zipmaperr(zzmap)); goto done;}
    assert(statbuf.valid & ZIP_STAT_SIZE);
    len = statbuf.size; /* Always return uncompressed size */
    if(lenp) *lenp = len;

done:
    return ZUNTRACEX(stat,"len=%llu",(lenp?*lenp:777777777777));
}
/* Free a NULL-terminated vector of heap strings, and the vector itself.
   A NULL vector is accepted and ignored. */
static void
freesearchcache(char** cache)
{
    size_t n;

    if(cache == NULL) return;
    for(n = 0; cache[n] != NULL; n++)
        free(cache[n]);
    free(cache);
}
/**************************************************/
/* External API objects */
NCZMAP_DS_API zmap_zip = {
NCZM_ZIP_V1,
ZIP_PROPERTIES,
zipcreate,
zipopen,
Mitigate S3 test interference + Unlimited Dimensions in NCZarr This PR started as an attempt to add unlimited dimensions to NCZarr. It did that, but this exposed significant problems with test interference. So this PR is mostly about fixing -- well mitigating anyway -- test interference. The problem of test interference is now documented in the document docs/internal.md. The solutions implemented here are also describe in that document. The solution is somewhat fragile but multiple cleanup mechanisms are provided. Note that this feature requires that the AWS command line utility must be installed. ## Unlimited Dimensions. The existing NCZarr extensions to Zarr are modified to support unlimited dimensions. NCzarr extends the Zarr meta-data for the ".zgroup" object to include netcdf-4 model extensions. This information is stored in ".zgroup" as dictionary named "_nczarr_group". Inside "_nczarr_group", there is a key named "dims" that stores information about netcdf-4 named dimensions. The value of "dims" is a dictionary whose keys are the named dimensions. The value associated with each dimension name has one of two forms Form 1 is a special case of form 2, and is kept for backward compatibility. Whenever a new file is written, it uses format 1 if possible, otherwise format 2. * Form 1: An integer representing the size of the dimension, which is used for simple named dimensions. * Form 2: A dictionary with the following keys and values" - "size" with an integer value representing the (current) size of the dimension. - "unlimited" with a value of either "1" or "0" to indicate if this dimension is an unlimited dimension. For Unlimited dimensions, the size is initially zero, and as variables extend the length of that dimension, the size value for the dimension increases. That dimension size is shared by all arrays referencing that dimension, so if one array extends an unlimited dimension, it is implicitly extended for all other arrays that reference that dimension. 
This is the standard semantics for unlimited dimensions. Adding unlimited dimensions required a number of other changes to the NCZarr code-base. These included the following. * Did a partial refactor of the slice handling code in zwalk.c to clean it up. * Added a number of tests for unlimited dimensions derived from the same test in nc_test4. * Added several NCZarr specific unlimited tests; more are needed. * Add test of endianness. ## Misc. Other Changes * Modify libdispatch/ncs3sdk_aws.cpp to optionally support use of the AWS Transfer Utility mechanism. This is controlled by the ```#define TRANSFER```` command in that file. It defaults to being disabled. * Parameterize both the standard Unidata S3 bucket (S3TESTBUCKET) and the netcdf-c test data prefix (S3TESTSUBTREE). * Fixed an obscure memory leak in ncdump. * Removed some obsolete unit testing code and test cases. * Uncovered a bug in the netcdf-c handling of big-endian floats and doubles. Have not fixed yet. See tst_h5_endians.c. * Renamed some nczarr_tests testcases to avoid name conflicts with nc_test4. * Modify the semantics of zmap\#ncsmap_write to only allow total rewrite of objects. * Modify the semantics of zodom to properly handle stride > 1. * Add a truncate operation to the libnczarr zmap code.
2023-09-27 06:56:48 +08:00
ziptruncate,
};
/* Table of the per-map operations implemented in this file.
   The initializer is positional, so the order must match the
   NCZMAP_API declaration (not visible here). */
static NCZMAP_API zapi = {
NCZM_ZIP_V1, /* implementation version */
zipclose, /* close the map, optionally deleting the file */
zipexists, /* test whether a key names a content-bearing object */
ziplen, /* get the length of an object */
zipread, /* read a byte range of an object */
zipwrite, /* write a whole object */
zipsearch, /* list names immediately under a prefix */
};
/* Translate the error currently recorded in the map's archive
   into a netcdf status code. */
static int
zipmaperr(ZZMAP* zzmap)
{
    return ziperr((zip_error_t*)zip_get_error(zzmap->archive));
}
/* Translate the libzip code held in a zip_error_t into a netcdf status code. */
static int
ziperr(zip_error_t* zerror)
{
    return ziperrno(zip_error_code_zip(zerror));
}
/* Translate a libzip error number (ZIP_ER_XXX) into a netcdf status code.
   Codes without a specific translation yield NC_ENCZARR. */
static int
ziperrno(int zerror)
{
    switch (zerror) {
    case ZIP_ER_OK:
        return NC_NOERR;
    case ZIP_ER_EXISTS:
    case ZIP_ER_CHANGED:
        return NC_EOBJECT;
    case ZIP_ER_MEMORY:
        return NC_ENOMEM;
    case ZIP_ER_SEEK:
    case ZIP_ER_READ:
    case ZIP_ER_WRITE:
    case ZIP_ER_TMPOPEN:
    case ZIP_ER_CRC:
        return NC_EIO;
    case ZIP_ER_ZIPCLOSED:
        return NC_EBADID;
    case ZIP_ER_NOENT:
    case ZIP_ER_DELETED:
        return NC_ENOOBJECT;
    case ZIP_ER_OPEN:
        return NC_EACCESS;
    case ZIP_ER_INVAL:
        return NC_EINVAL;
    case ZIP_ER_INTERNAL:
        return NC_EINTERNAL;
    case ZIP_ER_REMOVE:
        return NC_ECANTREMOVE;
    case ZIP_ER_RDONLY:
        return NC_EPERM;
    default:
        break;
    }
    return NC_ENCZARR;
}