/* Copyright 2018, University Corporation for Atmospheric
 * Research. See COPYRIGHT file for copying and redistribution
 * conditions. */

/**
 * @file @internal The functions which control NCZ
 * caching. These caching controls allow the user to change the cache
 * sizes of ZARR before opening files.
 *
 * @author Dennis Heimbigner, Ed Hartnett
 */

#include "zincludes.h"
#include "zcache.h"
#include "ncxcache.h"
#include "zfilter.h"
#include <stddef.h>

#undef DEBUG
#undef FLUSH

#define LEAFLEN 32

/* Sentinel for constraincache(): constrain to the cache parameters */
#define USEPARAMSIZE 0xffffffffffffffff

/* Forward */
static int get_chunk(NCZChunkCache* cache, NCZCacheEntry* entry);
static int put_chunk(NCZChunkCache* cache, NCZCacheEntry* entry);
static int verifycache(NCZChunkCache* cache);
static int flushcache(NCZChunkCache* cache);
static int constraincache(NCZChunkCache* cache, size64_t needed);

static void
setmodified(NCZCacheEntry* e, int tf)
{
    e->modified = tf;
}

/**************************************************/
/* Dispatch table per-var cache functions */

/**
 * @internal Set the chunk cache size for a variable. This is the
 * internal function called by nc_set_var_chunk_cache().
 *
 * @param ncid File ID.
 * @param varid Variable ID.
 * @param cachesize Size of the cache in bytes.
 * @param nelems Maximum number of entries in the cache.
 * @param preemption Controls cache swapping; must be in [0,1].
 *
 * @returns ::NC_NOERR No error.
 * @returns ::NC_EBADID Bad ncid.
 * @returns ::NC_ENOTVAR Invalid variable ID.
 * @returns ::NC_ESTRICTNC3 Attempting netcdf-4 operation on a strict
 * nc3 netcdf-4 file.
 * @returns ::NC_EINVAL Invalid input.
 * @returns ::NC_EHDFERR HDF5 error.
 * @author Ed Hartnett
 */
int
NCZ_set_var_chunk_cache(int ncid, int varid, size_t cachesize, size_t nelems, float preemption)
{
    NC_GRP_INFO_T *grp;
    NC_FILE_INFO_T *h5;
    NC_VAR_INFO_T *var;
    NCZ_VAR_INFO_T *zvar;
    int retval = NC_NOERR;

    /* Check input for validity. */
    if (preemption < 0 || preemption > 1)
        {retval = NC_EINVAL; goto done;}

    /* Find info for this file and group, and set pointer to each. */
    if ((retval = nc4_find_nc_grp_h5(ncid, NULL, &grp, &h5)))
        goto done;
    assert(grp && h5);

    /* Find the var. */
    if (!(var = (NC_VAR_INFO_T *)ncindexith(grp->vars, varid)))
        {retval = NC_ENOTVAR; goto done;}
    assert(var && var->hdr.id == varid);

    zvar = (NCZ_VAR_INFO_T*)var->format_var_info;
    assert(zvar != NULL && zvar->cache != NULL);

    /* Set the values. */
    var->chunkcache.size = cachesize;
    var->chunkcache.nelems = nelems;
    var->chunkcache.preemption = preemption;

    /* Fix up cache */
    if((retval = NCZ_adjust_var_cache(var))) goto done;

done:
    return retval;
}
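
/* Illustrative usage sketch (not part of this file): how a client
 * program reaches the function above through the public netCDF API.
 * The file path (including its mode fragment) and the variable name
 * are hypothetical. */
#if 0
#include <netcdf.h>

int
example_tune_chunk_cache(void)
{
    int ncid, varid;
    if(nc_open("example.zarr#mode=nczarr,file", NC_WRITE, &ncid)) return 1;
    if(nc_inq_varid(ncid, "temperature", &varid)) return 1;
    /* 64 MiB cache, at most 1009 chunks, preemption 0.75 */
    if(nc_set_var_chunk_cache(ncid, varid, 64<<20, 1009, 0.75f)) return 1;
    return nc_close(ncid);
}
#endif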

/**
 * @internal Adjust the chunk cache of a var for better
 * performance.
 *
 * @note For contiguous and compact storage vars, or when parallel I/O
 * is in use, this function will do nothing and return ::NC_NOERR;
 *
 * @param var Pointer to var info struct.
 *
 * @return ::NC_NOERR No error.
 * @author Ed Hartnett
 */
int
NCZ_adjust_var_cache(NC_VAR_INFO_T *var)
{
    int stat = NC_NOERR;
    NCZ_VAR_INFO_T* zvar = (NCZ_VAR_INFO_T*)var->format_var_info;
    NCZChunkCache* zcache = NULL;

    zcache = zvar->cache;
    if(zcache->valid) goto done;

#ifdef DEBUG
    fprintf(stderr,"xxx: adjusting cache for: %s\n",var->hdr.name);
#endif

    /* Completely empty the cache */
    flushcache(zcache);

    /* Reclaim any existing fill_chunk */
    if((stat = NCZ_reclaim_fill_chunk(zcache))) goto done;

    /* Reset the parameters */
    zvar->cache->params.size = var->chunkcache.size;
    zvar->cache->params.nelems = var->chunkcache.nelems;
    zvar->cache->params.preemption = var->chunkcache.preemption;
#ifdef DEBUG
    fprintf(stderr,"%s.cache.adjust: size=%ld nelems=%ld\n",
            var->hdr.name,(unsigned long)zvar->cache->maxsize,(unsigned long)zvar->cache->maxentries);
#endif
    /* One more thing: adjust the chunksize and count */
    zcache->chunksize = zvar->chunksize;
    zcache->chunkcount = 1;
    if(var->ndims > 0) {
        int i;
        for(i=0;i<var->ndims;i++) {
            zcache->chunkcount *= var->chunksizes[i];
        }
    }
    zcache->valid = 1;

done:
    return stat;
}
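
/* Worked example (illustrative): for a 3-D variable with chunk sizes
 * {4,5,6}, the loop above yields chunkcount = 4*5*6 = 120 elements per
 * chunk; each cache entry then holds 120 elements of uncompressed data. */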

/**************************************************/
/**
 * Create a chunk cache object
 *
 * @param var Containing variable.
 * @param chunksize Size in bytes of one chunk (one cache entry).
 * @param dimsep Dimension separator character for chunk keys.
 * @param cachep Return the new cache pointer.
 *
 * @return ::NC_NOERR No error.
 * @return ::NC_EINVAL Invalid input (e.g. a zero chunksize).
 * @author Dennis Heimbigner, Ed Hartnett
 */
int
NCZ_create_chunk_cache(NC_VAR_INFO_T* var, size64_t chunksize, char dimsep, NCZChunkCache** cachep)
{
    int stat = NC_NOERR;
    NCZChunkCache* cache = NULL;
    void* fill = NULL;
    NCZ_VAR_INFO_T* zvar = NULL;

    if(chunksize == 0) return NC_EINVAL;

    zvar = (NCZ_VAR_INFO_T*)var->format_var_info;
    if((cache = calloc(1,sizeof(NCZChunkCache))) == NULL)
        {stat = NC_ENOMEM; goto done;}
    cache->var = var;
    cache->ndims = var->ndims + zvar->scalar;
    cache->fillchunk = NULL;
    cache->chunksize = chunksize;
    cache->dimension_separator = dimsep;
    zvar->cache = cache;

    cache->chunkcount = 1;
    if(var->ndims > 0) {
        int i;
        for(i=0;i<var->ndims;i++) {
            cache->chunkcount *= var->chunksizes[i];
        }
    }

    /* Set default cache parameters */
    cache->params = NC_getglobalstate()->chunkcache;

#ifdef FLUSH
    cache->maxentries = 1;
#endif

#ifdef DEBUG
    fprintf(stderr,"%s.cache: nelems=%ld size=%ld\n",
            var->hdr.name,(unsigned long)cache->maxentries,(unsigned long)cache->maxsize);
#endif
    if((stat = ncxcachenew(LEAFLEN,&cache->xcache))) goto done;
    if((cache->mru = nclistnew()) == NULL)
        {stat = NC_ENOMEM; goto done;}
    nclistsetalloc(cache->mru,cache->params.nelems);

    if(cachep) {*cachep = cache; cache = NULL;}

done:
    nullfree(fill);
    NCZ_free_chunk_cache(cache);
    return THROW(stat);
}
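
/* Illustrative note: for a scalar variable (var->ndims == 0 and
 * zvar->scalar == 1), cache->ndims becomes 1 and the chunkcount loop
 * is skipped, so the cache treats the scalar as a single-element chunk. */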

static void
free_cache_entry(NCZChunkCache* cache, NCZCacheEntry* entry)
{
    if(entry) {
        int tid = cache->var->type_info->hdr.id;
        if(tid == NC_STRING && !entry->isfixedstring) {
            NC_reclaim_data(cache->var->container->nc4_info->controller,tid,entry->data,cache->chunkcount);
        }
        nullfree(entry->data);
        nullfree(entry->key.varkey);
        nullfree(entry->key.chunkkey);
        nullfree(entry);
    }
}

void
NCZ_free_chunk_cache(NCZChunkCache* cache)
{
    if(cache == NULL) return;

    ZTRACE(4,"cache.var=%s",cache->var->hdr.name);

    /* Iterate over the entries */
    while(nclistlength(cache->mru) > 0) {
        void* ptr;
        NCZCacheEntry* entry = nclistremove(cache->mru,0);
        (void)ncxcacheremove(cache->xcache,entry->hashkey,&ptr);
        assert(ptr == entry);
        free_cache_entry(cache,entry);
    }
#ifdef DEBUG
    fprintf(stderr,"|cache.free|=%ld\n",nclistlength(cache->mru));
#endif
    ncxcachefree(cache->xcache);
    nclistfree(cache->mru);
    cache->mru = NULL;
    (void)NCZ_reclaim_fill_chunk(cache);
    nullfree(cache);
    (void)ZUNTRACE(NC_NOERR);
}

/* Return the size in bytes of one cache entry (i.e. one chunk) */
size64_t
NCZ_cache_entrysize(NCZChunkCache* cache)
{
    assert(cache);
    return cache->chunksize;
}

/* Return number of active entries in cache */
size64_t
NCZ_cache_size(NCZChunkCache* cache)
{
    assert(cache);
    return nclistlength(cache->mru);
}

int
NCZ_read_cache_chunk(NCZChunkCache* cache, const size64_t* indices, void** datap)
{
    int stat = NC_NOERR;
    int rank = cache->ndims;
    NCZCacheEntry* entry = NULL;
    ncexhashkey_t hkey = 0;
    int created = 0;

    /* the hash key */
    hkey = ncxcachekey(indices,sizeof(size64_t)*cache->ndims);
    /* See if already in cache */
    stat = ncxcachelookup(cache->xcache,hkey,(void**)&entry);
    switch(stat) {
    case NC_NOERR:
        /* Move to front of the lru */
        (void)ncxcachetouch(cache->xcache,hkey);
        break;
    case NC_ENOOBJECT:
        entry = NULL; /* not found */
        break;
    default: goto done;
    }

    if(entry == NULL) { /*!found*/
        /* Create a new entry */
        if((entry = calloc(1,sizeof(NCZCacheEntry)))==NULL)
            {stat = NC_ENOMEM; goto done;}
        memcpy(entry->indices,indices,rank*sizeof(size64_t));
        /* Create the key for this cache */
        if((stat = NCZ_buildchunkpath(cache,indices,&entry->key))) goto done;
        entry->hashkey = hkey;
        assert(entry->data == NULL && entry->size == 0);
        /* Try to read the object from "disk"; might change size; will create if non-existent */
        if((stat=get_chunk(cache,entry))) goto done;
        assert(entry->data != NULL);
        /* Ensure cache constraints not violated; but do it before entry is added */
        if((stat=verifycache(cache))) goto done;
        nclistpush(cache->mru,entry);
        if((stat = ncxcacheinsert(cache->xcache,entry->hashkey,entry))) goto done;
    }

#ifdef DEBUG
    fprintf(stderr,"|cache.read.lru|=%ld\n",nclistlength(cache->mru));
#endif
    if(datap) *datap = entry->data;
    entry = NULL;

done:
    if(created && stat == NC_NOERR) stat = NC_EEMPTY; /* tell upper layers */
    if(entry) free_cache_entry(cache,entry);
    return THROW(stat);
}
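
/* Illustrative sketch (assumption, not library code): how a caller in
 * the zarr read path might use NCZ_read_cache_chunk to obtain the chunk
 * holding a given element; the chunk-grid index is the standard
 * floor-division mapping of element coordinates by chunk sizes. */
#if 0
static int
example_fetch_containing_chunk(NCZChunkCache* cache,
                               const size64_t* point, /* element coordinates */
                               const size64_t* chunksizes,
                               void** chunkdatap)
{
    size64_t indices[NC_MAX_VAR_DIMS];
    int i;
    for(i=0;i<cache->ndims;i++)
        indices[i] = point[i] / chunksizes[i]; /* chunk grid coordinate */
    /* Returns a pointer into the cache; the cache retains ownership. */
    return NCZ_read_cache_chunk(cache, indices, chunkdatap);
}
#endif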

#if 0
int
NCZ_write_cache_chunk(NCZChunkCache* cache, const size64_t* indices, void* content)
{
    int stat = NC_NOERR;
    int rank = cache->ndims;
    NCZCacheEntry* entry = NULL;
    ncexhashkey_t hkey;

    /* create the hash key */
    hkey = ncxcachekey(indices,sizeof(size64_t)*cache->ndims);

    if(entry == NULL) { /*!found*/
        /* Create a new entry */
        if((entry = calloc(1,sizeof(NCZCacheEntry)))==NULL)
            {stat = NC_ENOMEM; goto done;}
        memcpy(entry->indices,indices,rank*sizeof(size64_t));
        if((stat = NCZ_buildchunkpath(cache,indices,&entry->key))) goto done;
        entry->hashkey = hkey;
        /* Create the local copy space */
        entry->size = cache->chunksize;
        if((entry->data = calloc(1,cache->chunksize)) == NULL)
            {stat = NC_ENOMEM; goto done;}
        memcpy(entry->data,content,cache->chunksize);
    }
    setmodified(entry,1);
    nclistpush(cache->mru,entry); /* MRU order */
#ifdef DEBUG
    fprintf(stderr,"|cache.write|=%ld\n",nclistlength(cache->mru));
#endif
    entry = NULL;

    /* Ensure cache constraints not violated */
    if((stat=verifycache(cache))) goto done;

done:
    if(entry) free_cache_entry(cache,entry);
    return THROW(stat);
}
#endif

/* Constrain cache */
static int
verifycache(NCZChunkCache* cache)
{
    int stat = NC_NOERR;

#if 0
    /* Sanity check; make sure at least one entry is always allowed */
    if(nclistlength(cache->mru) == 1)
        goto done;
#endif
    if((stat = constraincache(cache,USEPARAMSIZE))) goto done;
done:
    return stat;
}

/* Completely flush cache */

static int
flushcache(NCZChunkCache* cache)
{
    int stat = NC_NOERR;
#if 0
    size_t oldsize = cache->params.size;
    cache->params.size = 0;
    stat = constraincache(cache,USEPARAMSIZE);
    cache->params.size = oldsize;
#else
    stat = constraincache(cache,USEPARAMSIZE);
#endif
    return stat;
}

/* Remove entries to ensure the cache is not
   violating any of its constraints.
   On entry, constraints might be violated.
   Make sure that any entry currently in use (NULL => no constraint) is not reclaimed.
   @param cache the cache to constrain
   @param needed make sure there is room for this much space;
          USEPARAMSIZE => ensure no more than the cache params size is used.
*/

static int
constraincache(NCZChunkCache* cache, size64_t needed)
{
    int stat = NC_NOERR;
    size64_t final_size;

    /* If the cache is empty then do nothing */
    if(cache->used == 0) goto done;

    if(needed == USEPARAMSIZE)
        final_size = cache->params.size;
    else if(cache->used > needed)
        final_size = cache->used - needed;
    else
        final_size = 0;

    /* Flush from LRU end if we are at capacity */
    while(nclistlength(cache->mru) > cache->params.nelems || cache->used > final_size) {
        size_t i;
        void* ptr;
        NCZCacheEntry* e = ncxcachelast(cache->xcache); /* last entry is the least recently used */
        if(e == NULL) break;
        if((stat = ncxcacheremove(cache->xcache,e->hashkey,&ptr))) goto done;
        assert(e == ptr);
        for(i=0;i<nclistlength(cache->mru);i++) {
            e = nclistget(cache->mru,i);
            if(ptr == e) break;
        }
        assert(e != NULL);
        assert(i >= 0 && i < nclistlength(cache->mru));
        nclistremove(cache->mru,i);
        assert(cache->used >= e->size);
        /* Note that |old chunk data| may not be same as |new chunk data| because of filters */
        cache->used -= e->size; /* old size */
        if(e->modified) /* flush to file */
            stat=put_chunk(cache,e);
        /* reclaim */
        nullfree(e->data); nullfree(e->key.varkey); nullfree(e->key.chunkkey); nullfree(e);
    }
#ifdef DEBUG
    fprintf(stderr,"|cache.makeroom|=%ld\n",nclistlength(cache->mru));
#endif
done:
    return stat;
}
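
/* Worked example (illustrative): with params.size = 16 MiB and
 * cache->used = 20 MiB, a call with needed == USEPARAMSIZE sets
 * final_size = 16 MiB, so LRU entries are evicted until at most 16 MiB
 * remain.  A call with needed == 4 MiB sets final_size = 20 - 4 = 16 MiB,
 * which likewise frees room for a 4 MiB chunk relative to current usage. */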

/**
Push modified cache entries to disk.
Also make sure the cache size is correct.
@param cache the cache to flush
@return NC_EXXX error
*/
int
NCZ_flush_chunk_cache(NCZChunkCache* cache)
{
    int stat = NC_NOERR;
    size_t i;

    ZTRACE(4,"cache.var=%s |cache|=%d",cache->var->hdr.name,(int)nclistlength(cache->mru));

    if(NCZ_cache_size(cache) == 0) goto done;

    /* Iterate over the entries in the MRU list */
    for(i=0;i<nclistlength(cache->mru);i++) {
        NCZCacheEntry* entry = nclistget(cache->mru,i);
        if(entry->modified) {
            /* Write out this chunk in toto */
            if((stat=put_chunk(cache,entry)))
                goto done;
        }
        setmodified(entry,0);
    }
    /* Re-compute space used */
    cache->used = 0;
    for(i=0;i<nclistlength(cache->mru);i++) {
        NCZCacheEntry* entry = nclistget(cache->mru,i);
        cache->used += entry->size;
    }
    /* Make sure cache size and nelems are correct */
    if((stat=verifycache(cache))) goto done;

done:
    return ZUNTRACE(stat);
}
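
/* Illustrative sketch (assumption, not library code): a sync or close
 * path might flush the cache of every variable in a group so that all
 * modified chunks reach storage before the file is closed. */
#if 0
static int
example_flush_group(NC_GRP_INFO_T* grp)
{
    size_t i;
    for(i=0;i<ncindexsize(grp->vars);i++) {
        NC_VAR_INFO_T* var = (NC_VAR_INFO_T*)ncindexith(grp->vars,i);
        NCZ_VAR_INFO_T* zvar = (NCZ_VAR_INFO_T*)var->format_var_info;
        int stat = NCZ_flush_chunk_cache(zvar->cache);
        if(stat) return stat;
    }
    return NC_NOERR;
}
#endif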

/* Ensure existence of some kind of fill chunk */
int
NCZ_ensure_fill_chunk(NCZChunkCache* cache)
{
    int i, stat = NC_NOERR;
    NC_VAR_INFO_T* var = cache->var;
    nc_type typeid = var->type_info->hdr.id;
    size_t typesize = var->type_info->size;

    if(cache->fillchunk) goto done;

    if((cache->fillchunk = malloc(cache->chunksize))==NULL)
        {stat = NC_ENOMEM; goto done;}
    if(var->no_fill) {
        /* use zeros */
        memset(cache->fillchunk,0,cache->chunksize);
        goto done;
    }
    if((stat = NCZ_ensure_fill_value(var))) goto done;
    if(typeid == NC_STRING) {
        char* src = *((char**)(var->fill_value));
        char** dst = (char**)(cache->fillchunk);
        for(i=0;i<cache->chunkcount;i++) dst[i] = strdup(src);
    } else
    switch (typesize) {
    case 1: {
        unsigned char c = *((unsigned char*)var->fill_value);
        memset(cache->fillchunk,c,cache->chunksize);
    } break;
    case 2: {
        unsigned short fv = *((unsigned short*)var->fill_value);
        unsigned short* p2 = (unsigned short*)cache->fillchunk;
        for(i=0;i<cache->chunksize;i+=typesize) *p2++ = fv;
    } break;
    case 4: {
        unsigned int fv = *((unsigned int*)var->fill_value);
        unsigned int* p4 = (unsigned int*)cache->fillchunk;
        for(i=0;i<cache->chunksize;i+=typesize) *p4++ = fv;
    } break;
    case 8: {
        unsigned long long fv = *((unsigned long long*)var->fill_value);
        unsigned long long* p8 = (unsigned long long*)cache->fillchunk;
        for(i=0;i<cache->chunksize;i+=typesize) *p8++ = fv;
    } break;
    default: {
        unsigned char* p;
        for(p=cache->fillchunk,i=0;i<cache->chunksize;i+=typesize,p+=typesize)
            memcpy(p,var->fill_value,typesize);
    } break;
    }
done:
    return stat;
}
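
/* Worked example (illustrative): for an NC_SHORT variable (typesize 2)
 * and a 240-byte chunk, the case-2 branch above stores 240/2 = 120
 * copies of the fill value, one per chunk element. */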

int
NCZ_reclaim_fill_chunk(NCZChunkCache* zcache)
{
    int stat = NC_NOERR;
    if(zcache && zcache->fillchunk) {
        NC_VAR_INFO_T* var = zcache->var;
        int tid = var->type_info->hdr.id;
        size_t chunkcount = zcache->chunkcount;
        stat = NC_reclaim_data_all(var->container->nc4_info->controller,tid,zcache->fillchunk,chunkcount);
        zcache->fillchunk = NULL;
    }
    return stat;
}

int
NCZ_chunk_cache_modify(NCZChunkCache* cache, const size64_t* indices)
{
    int stat = NC_NOERR;
    ncexhashkey_t hkey = 0;
    NCZCacheEntry* entry = NULL;

    /* the hash key */
    hkey = ncxcachekey(indices,sizeof(size64_t)*cache->ndims);

    /* See if already in cache */
    if((stat=ncxcachelookup(cache->xcache, hkey, (void**)&entry))) {stat = NC_EINTERNAL; goto done;}
    setmodified(entry,1);

done:
    return THROW(stat);
}

/**************************************************/

/*
From Zarr V2 Specification:
"The compressed sequence of bytes for each chunk is stored under
a key formed from the index of the chunk within the grid of
chunks representing the array. To form a string key for a
chunk, the indices are converted to strings and concatenated
with the dimension_separator character ('.' or '/') separating
each index. For example, given an array with shape (10000,
10000) and chunk shape (1000, 1000) there will be 100 chunks
laid out in a 10 by 10 grid. The chunk with indices (0, 0)
provides data for rows 0-1000 and columns 0-1000 and is stored
under the key "0.0"; the chunk with indices (2, 4) provides data
for rows 2000-3000 and columns 4000-5000 and is stored under the
key "2.4"; etc."
*/

/**
 * Build the string key for a chunk from its indices.
 *
 * @param R Rank
 * @param chunkindices The chunk indices
 * @param dimsep the dimension separator
 * @param keyp Return the chunk key string
 *
 * @return ::NC_NOERR No error.
 */
int
NCZ_buildchunkkey(size_t R, const size64_t* chunkindices, char dimsep, char** keyp)
{
    int stat = NC_NOERR;
    size_t r;
    NCbytes* key = ncbytesnew();

    if(keyp) *keyp = NULL;

    assert(islegaldimsep(dimsep));

    for(r=0;r<R;r++) {
        char sindex[64];
        if(r > 0) ncbytesappend(key,dimsep);
        /* Print as decimal with no leading zeros */
        snprintf(sindex,sizeof(sindex),"%lu",(unsigned long)chunkindices[r]);
        ncbytescat(key,sindex);
    }
    ncbytesnull(key);
    if(keyp) *keyp = ncbytesextract(key);

    ncbytesfree(key);
    return THROW(stat);
}
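
/*
 * Illustrative usage sketch for NCZ_buildchunkkey() (example only; the
 * function name below is hypothetical and the block is not compiled).
 */
#if 0
static void
NCZ_example_buildchunkkey(void)
{
    size64_t indices[2] = {2, 4};
    char* key = NULL;
    /* With the default separator '.', the key is "2.4" */
    if(NCZ_buildchunkkey(2, indices, '.', &key) == NC_NOERR) {
        /* ... use key ... */
    }
    nullfree(key); key = NULL;
    /* With separator '/', the same indices yield "2/4" */
    if(NCZ_buildchunkkey(2, indices, '/', &key) == NC_NOERR) {
        /* ... use key ... */
    }
    nullfree(key);
}
#endif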

/**
 * @internal Push a cache entry's chunk data to the file.
 * If the chunk does not exist, create it.
 *
 * @param cache Pointer to the owning chunk cache.
 * @param entry Cache entry whose data is to be written.
 *
 * @return ::NC_NOERR No error.
 * @author Dennis Heimbigner
 */
static int
put_chunk(NCZChunkCache* cache, NCZCacheEntry* entry)
{
    int stat = NC_NOERR;
    NC_FILE_INFO_T* file = NULL;
    NCZ_FILE_INFO_T* zfile = NULL;
    NCZMAP* map = NULL;
    char* path = NULL;
    nc_type tid = NC_NAT;
    void* strchunk = NULL;

    ZTRACE(5,"cache.var=%s entry.key=%s",cache->var->hdr.name,entry->key);
    LOG((3, "%s: var: %p", __func__, cache->var));

    file = (cache->var->container)->nc4_info;
    zfile = file->format_file_info;
    map = zfile->map;

    /* Collect some info */
    tid = cache->var->type_info->hdr.id;

    if(tid == NC_STRING && !entry->isfixedstring) {
        /* Convert from char* to char[maxstrlen] format */
        int maxstrlen = NCZ_get_maxstrlen((NC_OBJ*)cache->var);
        assert(maxstrlen > 0);
        if((strchunk = malloc((size_t)cache->chunkcount * (size_t)maxstrlen))==NULL) {stat = NC_ENOMEM; goto done;}
        /* copy char* to char[] format */
        if((stat = NCZ_char2fixed((const char**)entry->data,strchunk,cache->chunkcount,maxstrlen))) goto done;
        /* Reclaim the old chunk */
        if((stat = NC_reclaim_data_all(file->controller,tid,entry->data,cache->chunkcount))) goto done;
        entry->data = strchunk; strchunk = NULL;
        entry->size = cache->chunkcount * maxstrlen;
        entry->isfixedstring = 1;
    }

#ifdef ENABLE_NCZARR_FILTERS
    /* Make sure the entry is in the filtered state */
    if(!entry->isfiltered) {
        NC_VAR_INFO_T* var = cache->var;
        void* filtered = NULL; /* pointer to the filtered data */
        size_t flen; /* length of filtered data */
        /* Get the filter chain to apply */
        NClist* filterchain = (NClist*)var->filters;
        if(nclistlength(filterchain) > 0) {
            /* Apply the filter chain to get the filtered data; will reclaim entry->data */
            if((stat = NCZ_applyfilterchain(file,var,filterchain,entry->size,entry->data,&flen,&filtered,ENCODING))) goto done;
            /* Fix up the cache entry;
               note that if filtered differs from entry->data, then entry->data has been freed */
            entry->data = filtered;
            entry->size = flen;
            entry->isfiltered = 1;
        }
    }
#endif

    path = NCZ_chunkpath(entry->key);
    stat = nczmap_write(map,path,entry->size,entry->data);
    nullfree(path); path = NULL;

    switch(stat) {
    case NC_NOERR:
        break;
    case NC_EEMPTY:
    default: goto done;
    }

done:
    nullfree(strchunk);
    nullfree(path);
    return ZUNTRACE(stat);
}
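
/*
 * Illustrative note (assumes NCZ_char2fixed() pads with NULs and
 * truncates to maxstrlen): with maxstrlen = 4, the NC_STRING chunk
 * {"ab", "cdef"} is packed into the fixed-size byte sequence
 *     'a','b','\0','\0','c','d','e','f'
 * before optional filtering and the nczmap_write() call above.
 */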

/**
 * @internal Read the data for one chunk from the file into a cache
 * entry, unfiltering and converting it as needed. If the chunk does
 * not exist, fake it using the fill value.
 *
 * @param cache Pointer to parent cache
 * @param entry cache entry to read into
 *
 * @return ::NC_NOERR No error.
 * @author Dennis Heimbigner
 */
static int
get_chunk(NCZChunkCache* cache, NCZCacheEntry* entry)
{
    int stat = NC_NOERR;
    NCZMAP* map = NULL;
    NC_FILE_INFO_T* file = NULL;
    NCZ_FILE_INFO_T* zfile = NULL;
    NC_TYPE_INFO_T* xtype = NULL;
    char** strchunk = NULL;
    size64_t size = 0;
    int empty = 0;
    char* path = NULL;
    int tid;

    ZTRACE(5,"cache.var=%s entry.key=%s sep=%d",cache->var->hdr.name,entry->key,cache->dimension_separator);

    file = (cache->var->container)->nc4_info;
    zfile = file->format_file_info;
    map = zfile->map;
    assert(map);

    LOG((3, "%s: file: %p", __func__, file));

    /* Collect some info */
    xtype = cache->var->type_info;
    tid = xtype->hdr.id;

    /* get size of the "raw" data on "disk" */
    path = NCZ_chunkpath(entry->key);
    stat = nczmap_len(map,path,&size);
    nullfree(path); path = NULL;
    switch(stat) {
    case NC_NOERR: entry->size = size; break;
    case NC_EEMPTY: empty = 1; stat = NC_NOERR; break;
    default: goto done;
    }

    /* make room in the cache */
    if((stat = constraincache(cache,size))) goto done;

    if(!empty) {
        /* Make sure we have a place to read it */
        if((entry->data = (void*)calloc(1,entry->size)) == NULL)
            {stat = NC_ENOMEM; goto done;}
        /* Read the raw data */
        path = NCZ_chunkpath(entry->key);
        stat = nczmap_read(map,path,0,entry->size,(char*)entry->data);
        nullfree(path); path = NULL;
        switch (stat) {
        case NC_NOERR: break;
        case NC_EEMPTY: empty = 1; stat = NC_NOERR; break;
        default: goto done;
        }
        entry->isfiltered = (int)FILTERED(cache); /* Is the data being read filtered? */
        if(tid == NC_STRING)
            entry->isfixedstring = 1; /* chunk as stored is in char[maxstrlen] format */
    }
    if(empty) {
        /* fake the chunk */
        setmodified(entry,(file->no_write?0:1));
        entry->size = cache->chunksize;
        entry->data = NULL;
        entry->isfixedstring = 0;
        entry->isfiltered = 0;
        /* apply fill value */
        if(cache->fillchunk == NULL)
            {if((stat = NCZ_ensure_fill_chunk(cache))) goto done;}
        if((entry->data = calloc(1,entry->size))==NULL) {stat = NC_ENOMEM; goto done;}
        if((stat = NCZ_copy_data(file,cache->var,cache->fillchunk,cache->chunkcount,ZREADING,entry->data))) goto done;
        stat = NC_NOERR;
    }
#ifdef ENABLE_NCZARR_FILTERS
    /* Make sure the entry is in the unfiltered state */
    if(!empty && entry->isfiltered) {
        NC_VAR_INFO_T* var = cache->var;
        void* unfiltered = NULL; /* pointer to the unfiltered data */
        void* filtered = NULL; /* pointer to the filtered data */
        size_t unflen; /* length of unfiltered data */
        assert(tid != NC_STRING || entry->isfixedstring);
        /* Get the filter chain to apply */
        NClist* filterchain = (NClist*)var->filters;
        if(nclistlength(filterchain) == 0) {stat = NC_EFILTER; goto done;}
        /* Apply the filter chain to get the unfiltered data */
        filtered = entry->data;
        entry->data = NULL;
        if((stat = NCZ_applyfilterchain(file,var,filterchain,entry->size,filtered,&unflen,&unfiltered,!ENCODING))) goto done;
        /* Fix up the cache entry */
        entry->data = unfiltered;
        entry->size = unflen;
        entry->isfiltered = 0;
    }
#endif

    if(tid == NC_STRING && entry->isfixedstring) {
        /* Convert from char[maxstrlen] to char* format */
        int maxstrlen = NCZ_get_maxstrlen((NC_OBJ*)cache->var);
        assert(maxstrlen > 0);
        /* copy char[] to char* format */
        if((strchunk = (char**)malloc(sizeof(char*)*cache->chunkcount))==NULL)
            {stat = NC_ENOMEM; goto done;}
        if((stat = NCZ_fixed2char(entry->data,strchunk,cache->chunkcount,maxstrlen))) goto done;
        /* Reclaim the old chunk */
        nullfree(entry->data);
        entry->data = strchunk; strchunk = NULL;
        entry->size = cache->chunkcount * sizeof(char*);
        entry->isfixedstring = 0;
    }

    /* track new chunk */
    cache->used += entry->size;

done:
    nullfree(strchunk);
    nullfree(path);
    return ZUNTRACE(stat);
}
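
/*
 * Design note: in a Zarr store a chunk that was never written simply has
 * no object under its key, so nczmap_len()/nczmap_read() report NC_EEMPTY.
 * get_chunk() then "fakes" the chunk by filling entry->data from
 * cache->fillchunk, so callers always see a fully populated chunk.
 */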

int
NCZ_buildchunkpath(NCZChunkCache* cache, const size64_t* chunkindices, struct ChunkKey* key)
{
    int stat = NC_NOERR;
    char* chunkname = NULL;
    char* varkey = NULL;

    assert(key != NULL);
    /* Get the chunk object name */
    if((stat = NCZ_buildchunkkey(cache->ndims, chunkindices, cache->dimension_separator, &chunkname))) goto done;
    /* Get the var object key */
    if((stat = NCZ_varkey(cache->var,&varkey))) goto done;
    key->varkey = varkey; varkey = NULL;
    key->chunkkey = chunkname; chunkname = NULL;

done:
    nullfree(chunkname);
    nullfree(varkey);
    return THROW(stat);
}
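
/*
 * Illustrative usage sketch for NCZ_buildchunkpath() (example only; the
 * function name below is hypothetical and the block is not compiled).
 * Suppose cache->var lives at group path "/g/v", the chunk indices are
 * (2,4), and the dimension separator is '.'.
 */
#if 0
static void
NCZ_example_buildchunkpath(NCZChunkCache* cache)
{
    size64_t indices[2] = {2, 4};
    struct ChunkKey key;
    key.varkey = NULL;
    key.chunkkey = NULL;
    if(NCZ_buildchunkpath(cache, indices, &key) == NC_NOERR) {
        /* key.varkey is "/g/v" and key.chunkkey is "2.4";
           NCZ_chunkpath() combines the two parts into the full
           object path handed to the map layer. */
    }
    nullfree(key.varkey);
    nullfree(key.chunkkey);
}
#endif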
|
2022-01-25 06:22:24 +08:00
|
|
|
|
|
|
|
void
|
|
|
|
NCZ_dumpxcacheentry(NCZChunkCache* cache, NCZCacheEntry* e, NCbytes* buf)
|
|
|
|
{
|
|
|
|
char s[8192];
|
|
|
|
char idx[64];
|
|
|
|
int i;
|
|
|
|
|
|
|
|
ncbytescat(buf,"{");
|
|
|
|
snprintf(s,sizeof(s),"modified=%u isfiltered=%u indices=",
|
|
|
|
(unsigned)e->modified,
|
|
|
|
(unsigned)e->isfiltered
|
|
|
|
);
|
|
|
|
ncbytescat(buf,s);
|
|
|
|
for(i=0;i<cache->ndims;i++) {
|
|
|
|
snprintf(idx,sizeof(idx),"%s%llu",(i==0?"":"."),e->indices[i]);
|
|
|
|
ncbytescat(buf,idx);
|
|
|
|
}
|
|
|
|
snprintf(s,sizeof(s),"size=%llu data=%p",
|
|
|
|
e->size,
|
|
|
|
e->data
|
|
|
|
);
|
|
|
|
ncbytescat(buf,s);
|
|
|
|
ncbytescat(buf,"}");
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
NCZ_printxcache(NCZChunkCache* cache)
|
|
|
|
{
|
|
|
|
static char xs[20000];
|
|
|
|
NCbytes* buf = ncbytesnew();
|
|
|
|
char s[8192];
|
2023-11-27 19:36:03 +08:00
|
|
|
size_t i;
|
2022-01-25 06:22:24 +08:00
|
|
|
|
|
|
|
ncbytescat(buf,"NCZChunkCache:\n");
|
|
|
|
snprintf(s,sizeof(s),"\tvar=%s\n\tndims=%u\n\tchunksize=%u\n\tchunkcount=%u\n\tfillchunk=%p\n",
|
|
|
|
cache->var->hdr.name,
|
|
|
|
(unsigned)cache->ndims,
|
|
|
|
(unsigned)cache->chunksize,
|
|
|
|
(unsigned)cache->chunkcount,
|
|
|
|
cache->fillchunk
|
|
|
|
);
|
|
|
|
ncbytescat(buf,s);
|
|
|
|
|
|
|
|
snprintf(s,sizeof(s),"\tmaxentries=%u\n\tmaxsize=%u\n\tused=%u\n\tdimsep='%c'\n",
|
2023-08-11 06:57:57 +08:00
|
|
|
(unsigned)cache->params.nelems,
|
|
|
|
(unsigned)cache->params.size,
|
2022-01-25 06:22:24 +08:00
|
|
|
(unsigned)cache->used,
|
|
|
|
cache->dimension_separator
|
|
|
|
);
|
|
|
|
ncbytescat(buf,s);
|
|
|
|
|
|
|
|
snprintf(s,sizeof(s),"\tmru: (%u)\n",(unsigned)nclistlength(cache->mru));
|
|
|
|
ncbytescat(buf,s);
|
|
|
|
if(nclistlength(cache->mru)==0)
|
|
|
|
ncbytescat(buf,"\t\t<empty>\n");
|
|
|
|
for(i=0;i<nclistlength(cache->mru);i++) {
|
|
|
|
NCZCacheEntry* e = (NCZCacheEntry*)nclistget(cache->mru,i);
|
2023-11-27 19:36:03 +08:00
|
|
|
snprintf(s,sizeof(s),"\t\t[%zu] ", i);
|
2022-01-25 06:22:24 +08:00
|
|
|
ncbytescat(buf,s);
|
|
|
|
if(e == NULL)
|
|
|
|
ncbytescat(buf,"<null>");
|
|
|
|
else
|
|
|
|
NCZ_dumpxcacheentry(cache, e, buf);
|
|
|
|
ncbytescat(buf,"\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
xs[0] = '\0';
|
|
|
|
strlcat(xs,ncbytescontents(buf),sizeof(xs));
|
|
|
|
ncbytesfree(buf);
|
|
|
|
fprintf(stderr,"%s\n",xs);
|
|
|
|
}
|
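
/*
 * Example of the debug dump produced by NCZ_printxcache() (values
 * invented; format follows the snprintf calls above):
 *
 * NCZChunkCache:
 *     var=v
 *     ndims=2
 *     chunksize=4000
 *     chunkcount=1000
 *     fillchunk=0x55e0...
 *     maxentries=4096
 *     maxsize=4194304
 *     used=4000
 *     dimsep='.'
 *     mru: (1)
 *         [0] {modified=1 isfiltered=0 indices=2.4 size=4000 data=0x55e0...}
 */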