netcdf-c/ncgen/bindata.c
Dennis Heimbigner 231ae96c4b Add support for Zarr string type to NCZarr
* re: https://github.com/Unidata/netcdf-c/pull/2278
* re: https://github.com/Unidata/netcdf-c/issues/2485
* re: https://github.com/Unidata/netcdf-c/issues/2474

This PR subsumes PR https://github.com/Unidata/netcdf-c/pull/2278.
It is actually a bit of an omnibus covering several issues.

## PR https://github.com/Unidata/netcdf-c/pull/2278
Add support for the Zarr string type.
Zarr strings are restricted currently to be of fixed size.
The primary issue to be addressed is to provide a way for the user to
specify the size of the fixed length strings. This is handled by providing
the following new special attributes:
1. **_nczarr_default_maxstrlen** —
This is an attribute of the root group. It specifies the default
maximum string length for string types. If not specified, then
it has the value of 64 characters.
2. **_nczarr_maxstrlen** —
This is a per-variable attribute. It specifies the maximum
string length for the string type associated with the variable.
If not specified, then it is assigned the value of
**_nczarr_default_maxstrlen**.

This PR also requires some hacking to handle the existing netcdf-c NC_CHAR
type, which does not exist in zarr. The goal was to choose numpy types for
both the netcdf-c NC_STRING type and the netcdf-c NC_CHAR type such that
if a pure zarr implementation read them, it would still work and an
NC_CHAR type would be handled by zarr as a string of length 1.

For writing variables and NCZarr attributes, the type mapping is as follows:
* "|S1" for NC_CHAR.
* ">S1" for NC_STRING && MAXSTRLEN==1
* ">Sn" for NC_STRING && MAXSTRLEN==n

Note that it is a bit of a hack to use endianness, but it should be ok since for
string/char, the endianness has no meaning.

For reading attributes with pure zarr (i.e. with no nczarr
attribute types defined), they will always be interpreted as of
type NC_CHAR.

## Issue: https://github.com/Unidata/netcdf-c/issues/2474
This PR partly fixes this issue because it provides more
comprehensive support for Zarr attributes that are JSON valued expressions.
This PR still does not address the problem in that issue where the
_ARRAY_DIMENSION attribute is incorrectly set. That can only be
fixed by the creator of the datasets.

## Issue: https://github.com/Unidata/netcdf-c/issues/2485
This PR also fixes the scalar failure shown in this issue.
It generally cleans up scalar handling.
It also adds a note to the documentation describing that
NCZarr supports scalars while Zarr does not and also how
scalar interoperability is achieved.

## Misc. Other Changes
1. Convert the nczarr special attributes and keys to be all lower case. So "_NCZARR_ATTR" is now "_nczarr_attr". Support backward compatibility for the upper case names.
2. Cleanup my too-clever-by-half handling of scalars in libnczarr.
2022-08-27 20:21:13 -06:00

600 lines
18 KiB
C

/*********************************************************************
* Copyright 2018, UCAR/Unidata
* See netcdf/COPYRIGHT file for copying and redistribution conditions.
*********************************************************************/
#include "includes.h"
#include "nclog.h"
#ifdef ENABLE_BINARY
/* Forward */
/* alignto() is defined further down in this file; it is declared here
   because bin_list() and bin_listend() call it before its definition. */
static void alignto(int alignment, Bytebuffer* buf, ptrdiff_t base);
/* Counter used to hand out ids: bin_listbegin() and bin_vlenstring()
   pre-increment it and store the new value through their uidp argument. */
static int bin_uid = 0;
/*
 * Generator callback for character constants.
 *
 * The single variadic argument is a Bytebuffer* holding the characters.
 * bbNull() is applied to that buffer (presumably to make sure it is
 * NUL terminated -- confirm in bytebuffer.h) and its content is then
 * concatenated onto buf.  generator and sym are not used.
 *
 * Always returns 1.
 */
static int
bin_charconstant(Generator* generator, Symbol* sym, Bytebuffer* buf, ...)
{
    va_list args;
    Bytebuffer* source = NULL;

    /* Pull the one extra argument: the buffer of characters */
    va_start(args,buf);
    source = va_arg(args, Bytebuffer*);
    va_end(args);

    /* Just transfer the character buffer to the output buffer */
    bbNull(source);
    bbCatbuf(buf,source);
    return 1;
}
/*
 * Generator callback: append the in-memory binary form of one constant
 * to buf.
 *
 * Unless the constant is an enum constant (NC_ECONST), buf is first
 * passed through alignbuffer() for the constant.
 *
 * What is appended, by con->nctype:
 *  - NC_OPAQUE: the bytes produced by makebytestring() from the opaque
 *    string (assumed to be already normalized).
 *  - Fixed-size numeric/char types: the raw bytes of the value.
 *  - NC_STRING / NC_NIL: a char* value.  It is NULL when the length is 0
 *    and there is no string data; otherwise it points at a freshly
 *    allocated, NUL-terminated copy of the string.  That copy is not
 *    freed here; the pointer stored in buf is the only reference to it.
 *  - Anything else: PANIC1 is invoked.
 *
 * generator and sym are not used.  Always returns 1.
 */
static int
bin_constant(Generator* generator, Symbol* sym, NCConstant* con, Bytebuffer* buf,...)
{
    if(con->nctype != NC_ECONST) {
        alignbuffer(con,buf);
    }
    switch (con->nctype) {
    case NC_OPAQUE: {
        unsigned char* bytes = NULL;
        size_t len;
        /* Assume the opaque string has been normalized */
        bytes=makebytestring(con->value.opaquev.stringv,&len);
        bbAppendn(buf,(void*)bytes,len);
        efree(bytes);
    } break;
    case NC_CHAR:
        bbAppendn(buf,&con->value.charv,sizeof(con->value.charv));
        break;
    case NC_BYTE:
        bbAppendn(buf,(void*)&con->value.int8v,sizeof(con->value.int8v));
        break;
    case NC_SHORT:
        bbAppendn(buf,(void*)&con->value.int16v,sizeof(con->value.int16v));
        break;
    case NC_INT:
        bbAppendn(buf,(void*)&con->value.int32v,sizeof(con->value.int32v));
        break;
    case NC_FLOAT:
        bbAppendn(buf,(void*)&con->value.floatv,sizeof(con->value.floatv));
        break;
    case NC_DOUBLE:
        bbAppendn(buf,(void*)&con->value.doublev,sizeof(con->value.doublev));
        break;
    case NC_UBYTE:
        bbAppendn(buf,(void*)&con->value.uint8v,sizeof(con->value.uint8v));
        break;
    case NC_USHORT:
        bbAppendn(buf,(void*)&con->value.uint16v,sizeof(con->value.uint16v));
        break;
    case NC_UINT:
        bbAppendn(buf,(void*)&con->value.uint32v,sizeof(con->value.uint32v));
        break;
    case NC_INT64: {
        /* Go through a union so exactly 8 bytes are appended */
        union SI64 { char ch[8]; long long i64;} si64;
        si64.i64 = con->value.int64v;
        bbAppendn(buf,(void*)si64.ch,sizeof(si64.ch));
    } break;
    case NC_UINT64: {
        union SU64 { char ch[8]; unsigned long long i64;} su64;
        su64.i64 = con->value.uint64v;
        bbAppendn(buf,(void*)su64.ch,sizeof(su64.ch));
    } break;
    case NC_NIL:
    case NC_STRING: {
        /* Keep the length as size_t: it feeds the allocation size and
           memcpy(), and must not be narrowed to a signed int first. */
        size_t len = (size_t)con->value.stringv.len;
        if(len == 0 && con->value.stringv.stringv == NULL) {
            char* nil = NULL;
            bbAppendn(buf,(void*)&nil,sizeof(nil));
        } else {
            char* ptr = (char*)ecalloc(len+1);
            memcpy(ptr,con->value.stringv.stringv,len);
            ptr[len] = '\0';
            /* Store the pointer itself, not the characters */
            bbAppendn(buf,(void*)&ptr,sizeof(ptr));
            ptr = NULL; /* the copy is now referenced only from buf */
        }
    } break;
    default: PANIC1("bin_constant: unexpected type: %d",con->nctype);
    }
    return 1;
}
/*
 * Generator callback invoked at the start of a list.
 *
 * If uidp is non-NULL, a new id (the incremented bin_uid) is stored
 * through it.  For a compound list, liststate is treated as an int* and
 * receives the current length of buf, i.e. the offset at which the
 * compound instance starts.  The remaining arguments are not used.
 *
 * Always returns 1.
 */
static int
bin_listbegin(Generator* generator, Symbol* tsym, void* liststate, ListClass lc, size_t size, Bytebuffer* buf, int* uidp, ...)
{
    if(uidp != NULL) {
        bin_uid += 1;
        *uidp = bin_uid;
    }
    if(lc == LISTCOMPOUND) {
        int* startp = (int*)liststate;
        /* Remember where this compound instance begins */
        *startp = bbLength(buf);
    }
    return 1;
}
/*
 * Generator callback invoked for each element of a list.
 *
 * Only compound lists need work: buf is padded so that its length,
 * measured from the offset saved in liststate by bin_listbegin(), is
 * aligned to tsym->typ.alignment.
 *
 * Always returns 1.
 */
static int
bin_list(Generator* generator, Symbol* tsym, void* liststate, ListClass lc, int uid, size_t count, Bytebuffer* buf, ...)
{
    int start;

    if(lc != LISTCOMPOUND)
        return 1;
    start = *((int*)liststate);
    /* Pad relative to the start of the compound instance */
    alignto(tsym->typ.alignment,buf,start);
    return 1;
}
/*
 * Generator callback invoked at the end of a list.
 *
 * For a compound list, the whole instance is padded out: buf is padded
 * so that its length, measured from the offset saved in liststate by
 * bin_listbegin(), is aligned to tsym->typ.cmpdalign.
 *
 * Always returns 1.
 */
static int
bin_listend(Generator* generator, Symbol* tsym, void* liststate, ListClass lc, int uid, size_t count, Bytebuffer* buf, ...)
{
    int start;

    if(lc != LISTCOMPOUND)
        return 1;
    start = *((int*)liststate);
    /* Trailing padding for the complete compound instance */
    alignto(tsym->typ.cmpdalign,buf,start);
    return 1;
}
/*
 * Generator callback for a vlen declaration.
 *
 * The single variadic argument is a Bytebuffer* holding the vlen
 * element data.  An nc_vlen_t is built whose length is count and whose
 * data pointer is the content extracted from that buffer with
 * bbExtract(); the nc_vlen_t is then appended to buf as raw bytes.
 *
 * Always returns 1.
 */
static int
bin_vlendecl(Generator* generator, Symbol* tsym, Bytebuffer* buf, int uid, size_t count,...)
{
    va_list args;
    Bytebuffer* elements = NULL;
    nc_vlen_t vlen;

    va_start(args,count);
    elements = va_arg(args, Bytebuffer*);
    va_end(args);

    vlen.len = count;
    vlen.p = bbExtract(elements);
    bbAppendn(buf,(char*)&vlen,sizeof(vlen));
    return 1;
}
/*
 * Generator callback for a vlen given as a string.
 *
 * If uidp is non-NULL, a new id (the incremented bin_uid) is stored
 * through it.  The single variadic argument is a Bytebuffer*; an
 * nc_vlen_t is built whose length is that buffer's length and whose
 * data pointer is a bbDup() copy of it, and the nc_vlen_t is appended
 * to codebuf as raw bytes.  sizep is only used as the va_start anchor.
 *
 * Always returns 1.
 */
static int
bin_vlenstring(Generator* generator, Symbol* sym, Bytebuffer* codebuf, int* uidp, size_t* sizep,...)
{
    va_list args;
    Bytebuffer* text = NULL;
    nc_vlen_t vlen;

    if(uidp != NULL) {
        bin_uid += 1;
        *uidp = bin_uid;
    }

    va_start(args,sizep);
    text = va_arg(args, Bytebuffer*);
    va_end(args);

    vlen.len = bbLength(text);
    vlen.p = bbDup(text);
    bbAppendn(codebuf,(char*)&vlen,sizeof(vlen));
    return 1;
}
/* Block of zero bytes used as the source of padding by alignto() and
   write_alignment(): both append the first `pad` bytes of this array,
   so a single pad is assumed never to exceed sizeof(zeros). */
static const char zeros[] =
"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
/*
 * Pad buf with zero bytes so that its length, measured relative to
 * base, satisfies alignment.  The number of pad bytes comes from
 * getpadding(); nothing is appended when that is zero or negative.
 */
static void
alignto(int alignment, Bytebuffer* buf, ptrdiff_t base)
{
    ptrdiff_t relative = bbLength(buf);
    int npad;

    /* Alignment is computed with respect to base, not the buffer start */
    relative -= base;
    npad = getpadding(relative,alignment);
    if(npad <= 0)
        return;
    bbAppendn(buf,(void*)zeros,npad);
}
/* Define the single static bin data generator */
/* The initializer is positional: the entries must stay in the order of
   the members of Generator (declared elsewhere).  The first entry is
   NULL; the rest are the callbacks defined above in this file. */
static Generator bin_generator_singleton = {
NULL,
bin_charconstant,
bin_constant,
bin_listbegin,
bin_list,
bin_listend,
bin_vlendecl,
bin_vlenstring
};
/* Public handle through which other files reach the binary generator */
Generator* bin_generator = &bin_generator_singleton;
/**************************************************/
/* Forward: needed because binary_generate_data() calls it before its definition */
static int bin_generate_data_r(NCConstant* instance, Symbol* tsym, Datalist* fillvalue, Bytebuffer* databuf);
/*
 * Pad buf with zero bytes so that its total length satisfies alignment.
 * The number of pad bytes comes from getpadding(); nothing is appended
 * when that is zero or negative.
 */
static void
write_alignment(int alignment, Bytebuffer* buf)
{
    ptrdiff_t current = bbLength(buf);
    int npad = getpadding(current,alignment);

    if(npad <= 0)
        return;
    bbAppendn(buf,(void*)zeros,npad);
}
/**
Alternate binary data generator.

databuf is cleared first, then the binary form of every constant in
data is appended to it in order.  Generation stops at the first
instance that fails.

Inputs:
    Datalist* data - the constants from which to generate the binary data
    Symbol* tsym - the top-level type for which instances
                   are to be generated
    Datalist* fillvalue - the fillvalue for the toplevel type
    Bytebuffer* databuf - the buffer into which instances are to be stored
Returns:
    NC_NOERR on success, otherwise the first non-zero status reported
    for an instance.
*/
int
binary_generate_data(Datalist* data, Symbol* tsym, Datalist* fillvalue, Bytebuffer* databuf)
{
    int stat = NC_NOERR;
    size_t ninstances = data->length;
    size_t index = 0;

    bbClear(databuf);
    while(index < ninstances) {
        NCConstant* instance = datalistith(data,index);
        stat = bin_generate_data_r(instance, tsym, fillvalue, databuf);
        if(stat)
            break; /* give up on the first failure */
        index++;
    }
    return stat;
}
/*
 * Recursive helper that does the bulk of the work: append the binary
 * form of a single instance of type tsym to databuf.
 *
 * Inputs:
 *   instance  - the constant to translate; if it is NC_FILLVALUE it is
 *               replaced by the (single) fill constant, taken from
 *               fillvalue when given, otherwise from getfiller(tsym)
 *   tsym      - the type of the instance
 *   fillvalue - optional fill value list for tsym (may be NULL)
 *   databuf   - the buffer being appended to
 *
 * By tsym->subclass:
 *   NC_PRIM     - the instance is converted to the primitive type with
 *                 convert1() and the raw value is appended.  For
 *                 NC_STRING a char* to a freshly allocated copy is
 *                 appended; that copy is not freed here.
 *   NC_ENUM     - handled as the enum's base type.
 *   NC_OPAQUE   - the bytes produced by makebytestring() are appended.
 *   NC_VLEN     - the sublist is generated into its own buffer and an
 *                 nc_vlen_t {count, data pointer} is appended.
 *   NC_COMPOUND - each field is aligned and generated in turn.
 *
 * Returns NC_NOERR on success; NC_EBADTYPE when the instance does not
 * match an opaque/vlen type; NC_ENOMEM when makebytestring() returns
 * NULL; NC_EINTERNAL for an unexpected type.
 */
static int
bin_generate_data_r(NCConstant* instance, Symbol* tsym, Datalist* fillvalue, Bytebuffer* databuf)
{
    int stat = NC_NOERR;

    if(instance->nctype == NC_FILLVALUE) {
        /* replace with fillvalue for the type */
        Datalist* filllist = (fillvalue == NULL ? getfiller(tsym) : fillvalue);
        ASSERT(datalistlen(filllist)==1)
        instance = datalistith(filllist,0);
    }

    switch (tsym->subclass) {
    case NC_PRIM: {
        switch (tsym->nc_id) {
        case NC_CHAR: {
            char* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_CHAR;
            convert1(instance,tmp);
            p = &tmp->value.charv;
            bbAppendn(databuf,p,sizeof(char));
            reclaimconstant(tmp);
        } break;
        case NC_BYTE: {
            signed char* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_BYTE;
            convert1(instance,tmp);
            p = &tmp->value.int8v;
            bbAppendn(databuf,p,sizeof(signed char));
            reclaimconstant(tmp);
        } break;
        case NC_UBYTE: {
            unsigned char* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_UBYTE;
            convert1(instance,tmp);
            p = &tmp->value.uint8v;
            bbAppendn(databuf,p,sizeof(unsigned char));
            reclaimconstant(tmp);
        } break;
        case NC_SHORT: {
            short* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_SHORT;
            convert1(instance,tmp);
            p = &tmp->value.int16v;
            bbAppendn(databuf,p,sizeof(short));
            reclaimconstant(tmp);
        } break;
        case NC_USHORT: {
            unsigned short* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_USHORT;
            convert1(instance,tmp);
            p = &tmp->value.uint16v;
            bbAppendn(databuf,p,sizeof(unsigned short));
            reclaimconstant(tmp);
        } break;
        case NC_INT: {
            int* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_INT;
            convert1(instance,tmp);
            p = &tmp->value.int32v;
            bbAppendn(databuf,p,sizeof(int));
            reclaimconstant(tmp);
        } break;
        case NC_UINT: {
            unsigned int* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_UINT;
            convert1(instance,tmp);
            p = &tmp->value.uint32v;
            bbAppendn(databuf,p,sizeof(unsigned int));
            reclaimconstant(tmp);
        } break;
        case NC_INT64: {
            long long* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_INT64;
            convert1(instance,tmp);
            p = &tmp->value.int64v;
            bbAppendn(databuf,p,sizeof(long long));
            reclaimconstant(tmp);
        } break;
        case NC_UINT64: {
            unsigned long long* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_UINT64;
            convert1(instance,tmp);
            p = &tmp->value.uint64v;
            bbAppendn(databuf,p,sizeof(unsigned long long));
            reclaimconstant(tmp);
        } break;
        case NC_FLOAT: {
            float* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_FLOAT;
            convert1(instance,tmp);
            p = &tmp->value.floatv;
            bbAppendn(databuf,p,sizeof(float));
            reclaimconstant(tmp);
        } break;
        case NC_DOUBLE: {
            double* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_DOUBLE;
            convert1(instance,tmp);
            p = &tmp->value.doublev;
            bbAppendn(databuf,p,sizeof(double));
            reclaimconstant(tmp);
        } break;
        case NC_STRING: {
            char* p = NULL;
            NCConstant* tmp = nullconst();
            tmp->nctype = NC_STRING;
            convert1(instance,tmp);
            /* Make a private NUL-terminated copy, because tmp is
               reclaimed below; the buffer stores the pointer itself */
            p = emalloc(tmp->value.stringv.len+1);
            memcpy(p,tmp->value.stringv.stringv,tmp->value.stringv.len);
            p[tmp->value.stringv.len] = '\0';
            bbAppendn(databuf,&p,sizeof(char*));
            reclaimconstant(tmp);
        } break;
        default: stat = NC_EINTERNAL; goto done; /* Should never happen */
        } break; /*switch*/
    } break; /*NC_PRIM*/

    case NC_ENUM: {
        Symbol* basetype = tsym->typ.basetype;
        /* Pretend the enum is an instance of its base type */
        stat = bin_generate_data_r(instance,basetype,fillvalue,databuf);
    } break;

    case NC_OPAQUE: {
        unsigned char* bytes = NULL;
        size_t len = 0;
        if(instance->nctype != NC_OPAQUE)
            {stat = NC_EBADTYPE; goto done;}
        /* Assume the opaque string has been normalized */
        bytes=makebytestring(instance->value.opaquev.stringv,&len);
        if(bytes == NULL) {stat = NC_ENOMEM; goto done;}
        bbAppendn(databuf,(void*)bytes,len);
        free(bytes);
    } break;

    case NC_VLEN: {
        Datalist* sublist = NULL;
        Bytebuffer* vlendata = NULL;
        nc_vlen_t p;
        if(instance->nctype != NC_COMPOUND) {
            nclog(NCLOGERR,"Translating vlen: expected sublist");
            stat = NC_EBADTYPE; goto done;
        }
        sublist = instance->value.compoundv;
        vlendata = bbNew();
        stat = binary_generate_data(sublist,tsym->typ.basetype,NULL,vlendata);
        if(stat) {
            /* Do not leak the scratch buffer on failure */
            bbFree(vlendata);
            goto done;
        }
        p.len = datalistlen(sublist);
        /* Take the content out of the scratch buffer (as bin_vlendecl
           does) so the buffer object itself can be released while the
           element data stays referenced from the nc_vlen_t. */
        p.p = bbExtract(vlendata);
        bbFree(vlendata);
        bbAppendn(databuf,(char*)&p,sizeof(nc_vlen_t));
    } break;

    case NC_COMPOUND: { /* The really hard one */
        size_t nfields, fid, i;
        Datalist* cmpd = instance->value.compoundv;
        write_alignment(tsym->typ.cmpdalign,databuf);
        /* Get info about each field in turn and build it*/
        nfields = listlength(tsym->subnodes);
        for(fid=0;fid<nfields;fid++) {
            Symbol* field = listget(tsym->subnodes,fid);
            NCConstant* fieldinstance = datalistith(cmpd,fid);
            int ndims = field->typ.dimset.ndims;
            size_t arraycount = 1;
            /* compute the total number of elements in the field array.
               For a scalar field (ndims == 0) the loop does not run and
               arraycount stays 1; dimsyms[0] must not be read in that
               case, since nothing guarantees it is set for a scalar. */
            for(i=0;i<(size_t)ndims;i++) arraycount *= field->typ.dimset.dimsyms[i]->dim.declsize;
            write_alignment(field->typ.alignment,databuf);
            /* Write the instances */
            for(i=0;i<arraycount;i++) {
                if((stat = bin_generate_data_r(fieldinstance, field->typ.basetype, NULL, databuf))) goto done;
            }
        }
    } break;

    default: stat = NC_EINTERNAL; goto done; /* Should never happen */
    }
done:
    return stat;
}
#if 0
/**
Internal equivalent of ncaux_reclaim_data.
*/
/* It is helpful to have a structure that contains memory and an offset */
/* memory is the base address of the block being walked; offset is the
   byte position within it of the next instance to reclaim.  The
   bin_reclaim_* functions below advance offset as they go. */
typedef struct Reclaim {char* memory; ptrdiff_t offset;} Reclaim;
/* Forward declarations for the mutually recursive reclaim functions */
static int bin_reclaim_datar(Symbol* tsym, Reclaim* reclaim);
#ifdef USE_NETCDF4
/* User-defined types and alignment handling are only built with netCDF-4 */
static ptrdiff_t read_alignment(ptrdiff_t offset, unsigned long alignment);
static int bin_reclaim_usertype(Symbol* tsym, Reclaim* reclaim);
static int bin_reclaim_compound(Symbol* tsym, Reclaim* reclaim);
static int bin_reclaim_vlen(Symbol* tsym, Reclaim* reclaim);
static int bin_reclaim_enum(Symbol* tsym, Reclaim* reclaim);
static int bin_reclaim_opaque(Symbol* tsym, Reclaim* reclaim);
#endif
/*
 * Reclaim the dynamically allocated parts of count consecutive
 * instances of type tsym stored at memory.
 *
 * Returns NC_EINVAL when tsym is NULL, or when memory is NULL although
 * count is positive.  A NULL memory with count 0, or a count of 0, is
 * accepted and nothing is done.  Otherwise each instance is reclaimed
 * in turn and the first non-zero status stops the walk and is returned.
 */
int
binary_reclaim_data(Symbol* tsym, void* memory, size_t count)
{
    int stat = NC_NOERR;
    size_t remaining;
    Reclaim cursor;

    if(tsym == NULL)
        return NC_EINVAL;
    if(memory == NULL)
        return (count > 0 ? NC_EINVAL : NC_NOERR);
    if(count == 0)
        return NC_NOERR; /* ok, do nothing */

    cursor.memory = memory;
    cursor.offset = 0;
    for(remaining=count;remaining>0;remaining--) {
        /* reclaim one instance; the cursor is advanced by the callee */
        stat = bin_reclaim_datar(tsym,&cursor);
        if(stat)
            break;
    }
    return stat;
}
/* Recursive type walker: reclaim a single instance */
/* The instance is located at reclaimer->memory + reclaimer->offset.
   Fixed-size atomic types own no memory, so they are simply skipped.
   With USE_NETCDF4, a string has its char* freed and any other type is
   handed to bin_reclaim_usertype(); without it, any other type yields
   NC_ENOTNC4.  Returns NC_NOERR or the status of the nested call.
   NOTE(review): the switch is on tsym->subclass although the case
   labels are primitive type ids -- confirm this is the intended field. */
static int
bin_reclaim_datar(Symbol* tsym, Reclaim* reclaimer)
{
int stat = NC_NOERR;
switch (tsym->subclass) {
case NC_CHAR: case NC_BYTE: case NC_UBYTE:
case NC_SHORT: case NC_USHORT:
case NC_INT: case NC_UINT: case NC_FLOAT:
case NC_INT64: case NC_UINT64: case NC_DOUBLE:
/* Nothing to free: just step over the value */
reclaimer->offset += tsym->typ.size;
break;
#ifdef USE_NETCDF4
case NC_STRING: {
/* The stored value is a char*; view the current position as one */
char** sp = (char**)(reclaimer->memory+reclaimer->offset);
/* Need to reclaim string */
if(*sp != NULL) efree(*sp);
reclaimer->offset += tsym->typ.size;
} break;
default:
/* reclaim a user type */
stat = bin_reclaim_usertype(tsym,reclaimer);
#else
default:
stat = NC_ENOTNC4;
#endif
/* This break closes whichever default branch was compiled in */
break;
}
return stat;
}
#ifdef USE_NETCDF4
/*
 * Dispatch the reclaiming of one user-defined type instance according
 * to tsym->subclass (opaque, enum, vlen or compound).
 * Returns the status of the chosen handler, or NC_EINVAL when the
 * subclass is none of those four.
 */
static int
bin_reclaim_usertype(Symbol* tsym, Reclaim* reclaimer)
{
    /* Get info about the xtype */
    switch (tsym->subclass) {
    case NC_OPAQUE:
        return bin_reclaim_opaque(tsym,reclaimer);
    case NC_ENUM:
        return bin_reclaim_enum(tsym,reclaimer);
    case NC_VLEN:
        return bin_reclaim_vlen(tsym,reclaimer);
    case NC_COMPOUND:
        return bin_reclaim_compound(tsym,reclaimer);
    default:
        break;
    }
    return NC_EINVAL;
}
/*
 * Round offset up to the next multiple of alignment.
 *
 * Returns offset unchanged when it is already a multiple of alignment.
 * An alignment of 0 is treated as "no alignment requirement" and also
 * returns offset unchanged, instead of computing a remainder by zero
 * (which is undefined behaviour).
 */
static ptrdiff_t
read_alignment(ptrdiff_t offset, unsigned long alignment)
{
    size_t delta;

    if(alignment == 0)
        return offset;
    /* How far offset is past the previous aligned position */
    delta = (offset % alignment);
    if(delta == 0)
        return offset;
    return offset + (alignment - delta);
}
/*
 * Reclaim one vlen instance located at reclaimer->memory +
 * reclaimer->offset.
 *
 * If the vlen has data, each of its vl->len elements (of type
 * tsym->typ.basetype) is reclaimed in turn and the data block is then
 * freed.  On success the outer offset is advanced past the nc_vlen_t
 * whether or not the vlen had data, so that the following instance is
 * found at the right place.
 *
 * Returns NC_NOERR, or the first non-zero status from reclaiming an
 * element (in which case the data block is not freed and the outer
 * offset is not advanced).
 */
static int
bin_reclaim_vlen(Symbol* tsym, Reclaim* reclaimer)
{
    int stat = NC_NOERR;
    size_t i;
    Symbol* basetype = tsym->typ.basetype;
    nc_vlen_t* vl = (nc_vlen_t*)(reclaimer->memory+reclaimer->offset);

    /* Free up each entry in the vlen list */
    if(vl->p != NULL) {
        Reclaim vreclaimer;
        vreclaimer.memory = vl->p;
        vreclaimer.offset = 0;
        for(i=0;i<vl->len;i++) {
            vreclaimer.offset = read_alignment(vreclaimer.offset,basetype->typ.alignment);
            /* bin_reclaim_datar() itself advances vreclaimer.offset past
               the element, so it must not be advanced again here */
            if((stat = bin_reclaim_datar(basetype,&vreclaimer))) goto done;
        }
        efree(vl->p);
    }
    /* Step over the nc_vlen_t itself, even when it carried no data */
    reclaimer->offset += tsym->typ.size;
done:
    return stat;
}
/*
 * Reclaim one enum instance by treating it as an instance of the
 * enum's base type.  Returns the status of that nested reclaim.
 */
static int
bin_reclaim_enum(Symbol* tsym, Reclaim* reclaimer)
{
    Symbol* underlying = tsym->typ.basetype;
    int stat = bin_reclaim_datar(underlying,reclaimer);

    return stat;
}
/*
 * Reclaim one opaque instance.  An opaque value is basically a fixed
 * size sequence of bytes, so nothing is freed; the offset is simply
 * moved past it.  Always returns NC_NOERR.
 */
static int
bin_reclaim_opaque(Symbol* tsym, Reclaim* reclaimer)
{
    reclaimer->offset = reclaimer->offset + tsym->typ.size;
    return NC_NOERR;
}
/*
 * Reclaim one compound instance.
 *
 * The offset is first aligned to the compound's alignment.  Each field
 * is then aligned to its own alignment and every element of the field
 * (the product of its declared dimension sizes, or 1 for a scalar
 * field) is reclaimed as an instance of the field's base type.  On
 * success the offset is set to the aligned start of the instance plus
 * the size of the compound type.
 *
 * Returns NC_NOERR, or the first non-zero status from reclaiming a
 * field element.
 */
static int
bin_reclaim_compound(Symbol* tsym, Reclaim* reclaimer)
{
    int stat = NC_NOERR;
    size_t nfields;
    size_t fid, i;
    ptrdiff_t saveoffset;

    reclaimer->offset = read_alignment(reclaimer->offset,tsym->typ.cmpdalign);
    saveoffset = reclaimer->offset;

    /* Get info about each field in turn and reclaim it */
    nfields = listlength(tsym->subnodes);
    for(fid=0;fid<nfields;fid++) {
        Symbol* field = listget(tsym->subnodes,fid);
        int ndims = field->typ.dimset.ndims;
        /* The element count is per field: start again from 1 each time
           so one field's dimensions do not multiply into the next */
        size_t arraycount = 1;
        /* compute the total number of elements in the field array */
        for(i=0;i<(size_t)ndims;i++) arraycount *= field->typ.dimset.dimsyms[i]->dim.declsize;
        reclaimer->offset = read_alignment(reclaimer->offset,field->typ.alignment);
        for(i=0;i<arraycount;i++) {
            if((stat = bin_reclaim_datar(field->typ.basetype, reclaimer))) goto done;
        }
    }
    /* Position after the whole instance, including trailing padding */
    reclaimer->offset = saveoffset;
    reclaimer->offset += tsym->typ.size;
done:
    return stat;
}
#endif /*USE_NETCDF4*/
#endif /*0*/
#endif /*ENABLE_BINARY*/