fixed chunking bug: default chunks must always be under 4GB in size

This commit is contained in:
Ed Hartnett 2011-02-02 14:09:15 +00:00
parent dfd3c50b60
commit 668ed2e0a5
6 changed files with 317 additions and 136 deletions

View File

@ -3,6 +3,11 @@ Entries are in reverse chronological order (most recent first).
VERSION COMMENTS
------- --------
4.1.2-beta3 2011-02-11
Fixed some bugs and some performance problems with
default chunksizes.
4.1.2-beta2 2011-01-11
Add "-c" option to nccopy to specify chunk sizes used

View File

@ -51,8 +51,8 @@ tst_varms tst_unlim_vars tst_converts tst_converts2 tst_grps tst_grps2 \
tst_compounds tst_compounds2 tst_compounds3 tst_opaques tst_strings \
tst_strings2 tst_interops tst_interops4 tst_interops5 tst_interops6 \
tst_enums tst_coords tst_coords2 tst_coords3 tst_vars3 tst_vars4 \
tst_chunks tst_utf8 tst_fills tst_fills2 tst_fillbug tst_xplatform \
tst_xplatform2 tst_h_atts2 tst_endian_fill tst_atts
tst_chunks tst_chunks2 tst_utf8 tst_fills tst_fills2 tst_fillbug \
tst_xplatform tst_xplatform2 tst_h_atts2 tst_endian_fill tst_atts
check_PROGRAMS = $(NC4_TESTS)
if LARGE_FILE_TESTS

View File

@ -206,13 +206,45 @@ nc_get_var_chunk_cache_ints(int ncid, int varid, int *sizep,
return NC_NOERR;
}
/* Check a set of chunksizes to see if they describe a chunk that is
 * too big.
 *
 * The chunk size in bytes is the in-memory length of one element
 * (sizeof(hvl_t) for types of class NC_VLEN, otherwise the length
 * reported by nc4_get_typelen_mem()) multiplied by every entry of
 * chunksizes.
 *
 * grp        - group holding the variable; used to reach the file info.
 * var        - variable whose type and number of dimensions are used.
 * chunksizes - array of var->ndims chunk lengths to validate.
 *
 * Returns NC_NOERR if the chunk is acceptable, NC_EINVAL if any chunk
 * length is less than 1, NC_EBADCHUNK if the chunk would be larger
 * than NC_MAX_UINT bytes, or the error code of a failed type lookup.
 */
static int
check_chunksizes(NC_GRP_INFO_T *grp, NC_VAR_INFO_T *var, size_t *chunksizes)
{
   NC_TYPE_INFO_T *type_info;
   unsigned long long total;
   const unsigned long long max_bytes = (unsigned long long)NC_MAX_UINT;
   size_t type_len;
   int d;
   int retval;

   if ((retval = nc4_get_typelen_mem(grp->file->nc4_info, var->xtype, 0, &type_len)))
      return retval;
   if ((retval = nc4_find_type(grp->file->nc4_info, var->xtype, &type_info)))
      return retval;

   if (type_info && type_info->class == NC_VLEN)
      total = sizeof(hvl_t);
   else
      total = type_len;

   /* Reject non-positive chunk lengths first, so that NC_EINVAL keeps
    * precedence over NC_EBADCHUNK no matter where the bad length is. */
   for (d = 0; d < var->ndims; d++)
      if (chunksizes[d] < 1)
         return NC_EINVAL;

   /* Accumulate the byte count, testing BEFORE each multiplication.
    * A plain running product can overflow for large chunk lengths and
    * then wrap to a small value that slips past the size check. */
   for (d = 0; d < var->ndims; d++)
   {
      if (total && chunksizes[d] > max_bytes / total)
         return NC_EBADCHUNK;
      total *= chunksizes[d];
   }

   if (total > max_bytes)
      return NC_EBADCHUNK;

   return NC_NOERR;
}
/* Find the default chunk nelems (i.e. length of chunk along each
* dimension). */
static int
nc4_find_default_chunksizes(NC_VAR_INFO_T *var)
nc4_find_default_chunksizes(NC_GRP_INFO_T *grp, NC_VAR_INFO_T *var)
{
int d;
size_t type_size, num_values = 1, num_unlim = 0;
int retval;
if (var->type_info->nc_typeid == NC_STRING)
type_size = sizeof(char *);
@ -243,6 +275,20 @@ nc4_find_default_chunksizes(NC_VAR_INFO_T *var)
var->chunksizes[d] = var->dim[d]->len;
}
/* But did this add up to a chunk that is too big? */
retval = check_chunksizes(grp, var, var->chunksizes);
if (retval)
{
/* Other error? */
if (retval != NC_EBADCHUNK)
return retval;
/* Chunk is too big! Reduce each dimension by half and try again. */
for ( ; retval == NC_EBADCHUNK; retval = check_chunksizes(grp, var, var->chunksizes))
for (d = 0; d < var->ndims; d++)
var->chunksizes[d] /= 2;
}
return NC_NOERR;
}
@ -409,7 +455,7 @@ nc_def_var_nc4(int ncid, const char *name, nc_type xtype,
if (!(var->chunksizes = malloc(var->ndims * sizeof(size_t))))
return NC_ENOMEM;
if ((retval = nc4_find_default_chunksizes(var)))
if ((retval = nc4_find_default_chunksizes(grp, var)))
return retval;
/* Is this a variable with a chunksize greater than the current
@ -764,28 +810,9 @@ nc_def_var_extra(int ncid, int varid, int *shuffle, int *deflate,
* big, and that their total size of chunk is less than 4 GB. */
if (chunksizes)
{
NC_TYPE_INFO_T *type_info;
long long total;
size_t type_len;
if ((retval = nc4_get_typelen_mem(grp->file->nc4_info, var->xtype,
0, &type_len)))
if ((retval = check_chunksizes(grp, var, chunksizes)))
return retval;
if ((retval = nc4_find_type(grp->file->nc4_info, var->xtype, &type_info)))
return retval;
if (type_info && type_info->class == NC_VLEN)
total = sizeof(hvl_t);
else
total = type_len;
for (d = 0; d < var->ndims; d++)
{
if (chunksizes[d] < 1)
return NC_EBADCHUNK;
total *= chunksizes[d];
}
if (total > NC_MAX_UINT)
return NC_EBADCHUNK;
/* Set the chunksizes for this variable. */
for (d = 0; d < var->ndims; d++)
@ -799,7 +826,7 @@ nc_def_var_extra(int ncid, int varid, int *shuffle, int *deflate,
{
/* Determine default chunksizes for this variable. */
if (!var->chunksizes[0])
if ((retval = nc4_find_default_chunksizes(var)))
if ((retval = nc4_find_default_chunksizes(grp, var)))
return retval;
/* Adjust the cache. */

View File

@ -7,9 +7,8 @@
*/
#include <nc_tests.h>
#include "netcdf.h"
#define FILE_NAME "tst_vars3.nc"
#define FILE_NAME "tst_chunks.nc"
#define NDIMS1 1
#define D_SMALL "small_dim"
#define D_SMALL_LEN 16
@ -25,7 +24,7 @@ int
main(int argc, char **argv)
{
printf("\n*** Testing netcdf-4 variable functions, some more.\n");
printf("\n*** Testing netcdf-4 variable chunking.\n");
printf("**** testing that fixed vars with filter end up being chunked, with good sizes...");
{

257
libsrc4/tst_chunks2.c Normal file
View File

@ -0,0 +1,257 @@
/* This is part of the netCDF package.
Copyright 2011 University Corporation for Atmospheric Research/Unidata
See COPYRIGHT file for conditions of use.
Test netcdf-4 chunking.
*/
#include <nc_tests.h>
#define FILE_NAME "tst_chunks2.nc"
/* Calculate the waste of the chunking.
 *
 * The waste is the fraction of the chunked storage that is padding:
 * (chunked - unchunked) / chunked, where "chunked" is the number of
 * elements covered by whole chunks and "unchunked" is the number of
 * elements in the data itself. A value of 0.1 therefore means 10% of
 * the chunked space holds no data. A dimension length of 0 is counted
 * as 1, i.e. one record of it.
 *
 * ndims     - number of dimensions (must be positive).
 * dimlen    - array of ndims dimension lengths.
 * chunksize - array of ndims chunk lengths.
 * waste     - receives the wasted fraction, in the range [0, 1).
 *
 * Also prints the sizes and a per-dimension table to stdout.
 *
 * Returns 0 on success, non-zero if memory could not be allocated or
 * a chunk length is 0.
 */
static int
calculate_waste(int ndims, size_t *dimlen, size_t *chunksize, float *waste)
{
   int d;
   /* Element counts are kept in floating point: the product of several
    * large dimension lengths does not fit in a size_t. */
   double chunked = 1, unchunked = 1;
   size_t *num_chunks;

   assert(waste && dimlen && chunksize && ndims > 0);

   if (!(num_chunks = calloc(ndims, sizeof(size_t))))
      return 1;

   for (d = 0; d < ndims; d++)
   {
      size_t len = dimlen[d] ? dimlen[d] : 1;

      /* A zero chunk length can never cover the dimension. */
      if (!chunksize[d])
      {
         free(num_chunks);
         return 1;
      }

      /* How many chunks along this dimension are required? (Ceiling
       * division, written so that it cannot overflow.) */
      num_chunks[d] = len / chunksize[d] + (len % chunksize[d] ? 1 : 0);

      /* Total space taken up by the chunked data ... */
      chunked *= (double)num_chunks[d] * (double)chunksize[d];

      /* ... and the minimum space required for this data
       * (i.e. unchunked) or one record of it. */
      unchunked *= (double)len;
   }

   printf("\n");
   printf("size for unchunked %.0f size for chunked %.0f\n", unchunked, chunked);

   *waste = (float)((chunked - unchunked) / chunked);

   printf("\ndimlen\tchunksize\tnum_chunks\n");
   for (d = 0; d < ndims; d++)
      printf("%zu\t%zu\t\t%zu\n", dimlen[d], chunksize[d], num_chunks[d]);
   printf("wasted space: %2.2f%%\n", *waste * 100.0);

   free(num_chunks);
   return 0;
}
/* Test driver for default chunk sizes.
 *
 * Runs three tests, each of which defines a 3D NC_FLOAT variable,
 * turns on chunking without supplying chunk sizes, and reads back the
 * chunk sizes the library chose:
 *   1. a small 2 x 5 x 3000 variable, checked both before closing and
 *      after re-opening the file;
 *   2. a 1 x 11 x 152750 variable;
 *   3. a variable whose three dimensions are each around 1e9 long.
 * The wasted space is computed and printed for each, but the check
 * against MAX_WASTE is currently commented out.
 *
 * Failures are recorded through the ERR / SUMMARIZE_ERR / FINAL_RESULTS
 * macros from nc_tests.h.
 */
int
main(int argc, char **argv)
{
   printf("\n*** Testing netcdf-4 variable chunking.\n");
   printf("**** testing default chunksizes...");
   {
#define NDIMS3 3
#define NUM_VARS 1
#define Y_NAME "y"
#define X_NAME "x"
#define Z_NAME "z"
#define VAR_NAME_JOE "joe"
#define XDIM_LEN 2
#define YDIM_LEN 5
#define ZDIM_LEN 3000
#define MAX_WASTE 0.1

      int varid, ncid, dims[NDIMS3], dims_in[NDIMS3];
      int ndims, nvars, ngatts, unlimdimid, natts;
      char name_in[NC_MAX_NAME + 1];
      nc_type type_in;
      size_t len_in[NDIMS3];
      int storage = 0;
      size_t chunksizes[NDIMS3];
      float waste = 0;

      /* Create a file with 3D var, turn on chunking, but don't provide chunksizes. */
      if (nc_create(FILE_NAME, NC_NETCDF4 | NC_CLOBBER, &ncid)) ERR;
      if (nc_def_dim(ncid, X_NAME, XDIM_LEN, &dims[0])) ERR;
      if (nc_def_dim(ncid, Y_NAME, YDIM_LEN, &dims[1])) ERR;
      if (nc_def_dim(ncid, Z_NAME, ZDIM_LEN, &dims[2])) ERR;
      if (nc_def_var(ncid, VAR_NAME_JOE, NC_FLOAT, NDIMS3, dims, &varid)) ERR;
      if (nc_def_var_chunking(ncid, varid, NC_CHUNKED, NULL)) ERR;

      /* Check it out. */
      if (nc_inq(ncid, &ndims, &nvars, &ngatts, &unlimdimid)) ERR;
      if (nvars != NUM_VARS || ndims != NDIMS3 || ngatts != 0 || unlimdimid != -1) ERR;
      if (nc_inq_var(ncid, varid, name_in, &type_in, &ndims, dims_in, &natts)) ERR;
      if (strcmp(name_in, VAR_NAME_JOE) || type_in != NC_FLOAT || ndims != NDIMS3 ||
          dims_in[0] != dims[0] || dims_in[1] != dims[1] || dims_in[2] != dims[2] || natts != 0) ERR;
      if (nc_inq_dim(ncid, 0, name_in, &len_in[0])) ERR;
      if (strcmp(name_in, X_NAME) || len_in[0] != XDIM_LEN) ERR;
      if (nc_inq_dim(ncid, 1, name_in, &len_in[1])) ERR;
      if (strcmp(name_in, Y_NAME) || len_in[1] != YDIM_LEN) ERR;
      if (nc_inq_dim(ncid, 2, name_in, &len_in[2])) ERR;
      if (strcmp(name_in, Z_NAME) || len_in[2] != ZDIM_LEN) ERR;
      if (nc_inq_var_chunking(ncid, varid, &storage, chunksizes)) ERR;
      if (storage != NC_CHUNKED) ERR;
      if (nc_close(ncid)) ERR;

      /* Open the file and check again. */
      if (nc_open(FILE_NAME, NC_WRITE, &ncid)) ERR;
      if (nc_inq(ncid, &ndims, &nvars, &ngatts, &unlimdimid)) ERR;
      if (nvars != NUM_VARS || ndims != NDIMS3 || ngatts != 0 || unlimdimid != -1) ERR;
      if (nc_inq_var(ncid, varid, name_in, &type_in, &ndims, dims_in, &natts)) ERR;
      if (strcmp(name_in, VAR_NAME_JOE) || type_in != NC_FLOAT || ndims != NDIMS3 ||
          dims_in[0] != dims[0] || dims_in[1] != dims[1] || dims_in[2] != dims[2] || natts != 0) ERR;
      if (nc_inq_dim(ncid, 0, name_in, &len_in[0])) ERR;
      if (strcmp(name_in, X_NAME) || len_in[0] != XDIM_LEN) ERR;
      if (nc_inq_dim(ncid, 1, name_in, &len_in[1])) ERR;
      if (strcmp(name_in, Y_NAME) || len_in[1] != YDIM_LEN) ERR;
      if (nc_inq_dim(ncid, 2, name_in, &len_in[2])) ERR;
      if (strcmp(name_in, Z_NAME) || len_in[2] != ZDIM_LEN) ERR;
      if (nc_inq_var_chunking(ncid, varid, &storage, chunksizes)) ERR;
      if (storage != NC_CHUNKED) ERR;
      if (calculate_waste(NDIMS3, len_in, chunksizes, &waste)) ERR;
      /*if (waste > MAX_WASTE) ERR;*/
      if (nc_close(ncid)) ERR;
   }
   SUMMARIZE_ERR;
   printf("**** testing default chunksizes some more for a 3D var...");
   {
#define NDIMS3 3
#define VAR_NAME "op-amp"

      int varid, ncid;
      int dimids[NDIMS3];
      size_t dim_len[NDIMS3] = {1, 11, 152750};
      int storage = 0;
      size_t chunksizes[NDIMS3];
      int d;
      char dim_name[NC_MAX_NAME + 1];
      float waste = 0;

      if (nc_create(FILE_NAME, NC_NETCDF4 | NC_CLOBBER, &ncid)) ERR;

      /* Create a few dimensions. */
      for (d = 0; d < NDIMS3; d++)
      {
         sprintf(dim_name, "dim_%d", d);
         if (nc_def_dim(ncid, dim_name, dim_len[d], &dimids[d])) ERR;
      }

      /* Define a var with these dimensions, and turn on chunking. */
      if (nc_def_var(ncid, VAR_NAME, NC_FLOAT, NDIMS3, dimids, &varid)) ERR;
      if (nc_def_var_chunking(ncid, varid, NC_CHUNKED, NULL)) ERR;

      /* Check how default chunking worked. */
      if (nc_inq_var_chunking(ncid, varid, &storage, chunksizes)) ERR;
      if (storage != NC_CHUNKED) ERR;
      if (calculate_waste(NDIMS3, dim_len, chunksizes, &waste)) ERR;
      /* if (waste > MAX_WASTE) ERR;*/
      if (nc_close(ncid)) ERR;

      /* Open the file and check. */
      if (nc_open(FILE_NAME, NC_WRITE, &ncid)) ERR;
      if (nc_close(ncid)) ERR;
   }
   SUMMARIZE_ERR;
   printf("**** testing default chunksizes for very large 3D var...");
   {
#define NDIMS3 3

      int varid, ncid;
      int dimids[NDIMS3];
      /* Full-length chunks of this variable would be far larger than
       * 4 GB, so the default chunk sizes have to be reduced. */
      size_t dim_len[NDIMS3] = {1804289383, 846930886, 1681692777};
      int storage = 0;
      size_t chunksizes[NDIMS3];
      int d;
      char dim_name[NC_MAX_NAME + 1];
      float waste = 0;

      if (nc_create(FILE_NAME, NC_NETCDF4 | NC_CLOBBER, &ncid)) ERR;

      /* Create a few dimensions. */
      for (d = 0; d < NDIMS3; d++)
      {
         sprintf(dim_name, "dim_%d", d);
         if (nc_def_dim(ncid, dim_name, dim_len[d], &dimids[d])) ERR;
      }

      /* Define a var with these dimensions, and turn on chunking. */
      if (nc_def_var(ncid, VAR_NAME, NC_FLOAT, NDIMS3, dimids, &varid)) ERR;
      if (nc_def_var_chunking(ncid, varid, NC_CHUNKED, NULL)) ERR;

      /* Check how default chunking worked. */
      if (nc_inq_var_chunking(ncid, varid, &storage, chunksizes)) ERR;
      if (storage != NC_CHUNKED) ERR;
      if (calculate_waste(NDIMS3, dim_len, chunksizes, &waste)) ERR;
      /* if (waste > MAX_WASTE) ERR;*/
      if (nc_close(ncid)) ERR;

      /* Open the file and check. */
      if (nc_open(FILE_NAME, NC_WRITE, &ncid)) ERR;
      if (nc_close(ncid)) ERR;
   }
   SUMMARIZE_ERR;
   /* Disabled test: default chunksizes for randomly sized 3D vars. */
/*    printf("**** testing default chunksizes some randomly sized 3D vars..."); */
/*    { */
/* #define NDIMS3 3 */
/* #define NUM_TESTS 3 */
/*       int varid, ncid, dims[NDIMS3], dims_in[NDIMS3]; */
/*       int dimids[NDIMS3]; */
/*       size_t dim_len[NDIMS3]; */
/*       int ndims, nvars, ngatts, unlimdimid, natts; */
/*       char name_in[NC_MAX_NAME + 1]; */
/*       nc_type type_in; */
/*       size_t len_in; */
/*       int storage = 0; */
/*       size_t chunksizes[NDIMS3]; */
/*       int d, t; */
/*       char dim_name[NC_MAX_NAME + 1]; */
/*       float waste; */
/*       for (t = 0; t < NUM_TESTS; t++) */
/*       { */
/*          if (nc_create(FILE_NAME, NC_NETCDF4 | NC_CLOBBER, &ncid)) ERR; */
/*          /\* Create a few dimensions. *\/ */
/*          for (d = 0; d < NDIMS3; d++) */
/*          { */
/*             dim_len[d] = rand(); */
/*             sprintf(dim_name, "dim_%d", d); */
/*             if (nc_def_dim(ncid, dim_name, dim_len[d], &dimids[d])) ERR; */
/*          } */
/*          /\* Define a var with these dimensions, and turn on chunking. *\/ */
/*          if (nc_def_var(ncid, VAR_NAME, NC_FLOAT, NDIMS3, dimids, &varid)) ERR; */
/*          if (nc_def_var_chunking(ncid, varid, NC_CHUNKED, NULL)) ERR; */
/*          /\* Check how well default chunking worked. *\/ */
/*          if (nc_inq_var_chunking(ncid, varid, &storage, chunksizes)) ERR; */
/*          if (storage != NC_CHUNKED) ERR; */
/*          if (calculate_waste(NDIMS3, dim_len, chunksizes, &waste)) ERR; */
/*          /\* if (waste > MAX_WASTE) ERR;*\/ */
/*          if (nc_close(ncid)) ERR; */
/*       } */
/*    } */
/*    SUMMARIZE_ERR; */
   FINAL_RESULTS;
}

View File

@ -10,15 +10,6 @@
#define FILE_NAME "tst_vars4.nc"
/* Report a failed netCDF call and terminate the program.
 *
 * stat - status code to examine; NC_NOERR means success and the
 *        function returns without doing anything.
 * line - source line number to include in the message.
 * file - source file name to include in the message.
 *
 * For any other status, the location and the nc_strerror() text are
 * written to stderr and the process exits with status 1.
 */
void
check_err(const int stat, const int line, const char *file) {
   if (stat == NC_NOERR)
      return;

   (void)fprintf(stderr,"line %d of %s: %s\n", line, file, nc_strerror(stat));
   fflush(stderr);
   exit(1);
}
int
main(int argc, char **argv)
{
@ -70,104 +61,6 @@ main(int argc, char **argv)
if (nc_close(ncid)) ERR;
}
SUMMARIZE_ERR;
printf("**** testing setting chunking without chunksizes...");
{
#define NDIMS3 3
#define NUM_VARS 1
#define Y_NAME "y"
#define X_NAME "x"
#define Z_NAME "z"
#define JOE_NAME "joe"
#define XDIM_LEN 2
#define YDIM_LEN 5
#define ZDIM_LEN 3000
int varid, ncid, dims[NDIMS3], dims_in[NDIMS3];
int ndims, nvars, ngatts, unlimdimid, natts;
char name_in[NC_MAX_NAME + 1];
nc_type type_in;
size_t len_in;
int storage = 0;
size_t chunksizes[NDIMS3];
if (nc_create(FILE_NAME, NC_NETCDF4 | NC_CLOBBER, &ncid)) ERR;
if (nc_def_dim(ncid, X_NAME, XDIM_LEN, &dims[0])) ERR;
if (nc_def_dim(ncid, Y_NAME, YDIM_LEN, &dims[1])) ERR;
if (nc_def_dim(ncid, Z_NAME, ZDIM_LEN, &dims[2])) ERR;
if (nc_def_var(ncid, JOE_NAME, NC_FLOAT, NDIMS3, dims, &varid)) ERR;
if (nc_def_var_chunking(ncid, 0, NC_CHUNKED, NULL)) ERR;
if (nc_inq(ncid, &ndims, &nvars, &ngatts, &unlimdimid)) ERR;
if (nvars != NUM_VARS || ndims != NDIMS3 || ngatts != 0 || unlimdimid != -1) ERR;
if (nc_inq_var(ncid, 0, name_in, &type_in, &ndims, dims_in, &natts)) ERR;
if (strcmp(name_in, JOE_NAME) || type_in != NC_FLOAT || ndims != NDIMS3 ||
dims_in[0] != dims[0] || dims_in[1] != dims[1] || dims_in[2] != dims[2] || natts != 0) ERR;
if (nc_inq_dim(ncid, 0, name_in, &len_in)) ERR;
if (strcmp(name_in, X_NAME) || len_in != XDIM_LEN) ERR;
if (nc_inq_dim(ncid, 1, name_in, &len_in)) ERR;
if (strcmp(name_in, Y_NAME) || len_in != YDIM_LEN) ERR;
if (nc_inq_dim(ncid, 2, name_in, &len_in)) ERR;
if (strcmp(name_in, Z_NAME) || len_in != ZDIM_LEN) ERR;
if (nc_inq_var_chunking(ncid, 0, &storage, chunksizes)) ERR;
if (storage != NC_CHUNKED) ERR;
if (nc_close(ncid)) ERR;
/* Open the file and check. */
if (nc_open(FILE_NAME, NC_WRITE, &ncid)) ERR;
if (nc_inq(ncid, &ndims, &nvars, &ngatts, &unlimdimid)) ERR;
if (nvars != NUM_VARS || ndims != NDIMS3 || ngatts != 0 || unlimdimid != -1) ERR;
if (nc_inq_var(ncid, 0, name_in, &type_in, &ndims, dims_in, &natts)) ERR;
if (strcmp(name_in, JOE_NAME) || type_in != NC_FLOAT || ndims != NDIMS3 ||
dims_in[0] != dims[0] || dims_in[1] != dims[1] || dims_in[2] != dims[2] || natts != 0) ERR;
if (nc_inq_dim(ncid, 0, name_in, &len_in)) ERR;
if (strcmp(name_in, X_NAME) || len_in != XDIM_LEN) ERR;
if (nc_inq_dim(ncid, 1, name_in, &len_in)) ERR;
if (strcmp(name_in, Y_NAME) || len_in != YDIM_LEN) ERR;
if (nc_inq_dim(ncid, 2, name_in, &len_in)) ERR;
if (strcmp(name_in, Z_NAME) || len_in != ZDIM_LEN) ERR;
if (nc_inq_var_chunking(ncid, 0, &storage, chunksizes)) ERR;
if (storage != NC_CHUNKED) ERR;
if (nc_close(ncid)) ERR;
}
SUMMARIZE_ERR;
printf("**** testing default chunksizes...");
{
# define RANK_M 3
# define RANK_time 1
int ncid;
size_t nsets_len = NC_UNLIMITED;
size_t npoints_len = 152750;
size_t n_variables_len = 11;
int measurements_id, time_id, measurements_dims[RANK_M];
int time_dims[RANK_time], time_data[1] = {1} ;
size_t time_startset[1] = {0} ;
size_t time_countset[1] = {1} ;
int ndims, nvars, ngatts, unlimdimid;
int storage;
size_t chunksizes[RANK_M];
/* Create a netCDF-4 file with two vars, and write some data to one of them. */
if (nc_create(FILE_NAME, NC_CLOBBER|NC_NETCDF4, &ncid)) ERR;
if (nc_def_dim(ncid, "nsets", nsets_len, &measurements_dims[0])) ERR;
if (nc_def_dim(ncid, "npoints", npoints_len, &measurements_dims[1])) ERR;
if (nc_def_dim(ncid, "n_variables", n_variables_len, &measurements_dims[2])) ERR;
/* nc_set_log_level(4);*/
if (nc_def_var(ncid, "measurements", NC_FLOAT, RANK_M, measurements_dims,
&measurements_id)) ERR;
time_dims[0] = measurements_dims[0];
if (nc_def_var(ncid, "time", NC_INT, RANK_time, time_dims, &time_id)) ERR;
if (nc_put_vara(ncid, time_id, time_startset, time_countset, time_data)) ERR;
if (nc_close(ncid)) ERR;
/* Reopen and check file. */
if (nc_open(FILE_NAME, NC_WRITE, &ncid)) ERR;
if (nc_inq(ncid, &ndims, &nvars, &ngatts, &unlimdimid)) ERR;
if (nvars != 2 || ndims != 3 || ngatts != 0 || unlimdimid != 0) ERR;
if (nc_inq_var_chunking(ncid, 0, &storage, chunksizes)) ERR;
if (storage != NC_CHUNKED) ERR;
if (nc_close(ncid)) ERR;
}
SUMMARIZE_ERR;
FINAL_RESULTS;
}