Added and documented nccopy options to specify chunk cache. Fixed nccopy bug of not permitting chunk lengths >= dimension lengths. Added experimental (undocumented) option for computing adequate chunk cache.

This commit is contained in:
Russ Rew 2011-07-12 19:06:00 +00:00
parent 17d09ddac5
commit f10a142411
2 changed files with 69 additions and 46 deletions

View File

@ -14,6 +14,8 @@ nccopy
\%[-s]
\%[-u]
\%[-m \fI bufsize \fP]
\%[-h \fI chunk_cache \fP]
\%[-e \fI cache_elems \fP]
\%\fI infile \fP
\%\fI outfile \fP
.hy
@ -113,20 +115,44 @@ for variables that use the `m' and `n' dimensions might be
resulting from copying with a chunkspec, use the `-s'
option of ncdump on the output file.
.IP "\fB -m \fP \fI bufsize \fP"
Specifies the size, in bytes, of the copy buffer used
to copy large variables, by copying them in smaller pieces, each no
larger than \fI bufsize \fP. A suffix of k, m, or g multiplies
the copy buffer size by one thousand, million, or billion, respectively.
The default is 5000000 bytes,
An integer or floating-point number that specifies the size, in bytes,
of the copy buffer used
to copy large variables. A suffix of K, M, G, or T multiplies
the copy buffer size by one thousand, million, billion, or trillion, respectively.
The default is 5,000,000 bytes,
but will be increased if necessary to hold at least one chunk of
netCDF-4 chunked variables in the input file. You may want to specify
a value larger than the default for OPeNDAP copies of large files over high
a value larger than the default for copying large files over high
latency networks.
.IP "\fB -h \fP \fI chunk_cache \fP"
An integer or floating-point number that specifies the size in bytes
of the chunk cache for chunked variables. This is
not a property of the file, but merely a performance tuning parameter
for avoiding compressing or decompressing the same data multiple times
while copying and changing chunk shapes. A suffix of K, M, G, or T multiplies
the chunk cache size by one thousand, million, billion, or trillion, respectively.
The default is 4,194,304 (or whatever was specified for the
configure-time constant CHUNK_CACHE_SIZE when the netCDF library was
built). Ideally, the nccopy utility should accept only one memory
buffer size and divide it optimally between a copy buffer and chunk
cache, but no general algorithm for computing the optimum chunk cache
size has been implemented yet.
.IP "\fB -e \fP \fI cache_elems \fP"
Specifies the number of elements that the chunk cache can hold. This is
not a property of the file, but merely a performance tuning parameter
for avoiding compressing or decompressing the same data multiple times
while copying and changing chunk shapes. The default is 1009 (or
whatever was specified for the configure-time constant
CHUNK_CACHE_NELEMS when the netCDF library was built). Ideally, the
nccopy utility should determine an optimum value for this parameter,
but no general algorithm for computing the optimum number of chunk
cache elements has been implemented yet.
.P
Note that \fB nccopy \fP requires variables that share a dimension to
also share the chunk size associated with that dimension, but the API
has no such restriction. With a program you can customize chunking
for each variable independently.
has no such restriction. If you need to customize chunking
for each variable independently, you will need to use the library API
in a custom utility program.
.SH EXAMPLES
.LP
Make a copy of foo1.nc, a netCDF file of any type, to foo2.nc, a

View File

@ -39,10 +39,12 @@ static int option_kind = SAME_AS_INPUT;
static int option_deflate_level = -1; /* default, compress output only if input compressed */
static int option_shuffle_vars = NC_NOSHUFFLE; /* default, no shuffling on compression */
static int option_fix_unlimdims = 0; /* default, preserve unlimited dimensions */
static char* option_chunkspec = 0; /* default, no chunk specification */
static size_t option_copy_buffer_size = COPY_BUFFER_SIZE;
static size_t option_chunk_cache_size = CHUNK_CACHE_SIZE; /* default from config.h */
static size_t option_chunk_cache_nelems = CHUNK_CACHE_NELEMS; /* default from config.h */
static int option_global_chunk_cache = 1; /* default, use global chunk cache */
static int option_compute_chunkcaches = 0; /* default, don't try still flaky estimate of
* chunk cache for each variable */
/* get group id in output corresponding to group igrp in input,
* given parent group id (or root group id) parid in output. */
@ -538,8 +540,7 @@ set_var_chunked(int ogrp, int o_varid)
/* Determine if this variable should be chunked. A variable
* should be chunked if any of its dims are in command-line
* chunk spec and if corresponding chunk size is smaller than
* dimension length. It will also be chunked if any of its
* chunk spec. It will also be chunked if any of its
* dims are unlimited. */
for(odim = 0; odim < ndims; odim++) {
int odimid = dimids[odim];
@ -550,7 +551,7 @@ set_var_chunked(int ogrp, int o_varid)
size_t chunksize = chunkspec_size(idimid); /* from chunkspec */
size_t dimlen;
NC_CHECK(nc_inq_dimlen(ogrp, odimid, &dimlen));
if( (chunksize > 0 && chunksize < dimlen) || dimmap_ounlim(odimid)) {
if( (chunksize > 0) || dimmap_ounlim(odimid)) {
chunked = 1;
}
varsize *= dimlen;
@ -899,29 +900,26 @@ copy_var_data(int igrp, int varid, int ogrp) {
int contig = 1;
NC_CHECK(nc_inq_var_chunking(ogrp, ovarid, &contig, NULL));
if(contig == 0) { /* chunked */
if(option_global_chunk_cache) { /* by default, use same
* global chunk cache for
* all chunked
* variables */
NC_CHECK(nc_set_var_chunk_cache(ogrp, ovarid, option_chunk_cache_size,
option_chunk_cache_nelems,
COPY_CHUNKCACHE_PREEMPTION));
} else { /* if experimental "-x" option
* specified, try to estimate
* variable-specific chunk cache,
* depending on specific size and
* shape of this variable's chunks */
if(option_compute_chunkcaches) {
/* Try to estimate variable-specific chunk cache,
* depending on specific size and shape of this
* variable's chunks. This doesn't work yet. */
size_t chunkcache_size, chunkcache_nelems;
float chunkcache_preemption;
NC_CHECK(inq_var_chunking_params(igrp, varid, ogrp, ovarid,
&chunkcache_size,
&chunkcache_nelems,
&chunkcache_preemption));
printf("%s chunkcache_size, chunkcache_nelems: %ld, %ld\n",
varname, chunkcache_size, chunkcache_nelems); /* for debugging */
NC_CHECK(nc_set_var_chunk_cache(ogrp, ovarid, chunkcache_size,
NC_CHECK(nc_set_var_chunk_cache(ogrp, ovarid,
chunkcache_size,
chunkcache_nelems,
chunkcache_preemption));
} else {
/* by default, use same chunk cache for all chunked variables */
NC_CHECK(nc_set_var_chunk_cache(ogrp, ovarid,
option_chunk_cache_size,
option_chunk_cache_nelems,
COPY_CHUNKCACHE_PREEMPTION));
}
}
}
@ -971,8 +969,8 @@ copy_var_data(int igrp, int varid, int ogrp) {
#ifdef USE_NETCDF4
/* We're all done with this input and output variable, so if
* either variable is chunked, free up its variable chunk cache */
NC_CHECK(free_var_chunk_cache(igrp, varid));
NC_CHECK(free_var_chunk_cache(ogrp, ovarid));
/* NC_CHECK(free_var_chunk_cache(igrp, varid)); */
/* NC_CHECK(free_var_chunk_cache(ogrp, ovarid)); */
#endif /* USE_NETCDF4 */
free(start);
free(count);
@ -1046,9 +1044,7 @@ count_dims(ncid) {
* type 1 or 2.
*/
static int
copy(char* infile, char* outfile,
const char* chunkspec_s /* unparsed chunkspec string, from command line */
)
copy(char* infile, char* outfile)
{
int stat = NC_NOERR;
int igrp, ogrp;
@ -1066,7 +1062,7 @@ copy(char* infile, char* outfile,
if (inkind == NC_FORMAT_CLASSIC || inkind == NC_FORMAT_64BIT) {
if (option_deflate_level > 0 ||
option_shuffle_vars == NC_SHUFFLE ||
chunkspec_s)
option_chunkspec)
{
outkind = NC_FORMAT_NETCDF4_CLASSIC;
}
@ -1074,10 +1070,10 @@ copy(char* infile, char* outfile,
}
#ifdef USE_NETCDF4
if(chunkspec_s) {
/* Now that input is open, can parse chunkspec_s into binary
if(option_chunkspec) {
/* Now that input is open, can parse option_chunkspec into binary
* structure. */
NC_CHECK(chunkspec_parse(igrp, chunkspec_s));
NC_CHECK(chunkspec_parse(igrp, option_chunkspec));
}
#endif /* USE_NETCDF4 */
@ -1141,11 +1137,13 @@ usage(void)
[-m n] set size in bytes of copy buffer, default is 5000000 bytes\n\
[-h n] set size in bytes of chunk_cache for chunked variables\n\
[-e n] set number of elements that chunk_cache can hold\n\
[-x] use experimental computed estimates for variable-specific chunk caches\n\
infile name of netCDF input file\n\
outfile name for netCDF output file\n"
error("%s [-k n] [-d n] [-s] [-c chunkspec] [-u] [-m n] [-h n] [-e n] [-x] infile outfile\n%s",
/* Don't document this flaky option until it works better */
/* [-x] use experimental computed estimates for variable-specific chunk caches\n\ */
error("%s [-k n] [-d n] [-s] [-c chunkspec] [-u] [-m n] [-h n] [-e n] infile outfile\n%s",
progname, USAGE);
}
@ -1155,7 +1153,6 @@ main(int argc, char**argv)
char* inputfile = NULL;
char* outputfile = NULL;
int c;
char* chunkspec = 0;
/* table of formats for legal -k values */
struct Kvalues {
@ -1273,16 +1270,16 @@ main(int argc, char**argv)
if(*suffix) {
switch (*suffix) {
case 'k': case 'K':
option_chunk_cache_size *= 1000;
dval *= 1000;
break;
case 'm': case 'M':
option_chunk_cache_size *= 1000000;
dval *= 1000000;
break;
case 'g': case 'G':
option_chunk_cache_size *= 1000000000;
dval *= 1000000000;
break;
case 't': case 'T':
option_chunk_cache_size *= 1.0e12;
dval *= 1.0e12;
break;
default:
error("If suffix used for '-h' option value, it must be K, M, G, or T: %c",
@ -1299,12 +1296,12 @@ main(int argc, char**argv)
}
break;
case 'x': /* use experimental variable-specific chunk caches */
option_global_chunk_cache = 0;
option_compute_chunkcaches = 1;
break;
case 'c': /* optional chunking spec for each dimension in list */
{
/* save chunkspec string for parsing later, once we know input ncid */
chunkspec = strdup(optarg);
option_chunkspec = strdup(optarg);
break;
}
default:
@ -1324,7 +1321,7 @@ main(int argc, char**argv)
error("output would overwrite input");
}
if(copy(inputfile, outputfile, chunkspec) != NC_NOERR)
if(copy(inputfile, outputfile) != NC_NOERR)
exit(1);
return 0;
}