Add hash field to dim and var to facilitate fast name compare

In non-classic netcdf-4 models, a file is allowed to contain
large numbers of dims and vars.  Many operations search the
entire list of dims or vars for a dim/var matching
a specific name, which results in *lots* of strncmp or strcmp
calls.

If we add a hash field to the var and dim structs, similar to what
has already been done for the netcdf-3 formats, then we can hash the
name being searched for and numerically compare that value with
the var/dim hash value.  Only if the hashes match do we make the more
expensive strncmp/strcmp call to ensure that the names truly match.
This commit is contained in:
Greg Sjaardema 2016-03-03 13:18:31 -07:00
parent ba06e979c6
commit 1a84a6a99e
5 changed files with 34 additions and 9 deletions

View File

@ -113,6 +113,7 @@ typedef struct NC_DIM_INFO
NC_LIST_NODE_T l; /* Use generic doubly-linked list (must be first) */
char *name;
size_t len;
uint32_t hash;
int dimid;
nc_bool_t unlimited; /* True if the dimension is unlimited */
nc_bool_t extended; /* True if the dimension needs to be extended */
@ -148,6 +149,7 @@ typedef struct NC_VAR_INFO
NC_DIM_INFO_T **dim;
int varid;
int natts;
uint32_t hash;
nc_bool_t is_new_var; /* True if variable is newly created */
nc_bool_t was_coord_var; /* True if variable was a coordinate var, but either the dim or var has been renamed */
nc_bool_t became_coord_var; /* True if variable _became_ a coordinate var, because either the dim or var has been renamed */

View File

@ -73,6 +73,7 @@ NC4_def_dim(int ncid, const char *name, size_t len, int *idp)
NC_DIM_INFO_T *dim;
char norm_name[NC_MAX_NAME + 1];
int retval = NC_NOERR;
uint32_t nn_hash;
LOG((2, "%s: ncid 0x%x name %s len %d", __func__, ncid, name,
(int)len));
@ -122,9 +123,11 @@ NC4_def_dim(int ncid, const char *name, size_t len, int *idp)
if(len > X_UINT_MAX) /* Backward compat */
return NC_EDIMSIZE;
nn_hash = hash_fast(norm_name, strlen(norm_name));
/* Make sure the name is not already in use. */
for (dim = grp->dim; dim; dim = dim->l.next)
if (!strncmp(dim->name, norm_name, NC_MAX_NAME))
if (nn_hash == dim->hash && !strncmp(dim->name, norm_name, NC_MAX_NAME))
return NC_ENAMEINUSE;
/* Add a dimension to the list. The ID must come from the file
@ -139,6 +142,8 @@ NC4_def_dim(int ncid, const char *name, size_t len, int *idp)
if (len == NC_UNLIMITED)
dim->unlimited = NC_TRUE;
dim->hash = nn_hash;
/* Pass back the dimid. */
if (idp)
*idp = dim->dimid;
@ -157,7 +162,8 @@ NC4_inq_dimid(int ncid, const char *name, int *idp)
char norm_name[NC_MAX_NAME + 1];
int finished = 0;
int retval;
uint32_t shash;
LOG((2, "%s: ncid 0x%x name %s", __func__, ncid, name));
/* Find metadata for this file. */
@ -177,10 +183,12 @@ NC4_inq_dimid(int ncid, const char *name, int *idp)
if ((retval = nc4_normalize_name(name, norm_name)))
return retval;
shash = hash_fast(norm_name, strlen(norm_name));
/* Go through each dim and check for a name match. */
for (g = grp; g && !finished; g = g->parent)
for (dim = g->dim; dim; dim = dim->l.next)
if (!strncmp(dim->name, norm_name, NC_MAX_NAME))
if (dim->hash == shash && !strncmp(dim->name, norm_name, NC_MAX_NAME))
{
if (idp)
*idp = dim->dimid;
@ -336,6 +344,8 @@ NC4_rename_dim(int ncid, int dimid, const char *name)
return NC_ENOMEM;
strcpy(dim->name, norm_name);
dim->hash = hash_fast(norm_name, strlen(norm_name));
/* Check if dimension was a coordinate variable, but names are different now */
if (dim->coord_var && strcmp(dim->name, dim->coord_var->name))
{

View File

@ -606,6 +606,7 @@ read_scale(NC_GRP_INFO_T *grp, hid_t datasetid, const char *obj_name,
new_dim->hdf5_objid.fileno[1] = statbuf->fileno[1];
new_dim->hdf5_objid.objno[0] = statbuf->objno[0];
new_dim->hdf5_objid.objno[1] = statbuf->objno[1];
new_dim->hash = hash_fast(obj_name, strlen(obj_name));
/* If the dimscale has an unlimited dimension, then this dimension
* is unlimited. */
@ -1564,6 +1565,7 @@ read_var(NC_GRP_INFO_T *grp, hid_t datasetid, const char *obj_name,
strcpy(var->name, obj_name);
}
var->hash = hash_fast(var->name, strlen(var->name));
/* Find out what filters are applied to this HDF5 dataset,
* fletcher32, deflate, and/or shuffle. All other filters are
* ignored. */
@ -2672,6 +2674,7 @@ nc4_open_hdf4_file(const char *path, int mode, NC *nc)
dim->len = dim_len;
else
dim->len = *dimsize;
dim->hash = hash_fast(dim_name, strlen(dim_name));
}
/* Tell the variable the id of this dimension. */

View File

@ -764,7 +764,8 @@ nc4_check_dup_name(NC_GRP_INFO_T *grp, char *name)
NC_TYPE_INFO_T *type;
NC_GRP_INFO_T *g;
NC_VAR_INFO_T *var;
uint32_t hash;
/* Any types of this name? */
for (type = grp->type; type; type = type->l.next)
if (!strcmp(type->name, name))
@ -776,8 +777,9 @@ nc4_check_dup_name(NC_GRP_INFO_T *grp, char *name)
return NC_ENAMEINUSE;
/* Any variables of this name? */
hash = hash_fast(name, strlen(name));
for (var = grp->var; var; var = var->l.next)
if (!strcmp(var->name, name))
if (var->hash == hash && !strcmp(var->name, name))
return NC_ENAMEINUSE;
return NC_NOERR;

View File

@ -425,6 +425,7 @@ nc_def_var_nc4(int ncid, const char *name, nc_type xtype,
if (!(var->name = malloc((strlen(norm_name) + 1) * sizeof(char))))
BAIL(NC_ENOMEM);
strcpy(var->name, norm_name);
var->hash = hash_fast(norm_name, strlen(norm_name));
var->varid = grp->nvars++;
var->ndims = ndims;
var->is_new_var = NC_TRUE;
@ -513,7 +514,7 @@ nc_def_var_nc4(int ncid, const char *name, nc_type xtype,
BAIL(retval);
/* Check for dim index 0 having the same name, in the same group */
if (d == 0 && dim_grp == grp && strcmp(dim->name, norm_name) == 0)
if (d == 0 && dim_grp == grp && dim->hash == var->hash && strcmp(dim->name, norm_name) == 0)
{
var->dimscale = NC_TRUE;
dim->coord_var = var;
@ -571,7 +572,7 @@ nc_def_var_nc4(int ncid, const char *name, nc_type xtype,
* because the dimension will cause a HDF5 dataset to be created,
* and this var has the same name. */
for (dim = grp->dim; dim; dim = dim->l.next)
if (!strcmp(dim->name, norm_name) &&
if (dim->hash == var->hash && !strcmp(dim->name, norm_name) &&
(!var->ndims || dimidsp[0] != dim->dimid))
{
/* Set a different hdf5 name for this variable to avoid name
@ -1152,6 +1153,8 @@ NC4_inq_varid(int ncid, const char *name, int *varidp)
NC_VAR_INFO_T *var;
char norm_name[NC_MAX_NAME + 1];
int retval;
uint32_t nn_hash;
#if 0 /*def USE_PNETCDF*/
NC_HDF5_FILE_INFO_T *h5;
#endif
@ -1181,9 +1184,11 @@ NC4_inq_varid(int ncid, const char *name, int *varidp)
if ((retval = nc4_normalize_name(name, norm_name)))
return retval;
nn_hash = hash_fast(norm_name, strlen(norm_name));
/* Find var of this name. */
for (var = grp->var; var; var = var->l.next)
if (!(strcmp(var->name, norm_name)))
if (nn_hash == var->hash && !(strcmp(var->name, norm_name)))
{
*varidp = var->varid;
return NC_NOERR;
@ -1203,6 +1208,7 @@ NC4_rename_var(int ncid, int varid, const char *name)
NC_GRP_INFO_T *grp;
NC_HDF5_FILE_INFO_T *h5;
NC_VAR_INFO_T *var, *tmp_var;
uint32_t nn_hash;
int retval = NC_NOERR;
LOG((2, "%s: ncid 0x%x varid %d name %s",
@ -1234,10 +1240,11 @@ NC4_rename_var(int ncid, int varid, const char *name)
return retval;
/* Check if name is in use, and retain a pointer to the correct variable */
nn_hash = hash_fast(name, strlen(name));
tmp_var = NULL;
for (var = grp->var; var; var = var->l.next)
{
if (!strncmp(var->name, name, NC_MAX_NAME))
if (nn_hash == var->hash && !strncmp(var->name, name, NC_MAX_NAME))
return NC_ENAMEINUSE;
if (var->varid == varid)
tmp_var = var;
@ -1265,6 +1272,7 @@ NC4_rename_var(int ncid, int varid, const char *name)
if (!(var->name = malloc((strlen(name) + 1) * sizeof(char))))
return NC_ENOMEM;
strcpy(var->name, name);
var->hash = nn_hash;
/* Check if this was a coordinate variable previously, but names are different now */
if (var->dimscale && strcmp(var->name, var->dim[0]->name))