netcdf-c/ncdump/tst_unicode.c
Dennis Heimbigner 0b7a5382e7 Codify cross-platform file paths
The netcdf-c code has to deal with a variety of platforms:
Windows, OSX, Linux, Cygwin, MSYS, etc.  These platforms differ
significantly in the kind of file paths that they accept.  So in
order to handle this, I have created a set of replacements for
the most common file system operations such as _open_ or _fopen_
or _access_ to manage the file path differences correctly.

A more limited version of this idea was already implemented via
the ncwinpath.h and dwinpath.c code. So this can be viewed as a
replacement for that code. And in many cases, the only
change that was required was to replace '#include <ncwinpath.h>'
with '#include <ncpathmgr.h>' and then replace file operation
calls with the NCxxx equivalent from ncpathmgr.h. Note that
recently, ncwinpath.h was renamed ncpathmgr.h, so this pull
request should not require dealing with winpath.

The heart of the change is include/ncpathmgr.h, which provides
alternate operations such as NCfopen or NCaccess and which properly
parse and rebuild path arguments to work for the platform on which
the code is executing. This mostly matters for Windows because of the
way that it uses backslash and drive letters, as compared to *nix*.
One important feature is that the user can do string manipulations
on a file path without having to worry too much about the platform
because the path management code will properly handle most mixed cases.
So one can for example concatenate a path suffix that uses forward
slashes to a Windows path and have it work correctly.

The conversion code is in libdispatch/dpathmgr.c, and the
important function there is NCpathcvt which does the proper
conversions to the local path format.

As a rule, most code should just replace their file operations with
the corresponding NCxxx ones defined in include/ncpathmgr.h. These
NCxxx functions all call NCpathcvt on their path arguments before
executing the actual file operation.

In some rare cases, the client may need to directly use NCpathcvt,
but this should be avoided as much as possible. If there is a need
for supporting a new file operation not already in ncpathmgr.h, then
use the code in dpathmgr.c as a template. Also please notify Unidata
so we can include it as a formal part of our supported operations.
Also, if you see an operation in the library that is not using the
NCxxx form, then please submit an issue so we can fix it.

Misc. Changes:
* Clean up the utf8 testing code; it is impossible to get some
  tests to work under windows using shell scripts; the args do
  not pass as utf8 but as some other encoding.
* Added an extra utf8 test case: test_unicode_path.sh
* Add a true test for HDF5 1.10.6 or later because as noted in
  PR https://github.com/Unidata/netcdf-c/pull/1794,
  HDF5 changed its Windows file path handling.
2021-03-04 13:41:31 -07:00

157 lines
4.7 KiB
C

/* This is part of the netCDF package.
Copyright 2018 University Corporation for Atmospheric Research/Unidata.
See COPYRIGHT file for conditions of use.
This is a very simple example which writes a netCDF file with
Unicode names encoded with UTF-8.
$Id: tst_unicode.c,v 1.12 2008/10/20 01:48:08 ed Exp $
*/
#include <nc_tests.h>
#include "err_macros.h"
#include <stdlib.h>
#include <stdio.h>
#include "netcdf.h"
#include "ncpathmgr.h"
#ifdef _WIN32
#include <windows.h>
#include <direct.h>
#endif
#ifdef HAVE_LOCALE_H
#include <locale.h>
#endif
#define DEBUG
/* Leading part of the name of the data file we will create: the ASCII
   text "tst_utf8_" followed by the three UTF-8 bytes E6 B5 B7 (U+6D77),
   so that the file path itself contains a non-ASCII character. */
static const unsigned char prefix[] =
    "tst_utf8_"
    "\xe6\xb5\xb7";
/* Other meta-data */
/* Name of the text attribute that test() attaches to the variable. */
#define UNITS "units"
/* Rank of the test variable; also the size of the dimids array. */
#define NDIMS 1
/* Length given to the dimension in test().  It equals sizeof name_utf8
   (17 name bytes plus the terminating NUL), the array written as the
   variable's data. */
#define UTF8_BYTES 18
/* Un-normalized UTF-8 spelling of an 8-character Greek word.  The sixth
   character is written as U+1F73, whose NFC form is U+03AD (compare with
   norm_utf8), so the stored name differs from what is passed in. */
static unsigned char name_utf8[] = {
    0xCE, 0x9A,         /* U+039A GREEK CAPITAL LETTER KAPPA       : 2 bytes */
    0xCE, 0xB1,         /* U+03B1 GREEK SMALL LETTER ALPHA         : 2 bytes */
    0xCE, 0xBB,         /* U+03BB GREEK SMALL LETTER LAMDA         : 2 bytes */
    0xCE, 0xB7,         /* U+03B7 GREEK SMALL LETTER ETA           : 2 bytes */
    0xCE, 0xBC,         /* U+03BC GREEK SMALL LETTER MU            : 2 bytes */
    0xE1, 0xBD, 0xB3,   /* U+1F73 GREEK SMALL LETTER EPSILON
                           WITH OXIA                               : 3 bytes */
    0xCF, 0x81,         /* U+03C1 GREEK SMALL LETTER RHO           : 2 bytes */
    0xCE, 0xB1,         /* U+03B1 GREEK SMALL LETTER ALPHA         : 2 bytes */
    0x00                /* NUL terminator (counted by sizeof)                */
};
/* Name used for dimension, variable, and attribute value */
#define UNAME ((char *) name_utf8)
/* Size of the array in bytes, terminating NUL included. */
#define UNAMELEN (sizeof(name_utf8))
/* Note, name was normalized before storing, so retrieved name
won't match original unnormalized name. Check that we get
normalized version, instead. */
/* NFC-normalized UTF-8 for the same 8-character Greek word as name_utf8.
   Every character here takes two bytes; the sixth one is U+03AD. */
static unsigned char norm_utf8[] = {
    0xCE, 0x9A,         /* U+039A GREEK CAPITAL LETTER KAPPA       : 2 bytes */
    0xCE, 0xB1,         /* U+03B1 GREEK SMALL LETTER ALPHA         : 2 bytes */
    0xCE, 0xBB,         /* U+03BB GREEK SMALL LETTER LAMDA         : 2 bytes */
    0xCE, 0xB7,         /* U+03B7 GREEK SMALL LETTER ETA           : 2 bytes */
    0xCE, 0xBC,         /* U+03BC GREEK SMALL LETTER MU            : 2 bytes */
    0xCE, 0xAD,         /* U+03AD GREEK SMALL LETTER EPSILON
                           WITH TONOS                              : 2 bytes */
    0xCF, 0x81,         /* U+03C1 GREEK SMALL LETTER RHO           : 2 bytes */
    0xCE, 0xB1,         /* U+03B1 GREEK SMALL LETTER ALPHA         : 2 bytes */
    0x00                /* NUL terminator (counted by sizeof)                */
};
/* The normalized name as a C string, and its size including the NUL. */
#define NNAME ((char *) norm_utf8)
#define NNAMELEN (sizeof(norm_utf8))
/* Report a failing status on stderr and hand the status back.
 *
 * err  - status code to examine; 0 means success and prints nothing.
 * line - source line to show in the message.
 * file - source file name to show in the message.
 *
 * Returns err unchanged, so the call can be used inside a condition.
 */
static int
check(int err, int line, const char* file)
{
    if(err == 0)
        return err;

    /* Non-zero status: print location, numeric code and its description. */
    fprintf(stderr,"ERR %s.%d (%d) %s\n",file,line,err,nc_strerror(err));
    fflush(stderr);
    return err;
}
/* Pass a status through check(), store the result in the enclosing
   function's `ret`, and jump to its `done` label when it is non-zero.
   The do/while(0) wrapper makes `CHECK(x);` a single statement, so it
   stays correct inside an unbraced if/else. */
#define CHECK(err) do {if((ret=check((err),__LINE__,__FILE__))) goto done;} while(0)
static int
test(int flags, const char* model)
{
int ret = NC_NOERR;
int ncid, dimid, varid;
int dimids[NDIMS];
char name_in[UNAMELEN + 1], strings_in[UNAMELEN + 1];
nc_type att_type;
size_t att_len;
char filename[4096];
/* Construct the file name */
snprintf(filename,sizeof(filename),"%s_%s.nc",prefix,model);
printf("\n*** Testing UTF-8: %s model\n",model);
printf("*** creating UTF-8 test file |%s|...", filename);
CHECK(nc_create(filename, flags, &ncid));
/* Define dimension with Unicode UTF-8 encoded name */
CHECK(nc_def_dim(ncid, UNAME, UTF8_BYTES, &dimid));
dimids[0] = dimid;
/* Define variable with same name */
CHECK(nc_def_var(ncid, UNAME, NC_CHAR, NDIMS, dimids, &varid));
/* Create string attribute with same value */
CHECK(nc_put_att_text(ncid, varid, UNITS, UNAMELEN, UNAME));
CHECK(nc_enddef(ncid));
/* Write string data, UTF-8 encoded, to the file */
CHECK(nc_put_var_text(ncid, varid, UNAME));
CHECK(nc_close(ncid));
/* Check it out. */
/* Reopen the file. */
CHECK(nc_open(filename, NC_NOWRITE, &ncid));
CHECK(nc_inq_varid(ncid, UNAME, &varid));
CHECK(nc_inq_varname(ncid, varid, name_in));
{
if (strncmp(NNAME, name_in, NNAMELEN) != 0)
{CHECK(NC_EBADNAME);}
}
CHECK(nc_inq_att(ncid, varid, UNITS, &att_type, &att_len));
CHECK(att_type != NC_CHAR || att_len != UNAMELEN);
CHECK(nc_get_att_text(ncid, varid, UNITS, strings_in));
strings_in[att_len] = '\0'; /* null terminate, because nc_get_att_text doesn't */
if (strncmp(UNAME, strings_in, UNAMELEN) != 0)
{CHECK(NC_EBADNAME);}
CHECK(nc_close(ncid));
done:
return ret;
}
/* Test driver: runs the UTF-8 round trip for the classic format and, when
   USE_HDF5 is defined, for the netCDF-4 format as well.  ERR, SUMMARIZE_ERR
   and FINAL_RESULTS are macros that are not defined in this file. */
int
main(int argc, char **argv)
{
    int status;

    /* netCDF classic (netcdf-3) format */
    status = test(0,"classic");
    if(status != 0) ERR;

#ifdef USE_HDF5
    /* netCDF-4 (enhanced) format */
    status = test(NC_NETCDF4,"enhanced");
    if(status != 0) ERR;
#endif

    SUMMARIZE_ERR;
    FINAL_RESULTS;
}