[NCF-293]

Allow .cdl files to have a leading UTF-8 BOM.
Also add a test.
This commit is contained in:
dmh 2014-03-07 22:52:40 -07:00
parent deeca5fb83
commit baade3e4fc
6 changed files with 124 additions and 5 deletions

View File

@ -624,7 +624,7 @@ AC_HEADER_STDBOOL
# Check for these functions...
AC_CHECK_FUNCS([strlcat strerror snprintf strchr strrchr strcat strcpy \
strdup strcasecmp strtod strtoll strtoull strstr \
mkstemp rand \
mkstemp rand memcmp \
getrlimit gettimeofday fsync MPI_Comm_f2c])
# Does the user want to use NC_DISKLESS?

View File

@ -37,7 +37,7 @@ TARGET_LINK_LIBRARIES(ncdump netcdf ${ALL_TLL_LIBS})
TARGET_LINK_LIBRARIES(nccopy netcdf ${ALL_TLL_LIBS})
IF(ENABLE_TESTS)
# rewrite-scalar and bom are separate test helpers, each with its own main();
# they must be built as two executables. tst_bom.sh invokes ./bom directly.
ADD_EXECUTABLE(rewrite-scalar rewrite-scalar.c)
# bom only writes BOM bytes to stdout via stdio, so it is not linked to netcdf.
ADD_EXECUTABLE(bom bom.c)
TARGET_LINK_LIBRARIES(rewrite-scalar netcdf)
# Base tests
# The tests are set up as a combination of shell scripts and executables that
@ -58,8 +58,8 @@ IF(ENABLE_TESTS)
add_sh_test(ncdump tst_charfill)
add_sh_test(ncdump tst_iter)
add_sh_test(ncdump tst_formatx3)
add_sh_test(ncdump tst_bom)
IF(EXTRA_TESTS)
add_sh_test(ncdump run_back_comp_tests)
ENDIF()

View File

@ -28,10 +28,10 @@ man_MANS = ncdump.1 nccopy.1
if BUILD_TESTSETS
#if !BUILD_DLL
# These tests are run for both netCDF-4 and non-netCDF-4 builds.
check_PROGRAMS = rewrite-scalar ctest ctest64 ncdump tst_utf8
check_PROGRAMS = rewrite-scalar ctest ctest64 ncdump tst_utf8 bom
TESTS = run_tests.sh tst_64bit.sh ctest ctest64 tst_output.sh \
tst_lengths.sh tst_calendars.sh tst_utf8 run_utf8_tests.sh \
tst_nccopy3.sh tst_charfill.sh tst_iter.sh tst_formatx3.sh
tst_nccopy3.sh tst_charfill.sh tst_iter.sh tst_formatx3.sh tst_bom.sh
if LARGE_FILE_TESTS
TESTS += tst_iter.sh

33
ncdump/bom.c Normal file
View File

@ -0,0 +1,33 @@
/*********************************************************************
* Copyright 1993, UCAR/Unidata
* See netcdf/COPYRIGHT file for copying and redistribution conditions.
*********************************************************************/
#include <config.h>
#include <stdlib.h>
#include <stdio.h>
/* BOM Sequences (see http://www.unicode.org/faq/utf_bom.html#BOM).
   The UTF-32 marks contain NUL bytes, so these must always be used with
   an explicit byte count (3 for UTF-8, 2 for UTF-16, 4 for UTF-32);
   strlen() would stop at the first NUL. */
static char* U8 = "\xEF\xBB\xBF"; /* UTF-8 */
static char* BE32 = "\x00\x00\xFE\xFF"; /* UTF-32; big-endian */
static char* LE32 = "\xFF\xFE\x00\x00"; /* UTF-32; little-endian */
static char* BE16 = "\xFE\xFF"; /* UTF-16; big-endian */
static char* LE16 = "\xFF\xFE"; /* UTF-16; little-endian */
/*
 * Write a single byte-order mark to stdout and exit.
 *
 * The first character of the optional first argument selects the mark:
 *   '1' (e.g. "16") -> UTF-16 big-endian (2 bytes)
 *   '3' (e.g. "32") -> UTF-32 big-endian (4 bytes)
 *   anything else, or no argument -> UTF-8 (3 bytes)
 *
 * Exits with status 0 on success, EXIT_FAILURE if the mark could not be
 * written in full.
 */
int
main(int argc, char** argv)
{
    char* bom = U8;
    size_t bomlen = 3;

    if(argc > 1) {
        /* An empty argument has '\0' as its first character and therefore
           falls through to the default (UTF-8) case. */
        switch (argv[1][0]) {
        case '1': bom = BE16; bomlen = 2; break;
        case '3': bom = BE32; bomlen = 4; break; /* UTF-32 BOM is 4 bytes */
        default: break;
        }
    }

    /* The length is passed explicitly because the UTF-32 mark starts with
       NUL bytes. */
    if(fwrite(bom,1,bomlen,stdout) != bomlen) {
        exit(EXIT_FAILURE);
    }
    exit(0);
}

54
ncdump/tst_bom.sh Normal file
View File

@ -0,0 +1,54 @@
#!/bin/sh
# Check how ncgen treats a byte-order mark (BOM) at the start of a .cdl file:
#   1. A CDL file prefixed with a UTF-8 BOM must compile, and dumping the
#      resulting .nc file must reproduce the BOM-less CDL text.
#   2. A CDL file prefixed with a big-endian UTF-16 BOM must be rejected.
# Uses ./bom to emit the BOM bytes, plus ../ncgen/ncgen and ../ncdump/ncdump.

set -e

# Default srcdir to the directory holding this script when it is not set.
case "x$srcdir" in
    x) srcdir=`dirname $0` ;;
esac

# add hack for sunos
export srcdir

echo ""

# Remove leftovers first; the here-document below appends to tst_bom.cdl.
rm -f tst_bom.cdl tmp.cdl tst_bom8.* tst_bom16.*

# Reference CDL text without any BOM.
cat >>tst_bom.cdl <<EOF
netcdf tst_bom {
variables:
float f;
data:
f = 1;
}
EOF

# --- Part 1: UTF-8 BOM must be accepted ---
echo "*** Generate a cdl file with leading UTF-8 BOM."
{ ./bom 8 ; cat tst_bom.cdl ; } >tst_bom8.cdl

echo "*** Verify .nc file"
../ncgen/ncgen -k1 -o tst_bom8.nc tst_bom8.cdl
../ncdump/ncdump -n tst_bom tst_bom8.nc > tmp.cdl
# Whitespace-insensitive comparison against the BOM-less reference.
diff -w tst_bom.cdl tmp.cdl

# --- Part 2: Big-Endian UTF-16 BOM must be rejected ---
rm -f tmp.cdl tst_bom8.* tst_bom16.*

echo "*** Generate a cdl file with leading UTF-16 BOM."
{ ./bom 16 ; cat tst_bom.cdl ; } >tst_bom16.cdl

echo "*** Verify UTF-16 file fails"
# Running ncgen as the 'if' condition keeps 'set -e' from aborting the
# script on the failure that is expected here.
if ../ncgen/ncgen -k1 -o tst_bom16.nc tst_bom16.cdl ; then
    echo 'BOM Big Endian 16 succeeded, but should not'
    exit 1
fi
echo '***XFAIL: BOM Big Endian 16'

# Cleanup
rm -f tst_bom.cdl tmp.cdl tst_bom8.* tst_bom16.*

exit 0

View File

@ -124,6 +124,13 @@ struct Languages legallanguages[] = {
};
#endif
/* BOM Sequences (see http://www.unicode.org/faq/utf_bom.html#BOM).
   The UTF-32 marks contain NUL bytes, so any comparison against these
   must use an explicit byte count (3 for UTF-8, 2 for UTF-16, 4 for
   UTF-32) rather than a NUL-terminated string function. */
static char* U8 = "\xEF\xBB\xBF"; /* UTF-8 */
static char* BE32 = "\x00\x00\xFE\xFF"; /* UTF-32; big-endian */
static char* LE32 = "\xFF\xFE\x00\x00"; /* UTF-32; little-endian */
static char* BE16 = "\xFE\xFF"; /* UTF-16; big-endian */
static char* LE16 = "\xFF\xFE"; /* UTF-16; little-endian */
/* The default minimum iterator size depends
on whether we are doing binary or language
based output.
@ -371,11 +378,36 @@ main(
fp = stdin;
if (argc > 0 && strcmp(argv[0], "-") != 0) {
char bom[4];
size_t count;
if ((fp = fopen(argv[0], "r")) == NULL) {
derror ("can't open file %s for reading: ", argv[0]);
perror("");
return(7);
}
/* Check the leading bytes for an occurrence of a BOM */
/* re: http://www.unicode.org/faq/utf_bom.html#BOM */
/* Attempt to read the first four bytes */
memset(bom,0,sizeof(bom));
count = fread(bom,1,2,fp);
if(count == 2) {
switch (bom[0]) {
case '\x00':
case '\xFF':
case '\xFE':
/* Only UTF-* is allowed; complain and exit */
fprintf(stderr,"Input file contains a BOM indicating a non-UTF8 encoding\n");
return 1;
case '\xEF':
/* skip the BOM */
fread(bom,1,1,fp);
break;
default: /* legal printable char, presumably; rewind */
rewind(fp);
break;
}
}
cdlname = (char*)emalloc(NC_MAX_NAME);
cdlname = nulldup(argv[0]);
if(strlen(cdlname) > NC_MAX_NAME) cdlname[NC_MAX_NAME] = '\0';