[NCF-293]

Allow .cdl files to have a leading utf-8 BOM. Also add test.
2025-03-25 17:40:27 +08:00 · 2014-03-07 22:52:40 -07:00 · 2014-03-07 22:52:40 -07:00 · baade3e4fc
commit baade3e4fc
parent deeca5fb83
6 changed files with 124 additions and 5 deletions
--- a/configure.ac
+++ b/configure.ac
@ -624,7 +624,7 @@ AC_HEADER_STDBOOL
 # Check for these functions...
 AC_CHECK_FUNCS([strlcat strerror snprintf strchr strrchr strcat strcpy \
                strdup strcasecmp strtod strtoll strtoull strstr \
-		mkstemp rand \
+		mkstemp rand memcmp \
 		getrlimit gettimeofday fsync MPI_Comm_f2c])

 # Does the user want to use NC_DISKLESS?
--- a/ncdump/CMakeLists.txt
+++ b/ncdump/CMakeLists.txt
@ -37,7 +37,9 @@ TARGET_LINK_LIBRARIES(ncdump netcdf ${ALL_TLL_LIBS})
 TARGET_LINK_LIBRARIES(nccopy netcdf ${ALL_TLL_LIBS})

 IF(ENABLE_TESTS)
 	ADD_EXECUTABLE(rewrite-scalar rewrite-scalar.c)
 	TARGET_LINK_LIBRARIES(rewrite-scalar netcdf)
+	# Helper run by tst_bom.sh as ./bom; must be its own target since bom.c defines main().
+	ADD_EXECUTABLE(bom bom.c)
 	# Base tests
 	# The tests are set up as a combination of shell scripts and executables that
@ -58,8 +60,8 @@ IF(ENABLE_TESTS)
 	add_sh_test(ncdump tst_charfill)
 	add_sh_test(ncdump tst_iter)
 	add_sh_test(ncdump tst_formatx3)
+	add_sh_test(ncdump tst_bom)
 	
-
 	IF(EXTRA_TESTS)
 		add_sh_test(ncdump run_back_comp_tests)
 	ENDIF()
--- a/ncdump/Makefile.am
+++ b/ncdump/Makefile.am
@ -28,10 +28,10 @@ man_MANS = ncdump.1 nccopy.1
 if BUILD_TESTSETS
 #if !BUILD_DLL
 # These tests are run for both netCDF-4 and non-netCDF-4 builds.
-check_PROGRAMS = rewrite-scalar ctest ctest64 ncdump tst_utf8
+check_PROGRAMS = rewrite-scalar ctest ctest64 ncdump tst_utf8 bom
 TESTS = run_tests.sh tst_64bit.sh ctest ctest64 tst_output.sh	\
 tst_lengths.sh tst_calendars.sh tst_utf8 run_utf8_tests.sh      \
-tst_nccopy3.sh tst_charfill.sh tst_iter.sh tst_formatx3.sh
+tst_nccopy3.sh tst_charfill.sh tst_iter.sh tst_formatx3.sh tst_bom.sh

 if LARGE_FILE_TESTS
 TESTS += tst_iter.sh
--- a/ncdump/bom.c
+++ b/ncdump/bom.c
@ -0,0 +1,33 @@
+/*********************************************************************
+ *   Copyright 1993, UCAR/Unidata
+ *   See netcdf/COPYRIGHT file for copying and redistribution conditions.
+ *********************************************************************/
+
+#include <config.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+/* BOM Sequences; write with an explicit length, the UTF-32 ones contain NUL bytes */
+static char* U8   = "\xEF\xBB\xBF";    /* UTF-8 */
+static char* BE32 = "\x00\x00\xFE\xFF"; /* UTF-32; big-endian */
+static char* LE32 = "\xFF\xFE\x00\x00"; /* UTF-32; little-endian (4 bytes, not the UTF-16 mark) */
+static char* BE16 = "\xFE\xFF";       /* UTF-16; big-endian */
+static char* LE16 = "\xFF\xFE";       /* UTF-16; little-endian */
+
+/* Write a BOM to stdout; argv[1][0] selects it: '1' UTF-16BE, '3' UTF-32BE, else UTF-8. */
+int main(int argc, char** argv)
+{
+    char* bom = U8;
+    int bomlen = 3;
+    if(argc > 1) { /* an empty argv[1] lands in the default case, so no strlen() is needed */
+	switch (argv[1][0]) {
+	case '1': bom = BE16; bomlen = 2; break;
+	case '3': bom = BE32; bomlen = 4; break; /* the UTF-32 BOM is four bytes, not two */
+	default: break;
+	}
+    }
+    /* Report a short write so the calling test script (set -e) stops instead of using a bad file. */
+    if(fwrite(bom,1,bomlen,stdout) != (size_t)bomlen) exit(1);
+    exit(0);
+}
+
--- a/ncdump/tst_bom.sh
+++ b/ncdump/tst_bom.sh
@ -0,0 +1,54 @@
+#!/bin/sh
+# This shell script tests BOM support in ncgen
+
+set -e
+
+if test "x$srcdir" = "x"; then
+    srcdir=`dirname $0`; 
+fi
+# add hack for sunos
+export srcdir;
+
+echo ""
+
+rm -f tst_bom.cdl tmp.cdl tst_bom8.* tst_bom16.*
+
+cat <<EOF >>tst_bom.cdl
+netcdf tst_bom {
+variables:
+  float f;
+data:
+
+  f = 1;
+}
+EOF
+
+echo "*** Generate a cdl file with leading UTF-8 BOM."
+./bom 8 >tst_bom8.cdl
+cat tst_bom.cdl >> tst_bom8.cdl
+
+echo "*** Verify .nc file"
+../ncgen/ncgen -k1 -o tst_bom8.nc tst_bom8.cdl
+../ncdump/ncdump -n tst_bom tst_bom8.nc > tmp.cdl
+diff -w tst_bom.cdl tmp.cdl
+
+# Do it again but with Big-Endian 16; should fail
+
+rm -f tmp.cdl tst_bom8.* tst_bom16.*
+
+echo "*** Generate a cdl file with leading UTF-16 BOM."
+./bom 16 >tst_bom16.cdl
+cat tst_bom.cdl >> tst_bom16.cdl
+
+echo "*** Verify UTF-16 file fails"
+if ../ncgen/ncgen -k1 -o tst_bom16.nc tst_bom16.cdl ; then
+echo 'BOM Big Endian 16 succeeded, but should not'
+exit 1
+else
+echo '***XFAIL: BOM Big Endian 16'
+fi
+
+# Cleanup
+rm -f tst_bom.cdl tmp.cdl tst_bom8.* tst_bom16.*
+
+exit 0
--- a/ncgen/main.c
+++ b/ncgen/main.c
@ -124,6 +124,13 @@ struct Languages legallanguages[] = {
 };
 #endif

+/* BOM Sequences; compare with an explicit length, the UTF-32 ones contain NUL bytes */
+static char* U8   = "\xEF\xBB\xBF";    /* UTF-8 */
+static char* BE32 = "\x00\x00\xFE\xFF"; /* UTF-32; big-endian */
+static char* LE32 = "\xFF\xFE\x00\x00"; /* UTF-32; little-endian (4 bytes, not the UTF-16 mark) */
+static char* BE16 = "\xFE\xFF";       /* UTF-16; big-endian */
+static char* LE16 = "\xFF\xFE";       /* UTF-16; little-endian */
+
 /* The default minimum iterator size depends
   on whether we are doing binary or language
   based output.
@ -371,11 +378,36 @@ main(

    fp = stdin;
    if (argc > 0 && strcmp(argv[0], "-") != 0) {
+	char bom[4];
+	size_t count;
 	if ((fp = fopen(argv[0], "r")) == NULL) {
 	    derror ("can't open file %s for reading: ", argv[0]);
 	    perror("");
 	    return(7);
 	}
+	/* Check the leading bytes for an occurrence of a BOM */
+	/* re: http://www.unicode.org/faq/utf_bom.html#BOM */
+	/* Read three bytes: the length of the UTF-8 BOM, the only one accepted. */
+	/* The UTF-16 BOMs and the first two bytes of the UTF-32 BOMs fit in the */
+	/* same read, which is enough to recognize and reject those encodings. */
+	memset(bom,0,sizeof(bom));
+	count = fread(bom,1,3,fp);
+	if(count == 3 && memcmp(bom,U8,3) == 0) {
+	    /* UTF-8 BOM: all three bytes are consumed; parsing starts after it */
+	} else if(count >= 2 && (memcmp(bom,BE16,2) == 0
+				 || memcmp(bom,LE16,2) == 0 /* also UTF-32 LE */
+				 || memcmp(bom,BE32,2) == 0)) {
+	    /* UTF-16 or UTF-32 input cannot be parsed as CDL. */
+	    /* Only UTF-8 is allowed; complain and exit */
+	    fprintf(stderr,"Input file contains a BOM indicating a non-UTF8 encoding\n");
+	    fclose(fp);
+	    return 1;
+	} else {
+	    /* No BOM, or fewer than two bytes were read: the bytes consumed */
+	    /* above belong to the CDL text, so start over from the beginning. */
+	    rewind(fp);
+	}
+
 	cdlname = (char*)emalloc(NC_MAX_NAME);
 	cdlname = nulldup(argv[0]);
 	if(strlen(cdlname) > NC_MAX_NAME) cdlname[NC_MAX_NAME] = '\0';