/* * Copyright 2017, University Corporation for Atmospheric Research * See netcdf/COPYRIGHT file for copying and redistribution conditions. */ #include "config.h" #ifdef HAVE_STDLIB_H #include <stdlib.h> #endif #ifdef HAVE_STRING_H #include <string.h> #endif #include "netcdf.h" #include "ncutf8.h" #include "utf8proc.h" /* Provide a wrapper around whatever utf8 library we use. */ /* * Check validity of a UTF8 encoded null-terminated byte string. * Return codes: * NC_NOERR -- string is valid utf8 * NC_ENOMEM -- out of memory * NC_EINVAL -- invalid argument or internal error * NC_EBADNAME-- not valid utf8 */ int nc_utf8_validate(const unsigned char* name) { int ncstat = NC_NOERR; const nc_utf8proc_uint8_t *str; nc_utf8proc_ssize_t nchars = -1; nc_utf8proc_int32_t codepoint; nc_utf8proc_ssize_t count; str = (const nc_utf8proc_uint8_t*)name; while(*str) { count = nc_utf8proc_iterate(str,nchars,&codepoint); if(count < 0) { switch (count) { case UTF8PROC_ERROR_NOMEM: case UTF8PROC_ERROR_OVERFLOW: ncstat = NC_ENOMEM; break; case UTF8PROC_ERROR_INVALIDOPTS: ncstat = NC_EINVAL; break; case UTF8PROC_ERROR_INVALIDUTF8: case UTF8PROC_ERROR_NOTASSIGNED: default: ncstat = NC_EBADNAME; break; } goto done; } else { /* move to next char */ str += count; } } done: return ncstat; } /* * Returns a pointer to newly allocated memory of a * normalized version of the null-terminated string 'str'. * Normalized string is returned in normalp argument; * caller must free. * Return codes: * NC_NOERR -- success * NC_ENOMEM -- out of memory * NC_EINVAL -- illegal argument or internal error * NC_EBADNAME -- other failure */ int nc_utf8_normalize(const unsigned char* utf8, unsigned char** normalp) { int ncstat = NC_NOERR; const nc_utf8proc_uint8_t* str = (const nc_utf8proc_uint8_t*)utf8; nc_utf8proc_uint8_t* retval = NULL; nc_utf8proc_ssize_t count; count = nc_utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE); if(count < 0) {/* error */ switch (count) { case UTF8PROC_ERROR_NOMEM: case UTF8PROC_ERROR_OVERFLOW: ncstat = NC_ENOMEM; break; case UTF8PROC_ERROR_INVALIDOPTS: ncstat = NC_EINVAL; break; case UTF8PROC_ERROR_INVALIDUTF8: case UTF8PROC_ERROR_NOTASSIGNED: default: ncstat = NC_EBADNAME; break; } goto done; } else if(normalp) *normalp = (unsigned char*)retval; done: return ncstat; } /* * Convert a normalized utf8 string to utf16. This is approximate * because it just does the truncation version of conversion for * each 32-bit codepoint to get the corresponding utf16. * Return codes: * NC_NOERR -- success * NC_ENOMEM -- out of memory * NC_EINVAL -- invalid argument or internal error * NC_EBADNAME-- not valid utf16 */ int nc_utf8_to_utf16(const unsigned char* s8, unsigned short** utf16p, size_t* len16p) { int ncstat = NC_NOERR; const nc_utf8proc_uint8_t *str; nc_utf8proc_ssize_t nchars = -1; nc_utf8proc_int32_t codepoint; nc_utf8proc_ssize_t count; size_t len8, len16; unsigned short* utf16; unsigned short* p16; len8 = strlen((char*)s8); utf16 = (unsigned short*)malloc(sizeof(unsigned short)*(len8+1)); if(utf16 == NULL) { ncstat = NC_ENOMEM; goto done; } str = (const nc_utf8proc_uint8_t*)s8; /* Walk the string and convert each codepoint */ p16 = utf16; len16 = 0; while(*str) { count = nc_utf8proc_iterate(str,nchars,&codepoint); if(count < 0) { switch (count) { case UTF8PROC_ERROR_NOMEM: case UTF8PROC_ERROR_OVERFLOW: ncstat = NC_ENOMEM; break; case UTF8PROC_ERROR_INVALIDOPTS: ncstat = NC_EINVAL; break; case UTF8PROC_ERROR_INVALIDUTF8: case UTF8PROC_ERROR_NOTASSIGNED: default: ncstat = NC_EBADNAME; break; } goto done; } else { /* move to next char */ /* Complain if top 16 bits not zero */ if((codepoint & 0x0000FFFF) != 0) { ncstat = NC_EBADNAME; goto done; } /* Truncate codepoint to 16 bits and store */ *p16++ = (unsigned short)(codepoint & 0x0000FFFF); str += count; len16++; } } *p16++ = (unsigned short)0; if(utf16p) *utf16p = utf16; if(len16p) *len16p = len16; done: if(ncstat) free(utf16); return ncstat; }