mirror of
https://github.com/Unidata/netcdf-c.git
synced 2025-03-25 17:40:27 +08:00
Resolves Github issue https://github.com/Unidata/netcdf-c/issues/349.
Update utf8proc.[ch] to use the version now maintained by the Julia Language project (https://github.com/JuliaLang/utf8proc/blob/master/LICENSE.md). The license for the previous version was unacceptable for the Debian and Ubuntu release systems. The new version both updates the code and addresses the license issue. It turns out that the utf8proc software we are using was turned over to the Julia Language developers and the license terms changed to allow modification. (https://github.com/JuliaLang/utf8proc/blob/master/LICENSE.md). So the fix here is as follows: 1. Wrap the library with a fixed interface: libdispatch/dutf8.c and include/ncutf8.h. 2. Replace the existing utf8proc code with the new version from https://github.com/JuliaLang/utf8proc. 3. Add a couple more test cases: nc_test/tst_utf8_validate.c and nc_test_utf8_phrases.c. If/when I can find a usable normalization test, I will incorporate that later.
This commit is contained in:
parent
581737b04e
commit
47daf33074
@ -11,6 +11,10 @@ This file contains a high-level description of this package's evolution. Release
|
||||
|
||||
## 4.4.2 - TBD
|
||||
|
||||
* [License] Update utf8proc.[ch] to use the version now maintained by the
|
||||
Julia Language project (https://github.com/JuliaLang/utf8proc/blob/master/LICENSE.md). The license for the previous version was unacceptable for the
|
||||
Debian and Ubuntu release systems. The new version both updates the code
|
||||
and addresses the license issue.
|
||||
* [Enhancement] Added support for HDF5 collective metadata operations when available. Patch submitted by Greg Sjaardema, see [Pull request #335](https://github.com/Unidata/netcdf-c/pull/335) for more information.
|
||||
* [Bug] Addressed a potential type punning issue. See [GitHub #351](https://github.com/Unidata/netcdf-c/issues/351) for more information.
|
||||
* [Bug] Addressed an issue where netCDF wouldn't build on Windows systems using MSVC 2012. See [GitHub #304](https://github.com/Unidata/netcdf-c/issues/304) for more information.
|
||||
|
2
cf
2
cf
@ -117,7 +117,7 @@ FLAGS="$FLAGS --disable-examples"
|
||||
#FLAGS="$FLAGS --disable-dap-remote-tests"
|
||||
FLAGS="$FLAGS --enable-dap-auth-tests"
|
||||
#FLAGS="$FLAGS --enable-doxygen"
|
||||
#FLAGS="$FLAGS --enable-logging"
|
||||
FLAGS="$FLAGS --enable-logging"
|
||||
#FLAGS="$FLAGS --disable-diskless"
|
||||
#FLAGS="$FLAGS --enable-mmap"
|
||||
#FLAGS="$FLAGS --with-udunits"
|
||||
|
@ -15,7 +15,7 @@ include_HEADERS += netcdf_mem.h
|
||||
endif
|
||||
|
||||
noinst_HEADERS = nc_logging.h nc_tests.h fbits.h nc.h \
|
||||
nclist.h ncuri.h utf8proc.h ncdispatch.h ncdimscale.h \
|
||||
nclist.h ncuri.h ncutf8.h ncdispatch.h ncdimscale.h \
|
||||
netcdf_f.h err_macros.h ncbytes.h nchashmap.h ceconstraints.h rnd.h \
|
||||
nclog.h ncconfigure.h nc4internal.h nctime.h nc3dispatch.h nc3internal.h \
|
||||
onstack.h nc_hashmap.h
|
||||
|
34
include/ncutf8.h
Normal file
34
include/ncutf8.h
Normal file
@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Copyright 2017, University Corporation for Atmospheric Research
|
||||
* See netcdf/COPYRIGHT file for copying and redistribution conditions.
|
||||
*/
|
||||
|
||||
#ifndef NCUTF8_H
|
||||
#define NCUTF8_H 1
|
||||
|
||||
/* Provide a wrapper around whatever utf8 library we use. */
|
||||
|
||||
/*
|
||||
* Check validity of a UTF8 encoded null-terminated byte string.
|
||||
* Return codes:
|
||||
* NC_NOERR -- string is valid utf8
|
||||
* NC_ENOMEM -- out of memory
|
||||
* NC_EINVAL -- invalid argument or internal error
|
||||
* NC_EBADNAME-- not valid utf8
|
||||
*/
|
||||
extern int nc_utf8_validate(const unsigned char * name);
|
||||
|
||||
/*
|
||||
* Apply NFC normalization to a string.
|
||||
* Returns a pointer to newly allocated memory of an NFC
|
||||
* normalized version of the null-terminated string 'str'.
|
||||
* Pointer to normalized string is returned in normalp argument;
|
||||
* caller must free.
|
||||
* Return codes:
|
||||
* NC_NOERR -- success
|
||||
* NC_ENOMEM -- out of memory
|
||||
* NC_EINVAL -- illegal argument or internal error
|
||||
* NC_EBADNAME -- other failure
|
||||
*/
|
||||
extern int nc_utf8_normalize(const unsigned char* str, unsigned char** normalp);
|
||||
|
||||
#endif /*NCUTF8_H*/
|
||||
|
@ -1,401 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* File name: utf8proc.h
|
||||
* Version: 1.1.1
|
||||
* Last changed: 2007-07-22
|
||||
* Changed 2008-05-16 by rkr to add config.h and replacement for stdbool.h
|
||||
* for pre-C99 compilers that don't support bool.
|
||||
* Changed 2008-06-05 by rkr to add utf8proc_check(str, options) function for
|
||||
* just checking UTF-8 validity
|
||||
* Description:
|
||||
* Header files for libutf8proc, which is a mapping tool for UTF-8 strings
|
||||
* with following features:
|
||||
* - decomposing and composing of strings
|
||||
* - replacing compatibility characters with their equivalents
|
||||
* - stripping of "default ignorable characters"
|
||||
* like SOFT-HYPHEN or ZERO-WIDTH-SPACE
|
||||
* - folding of certain characters for string comparison
|
||||
* (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-")
|
||||
* (see "LUMP" option)
|
||||
* - optional rejection of strings containing non-assigned code points
|
||||
* - stripping of control characters
|
||||
* - stripping of character marks (accents, etc.)
|
||||
* - transformation of LF, CRLF, CR and NEL to line-feed (LF)
|
||||
* or to the unicode characters for paragraph separation (PS)
|
||||
* or line separation (LS).
|
||||
* - unicode case folding (for case insensitive string comparisons)
|
||||
* - rejection of illegal UTF-8 data
|
||||
* (i.e. UTF-8 encoded UTF-16 surrogates)
|
||||
* - support for korean hangul characters
|
||||
* Unicode Version 5.0.0 is supported.
|
||||
*/
|
||||
|
||||
|
||||
#ifndef UTF8PROC_H
|
||||
#define UTF8PROC_H
|
||||
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#ifdef HAVE_STDBOOL_H
|
||||
#include <stdbool.h>
|
||||
#else
|
||||
# if ! HAVE__BOOL
|
||||
# ifdef __cplusplus
|
||||
typedef bool _Bool;
|
||||
# else
|
||||
typedef unsigned char _Bool;
|
||||
# endif
|
||||
# endif
|
||||
#ifndef _Bool
|
||||
# define bool _Bool
|
||||
#endif
|
||||
# define false 0
|
||||
# define true 1
|
||||
# define __bool_true_false_are_defined 1
|
||||
#endif
|
||||
#include <sys/types.h>
|
||||
#ifdef HAVE_INTTYPES_H
|
||||
#include <inttypes.h>
|
||||
#elif defined(_MSC_VER) && defined(HAVE_STDINT_H)
|
||||
#include <stdint.h>
|
||||
#else /* HAVE_INTTYPES_H */
|
||||
#include <pstdint.h>
|
||||
#endif /* HAVE_INTTYPES_H */
|
||||
#include <limits.h>
|
||||
|
||||
#ifndef HAVE_SSIZE_T
|
||||
#define ssize_t int
|
||||
#endif
|
||||
|
||||
#ifndef SSIZE_MAX
|
||||
#define SSIZE_MAX (SIZE_MAX/2)
|
||||
#endif
|
||||
|
||||
#define UTF8PROC_NULLTERM (1<<0)
|
||||
#define UTF8PROC_STABLE (1<<1)
|
||||
#define UTF8PROC_COMPAT (1<<2)
|
||||
#define UTF8PROC_COMPOSE (1<<3)
|
||||
#define UTF8PROC_DECOMPOSE (1<<4)
|
||||
#define UTF8PROC_IGNORE (1<<5)
|
||||
#define UTF8PROC_REJECTNA (1<<6)
|
||||
#define UTF8PROC_NLF2LS (1<<7)
|
||||
#define UTF8PROC_NLF2PS (1<<8)
|
||||
#define UTF8PROC_NLF2LF (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
|
||||
#define UTF8PROC_STRIPCC (1<<9)
|
||||
#define UTF8PROC_CASEFOLD (1<<10)
|
||||
#define UTF8PROC_CHARBOUND (1<<11)
|
||||
#define UTF8PROC_LUMP (1<<12)
|
||||
#define UTF8PROC_STRIPMARK (1<<13)
|
||||
/*
|
||||
* Flags being regarded by several functions in the library:
|
||||
* NULLTERM: The given UTF-8 input is NULL terminated.
|
||||
* STABLE: Unicode Versioning Stability has to be respected.
|
||||
* COMPAT: Compatibility decomposition
|
||||
* (i.e. formatting information is lost)
|
||||
* COMPOSE: Return a result with composed characters.
|
||||
* DECOMPOSE: Return a result with decomposed characters.
|
||||
* IGNORE: Strip "default ignorable characters"
|
||||
* REJECTNA: Return an error, if the input contains unassigned
|
||||
* code points.
|
||||
* NLF2LS: Indicating that NLF-sequences (LF, CRLF, CR, NEL) are
|
||||
* representing a line break, and should be converted to the
|
||||
* unicode character for line separation (LS).
|
||||
* NLF2PS: Indicating that NLF-sequences are representing a paragraph
|
||||
* break, and should be converted to the unicode character for
|
||||
* paragraph separation (PS).
|
||||
* NLF2LF: Indicating that the meaning of NLF-sequences is unknown.
|
||||
* STRIPCC: Strips and/or converts control characters.
|
||||
* NLF-sequences are transformed into space, except if one of
|
||||
* the NLF2LS/PS/LF options is given.
|
||||
* HorizontalTab (HT) and FormFeed (FF) are treated as a
|
||||
* NLF-sequence in this case.
|
||||
* All other control characters are simply removed.
|
||||
* CASEFOLD: Performs unicode case folding, to be able to do a
|
||||
* case-insensitive string comparison.
|
||||
* CHARBOUND: Inserts 0xFF bytes at the beginning of each sequence which
|
||||
* is representing a single grapheme cluster (see UAX#29).
|
||||
* LUMP: Lumps certain characters together
|
||||
* (e.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-").
|
||||
* (See lump.txt for details.)
|
||||
* If NLF2LF is set, this includes a transformation of
|
||||
* paragraph and line separators to ASCII line-feed (LF).
|
||||
* STRIPMARK: Strips all character markings
|
||||
* (non-spacing, spacing and enclosing) (i.e. accents)
|
||||
* NOTE: this option works only with COMPOSE or DECOMPOSE
|
||||
*/
|
||||
|
||||
#define UTF8PROC_ERROR_NOMEM -1
|
||||
#define UTF8PROC_ERROR_OVERFLOW -2
|
||||
#define UTF8PROC_ERROR_INVALIDUTF8 -3
|
||||
#define UTF8PROC_ERROR_NOTASSIGNED -4
|
||||
#define UTF8PROC_ERROR_INVALIDOPTS -5
|
||||
/*
|
||||
* Error codes being returned by almost all functions:
|
||||
* ERROR_NOMEM: Memory could not be allocated.
|
||||
* ERROR_OVERFLOW: The given string is too long to be processed.
|
||||
* ERROR_INVALIDUTF8: The given string is not a legal UTF-8 string.
|
||||
* ERROR_NOTASSIGNED: The REJECTNA flag was set,
|
||||
* and an unassigned code point was found.
|
||||
* ERROR_INVALIDOPTS: Invalid options have been used.
|
||||
*/
|
||||
|
||||
typedef int16_t utf8proc_propval_t;
|
||||
/*
 * Property record for a single Unicode code point; utf8proc_get_property()
 * returns a pointer to a constant instance of this struct.
 * NOTE(review): category, bidi_class and decomp_type presumably hold the
 * UTF8PROC_CATEGORY_*, UTF8PROC_BIDI_CLASS_* and UTF8PROC_DECOMP_TYPE_*
 * values defined in this header -- confirm against the generated data.
 */
typedef struct utf8proc_property_struct {
|
||||
utf8proc_propval_t category;
|
||||
utf8proc_propval_t combining_class;
|
||||
utf8proc_propval_t bidi_class;
|
||||
utf8proc_propval_t decomp_type;
|
||||
const int32_t *decomp_mapping;
|
||||
unsigned bidi_mirrored:1;
|
||||
int32_t uppercase_mapping;
|
||||
int32_t lowercase_mapping;
|
||||
int32_t titlecase_mapping;
|
||||
int32_t comb1st_index;
|
||||
int32_t comb2nd_index;
|
||||
unsigned comp_exclusion:1;
|
||||
unsigned ignorable:1;
|
||||
unsigned control_boundary:1;
|
||||
unsigned extend:1;
|
||||
const int32_t *casefold_mapping;
|
||||
} utf8proc_property_t;
|
||||
|
||||
#define UTF8PROC_CATEGORY_LU 1
|
||||
#define UTF8PROC_CATEGORY_LL 2
|
||||
#define UTF8PROC_CATEGORY_LT 3
|
||||
#define UTF8PROC_CATEGORY_LM 4
|
||||
#define UTF8PROC_CATEGORY_LO 5
|
||||
#define UTF8PROC_CATEGORY_MN 6
|
||||
#define UTF8PROC_CATEGORY_MC 7
|
||||
#define UTF8PROC_CATEGORY_ME 8
|
||||
#define UTF8PROC_CATEGORY_ND 9
|
||||
#define UTF8PROC_CATEGORY_NL 10
|
||||
#define UTF8PROC_CATEGORY_NO 11
|
||||
#define UTF8PROC_CATEGORY_PC 12
|
||||
#define UTF8PROC_CATEGORY_PD 13
|
||||
#define UTF8PROC_CATEGORY_PS 14
|
||||
#define UTF8PROC_CATEGORY_PE 15
|
||||
#define UTF8PROC_CATEGORY_PI 16
|
||||
#define UTF8PROC_CATEGORY_PF 17
|
||||
#define UTF8PROC_CATEGORY_PO 18
|
||||
#define UTF8PROC_CATEGORY_SM 19
|
||||
#define UTF8PROC_CATEGORY_SC 20
|
||||
#define UTF8PROC_CATEGORY_SK 21
|
||||
#define UTF8PROC_CATEGORY_SO 22
|
||||
#define UTF8PROC_CATEGORY_ZS 23
|
||||
#define UTF8PROC_CATEGORY_ZL 24
|
||||
#define UTF8PROC_CATEGORY_ZP 25
|
||||
#define UTF8PROC_CATEGORY_CC 26
|
||||
#define UTF8PROC_CATEGORY_CF 27
|
||||
#define UTF8PROC_CATEGORY_CS 28
|
||||
#define UTF8PROC_CATEGORY_CO 29
|
||||
#define UTF8PROC_CATEGORY_CN 30
|
||||
#define UTF8PROC_BIDI_CLASS_L 1
|
||||
#define UTF8PROC_BIDI_CLASS_LRE 2
|
||||
#define UTF8PROC_BIDI_CLASS_LRO 3
|
||||
#define UTF8PROC_BIDI_CLASS_R 4
|
||||
#define UTF8PROC_BIDI_CLASS_AL 5
|
||||
#define UTF8PROC_BIDI_CLASS_RLE 6
|
||||
#define UTF8PROC_BIDI_CLASS_RLO 7
|
||||
#define UTF8PROC_BIDI_CLASS_PDF 8
|
||||
#define UTF8PROC_BIDI_CLASS_EN 9
|
||||
#define UTF8PROC_BIDI_CLASS_ES 10
|
||||
#define UTF8PROC_BIDI_CLASS_ET 11
|
||||
#define UTF8PROC_BIDI_CLASS_AN 12
|
||||
#define UTF8PROC_BIDI_CLASS_CS 13
|
||||
#define UTF8PROC_BIDI_CLASS_NSM 14
|
||||
#define UTF8PROC_BIDI_CLASS_BN 15
|
||||
#define UTF8PROC_BIDI_CLASS_B 16
|
||||
#define UTF8PROC_BIDI_CLASS_S 17
|
||||
#define UTF8PROC_BIDI_CLASS_WS 18
|
||||
#define UTF8PROC_BIDI_CLASS_ON 19
|
||||
#define UTF8PROC_DECOMP_TYPE_FONT 1
|
||||
#define UTF8PROC_DECOMP_TYPE_NOBREAK 2
|
||||
#define UTF8PROC_DECOMP_TYPE_INITIAL 3
|
||||
#define UTF8PROC_DECOMP_TYPE_MEDIAL 4
|
||||
#define UTF8PROC_DECOMP_TYPE_FINAL 5
|
||||
#define UTF8PROC_DECOMP_TYPE_ISOLATED 6
|
||||
#define UTF8PROC_DECOMP_TYPE_CIRCLE 7
|
||||
#define UTF8PROC_DECOMP_TYPE_SUPER 8
|
||||
#define UTF8PROC_DECOMP_TYPE_SUB 9
|
||||
#define UTF8PROC_DECOMP_TYPE_VERTICAL 10
|
||||
#define UTF8PROC_DECOMP_TYPE_WIDE 11
|
||||
#define UTF8PROC_DECOMP_TYPE_NARROW 12
|
||||
#define UTF8PROC_DECOMP_TYPE_SMALL 13
|
||||
#define UTF8PROC_DECOMP_TYPE_SQUARE 14
|
||||
#define UTF8PROC_DECOMP_TYPE_FRACTION 15
|
||||
#define UTF8PROC_DECOMP_TYPE_COMPAT 16
|
||||
|
||||
extern const int8_t utf8proc_utf8class[256];
|
||||
|
||||
const char *utf8proc_errmsg(ssize_t errcode);
|
||||
/*
|
||||
* Returns a static error string for the given error code.
|
||||
*/
|
||||
|
||||
ssize_t utf8proc_iterate(const uint8_t *str, ssize_t strlen, int32_t *dst);
|
||||
/*
|
||||
* Reads a single char from the UTF-8 sequence being pointed to by 'str'.
|
||||
* The maximum number of bytes read is 'strlen', unless 'strlen' is
|
||||
* negative.
|
||||
* If a valid unicode char could be read, it is stored in the variable
|
||||
* being pointed to by 'dst', otherwise that variable will be set to -1.
|
||||
* In case of success the number of bytes read is returned, otherwise a
|
||||
* negative error code is returned.
|
||||
*/
|
||||
|
||||
bool utf8proc_codepoint_valid(int32_t uc);
|
||||
/*
|
||||
* Returns 1, if the given unicode code-point is valid, otherwise 0.
|
||||
*/
|
||||
|
||||
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst);
|
||||
/*
|
||||
* Encodes the unicode char with the code point 'uc' as an UTF-8 string in
|
||||
* the byte array being pointed to by 'dst'. This array has to be at least
|
||||
* 4 bytes long.
|
||||
* In case of success the number of bytes written is returned,
|
||||
* otherwise 0.
|
||||
* This function does not check if 'uc' is a valid unicode code point.
|
||||
*/
|
||||
|
||||
const utf8proc_property_t *utf8proc_get_property(int32_t uc);
|
||||
/*
|
||||
* Returns a pointer to a (constant) struct containing information about
|
||||
* the unicode char with the given code point 'uc'.
|
||||
* If the character is not existent a pointer to a special struct is
|
||||
* returned, where 'category' is a NULL pointer.
|
||||
* WARNING: The parameter 'uc' has to be in the range of 0x0000 to
|
||||
* 0x10FFFF, otherwise the program might crash!
|
||||
*/
|
||||
|
||||
ssize_t utf8proc_decompose_char(
|
||||
int32_t uc, int32_t *dst, ssize_t bufsize,
|
||||
int options, int *last_boundclass
|
||||
);
|
||||
/*
|
||||
* Writes a decomposition of the unicode char 'uc' into the array being
|
||||
* pointed to by 'dst'.
|
||||
* Following flags in the 'options' field are regarded:
|
||||
* REJECTNA: an unassigned unicode code point leads to an error
|
||||
* IGNORE: "default ignorable" chars are stripped
|
||||
* CASEFOLD: unicode casefolding is applied
|
||||
* COMPAT: replace certain characters with their
|
||||
* compatibility decomposition
|
||||
* CHARBOUND: Inserts 0xFF bytes before each grapheme cluster
|
||||
* LUMP: lumps certain different characters together
|
||||
* STRIPMARK: removes all character marks
|
||||
* The pointer 'last_boundclass' has to point to an integer variable which
|
||||
* is storing the last character boundary class, if the CHARBOUND option
|
||||
* is used.
|
||||
* In case of success the number of chars written is returned,
|
||||
* in case of an error, a negative error code is returned.
|
||||
* If the number of written chars would be bigger than 'bufsize',
|
||||
* the buffer (up to 'bufsize') has unpredictable data, and the needed
|
||||
* buffer size is returned.
|
||||
* WARNING: The parameter 'uc' has to be in the range of 0x0000 to
|
||||
* 0x10FFFF, otherwise the program might crash!
|
||||
*/
|
||||
|
||||
ssize_t utf8proc_decompose(
|
||||
const uint8_t *str, ssize_t strlen,
|
||||
int32_t *buffer, ssize_t bufsize, int options
|
||||
);
|
||||
/*
|
||||
* Does the same as 'utf8proc_decompose_char', but acts on a whole UTF-8
|
||||
* string, and orders the decomposed sequences correctly.
|
||||
* If the NULLTERM flag in 'options' is set, processing will be stopped,
|
||||
* when a NULL byte is encountered, otherwise 'strlen' bytes are processed.
|
||||
* The result in form of unicode code points is written into the buffer
|
||||
* being pointed to by 'buffer', having the length of 'bufsize' entries.
|
||||
* In case of success the number of chars written is returned,
|
||||
* in case of an error, a negative error code is returned.
|
||||
* If the number of written chars would be bigger than 'bufsize',
|
||||
* the buffer (up to 'bufsize') has unpredictable data, and the needed
|
||||
* buffer size is returned.
|
||||
*/
|
||||
|
||||
ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options);
|
||||
/*
|
||||
* Reencodes the sequence of unicode characters given by the pointer
|
||||
* 'buffer' and 'length' as UTF-8.
|
||||
* The result is stored in the same memory area where the data is read.
|
||||
* Following flags in the 'options' field are regarded:
|
||||
* NLF2LS: converts LF, CRLF, CR and NEL into LS
|
||||
* NLF2PS: converts LF, CRLF, CR and NEL into PS
|
||||
* NLF2LF: converts LF, CRLF, CR and NEL into LF
|
||||
* STRIPCC: strips or converts all non-affected control characters
|
||||
* COMPOSE: tries to combine decomposed characters into composite
|
||||
* characters
|
||||
* STABLE: prohibits combining characters which would violate
|
||||
* the unicode versioning stability
|
||||
* In case of success the length of the resulting UTF-8 string is
|
||||
* returned, otherwise a negative error code is returned.
|
||||
* WARNING: The amount of free space being pointed to by 'buffer', has to
|
||||
* exceed the amount of the input data by one byte, and the
|
||||
* entries of the array pointed to by 'str' have to be in the
|
||||
* range of 0x0000 to 0x10FFFF, otherwise the program might
|
||||
* crash!
|
||||
*/
|
||||
|
||||
ssize_t utf8proc_map(
|
||||
const uint8_t *str, ssize_t strlen, uint8_t **dstptr, int options
|
||||
);
|
||||
/*
|
||||
* Maps the given UTF-8 string being pointed to by 'str' to a new UTF-8
|
||||
* string, which is allocated dynamically, and afterwards pointed to by
|
||||
* the pointer being pointed to by 'dstptr'.
|
||||
* If the NULLTERM flag in the 'options' field is set, the length is
|
||||
* determined by a NULL terminator, otherwise the parameter 'strlen' is
|
||||
* evaluated to determine the string length, but in any case the result
|
||||
* will be NULL terminated (though it might contain NULL characters
|
||||
* before). Other flags in the 'options' field are passed to the functions
|
||||
* defined above, and regarded as described.
|
||||
* In case of success the length of the new string is returned,
|
||||
* otherwise a negative error code is returned.
|
||||
* NOTICE: The memory of the new UTF-8 string will have been allocated with
|
||||
* 'malloc', and has therefore to be freed with 'free'.
|
||||
*/
|
||||
|
||||
uint8_t *utf8proc_NFD(const uint8_t *str);
|
||||
uint8_t *utf8proc_NFC(const uint8_t *str);
|
||||
uint8_t *utf8proc_NFKD(const uint8_t *str);
|
||||
uint8_t *utf8proc_NFKC(const uint8_t *str);
|
||||
/*
|
||||
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
||||
* normalized version of the null-terminated string 'str'.
|
||||
*/
|
||||
|
||||
ssize_t utf8proc_check(const uint8_t *str);
|
||||
/*
|
||||
* Just checks UTF-8 string for validity, returns 0 if valid or one of
|
||||
* the negative UTF8PROC_ERROR_* codes if invalid or memory exhausted
|
||||
* checking. Assumes null-terminated string str and UTF8PROC_STABLE
|
||||
* option.
|
||||
*/
|
||||
|
||||
#endif
|
||||
|
@ -1,4 +1,5 @@
|
||||
SET(libdispatch_SOURCES dparallel.c dcopy.c dfile.c ddim.c datt.c dattinq.c dattput.c dattget.c derror.c dvar.c dvarget.c dvarput.c dvarinq.c ddispatch.c nclog.c dstring.c dutf8proc.c ncuri.c nclist.c ncbytes.c nchashmap.c nctime.c dinternal.c nc.c nclistmgr.c)
|
||||
SET(libdispatch_SOURCES dparallel.c dcopy.c dfile.c ddim.c datt.c dattinq.c dattput.c dattget.c derror.c dvar.c dvarget.c dvarput.c dvarinq.c ddispatch.c nclog.c dstring.c dutf8.c ncuri.c nclist.c ncbytes.c nchashmap.c nctime.c dinternal.c nc.c nclistmgr.c
|
||||
utf8proc.c)
|
||||
|
||||
IF(USE_NETCDF4)
|
||||
SET(libdispatch_SOURCES ${libdispatch_SOURCES} dgroup.c dvlen.c dcompound.c dtype.c denum.c dopaque.c ncaux.c)
|
||||
|
@ -17,11 +17,14 @@ libdispatch_la_CPPFLAGS = ${AM_CPPFLAGS}
|
||||
# The source files.
|
||||
libdispatch_la_SOURCES = dparallel.c dcopy.c dfile.c ddim.c datt.c \
|
||||
dattinq.c dattput.c dattget.c derror.c dvar.c dvarget.c dvarput.c \
|
||||
dvarinq.c dinternal.c ddispatch.c \
|
||||
nclog.c dstring.c dutf8proc.c utf8proc_data.h \
|
||||
dvarinq.c dinternal.c ddispatch.c dutf8.c \
|
||||
nclog.c dstring.c \
|
||||
ncuri.c nclist.c ncbytes.c nchashmap.c nctime.c \
|
||||
nc.c nclistmgr.c
|
||||
|
||||
# Add the utf8 codebase
|
||||
libdispatch_la_SOURCES += utf8proc.c utf8proc.h
|
||||
|
||||
# Add functions only found in netCDF-4.
|
||||
if USE_NETCDF4
|
||||
libdispatch_la_SOURCES += dgroup.c dvlen.c dcompound.c dtype.c denum.c \
|
||||
@ -40,7 +43,7 @@ libnetcdf2_la_SOURCES = dv2i.c
|
||||
libnetcdf2_la_CPPFLAGS = ${AM_CPPFLAGS} -DDLL_EXPORT
|
||||
endif # BUILD_V2
|
||||
|
||||
EXTRA_DIST=CMakeLists.txt ncsettings.hdr
|
||||
EXTRA_DIST=CMakeLists.txt ncsettings.hdr utf8proc_data.c
|
||||
|
||||
# Build ncsettings.c as follows:
|
||||
# 1. copy ncsettings.hdr to ncsettings.c
|
||||
|
@ -12,7 +12,7 @@
|
||||
#include <assert.h>
|
||||
#include "nc.h"
|
||||
#include "rnd.h"
|
||||
#include "utf8proc.h"
|
||||
#include "ncutf8.h"
|
||||
|
||||
|
||||
/* There are 3 levels of UTF8 checking: 1=> (exact)validating 2=>relaxed
|
||||
@ -171,7 +171,7 @@ NC_check_name(const char *name)
|
||||
int skip;
|
||||
int ch;
|
||||
const char *cp = name;
|
||||
ssize_t utf8_stat;
|
||||
int stat;
|
||||
|
||||
assert(name != NULL);
|
||||
|
||||
@ -180,8 +180,8 @@ NC_check_name(const char *name)
|
||||
goto fail;
|
||||
|
||||
/* check validity of any UTF-8 */
|
||||
utf8_stat = utf8proc_check((const unsigned char *)name);
|
||||
if (utf8_stat < 0)
|
||||
stat = nc_utf8_validate((const unsigned char *)name);
|
||||
if (stat != NC_NOERR)
|
||||
goto fail;
|
||||
|
||||
/* First char must be [a-z][A-Z][0-9]_ | UTF8 */
|
||||
|
98
libdispatch/dutf8.c
Normal file
98
libdispatch/dutf8.c
Normal file
@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Copyright 2017, University Corporation for Atmospheric Research
|
||||
* See netcdf/COPYRIGHT file for copying and redistribution conditions.
|
||||
*/
|
||||
|
||||
#include "config.h"
|
||||
#include <stdlib.h>
|
||||
#include "netcdf.h"
|
||||
|
||||
#include "ncutf8.h"
|
||||
|
||||
#include "utf8proc.h"
|
||||
|
||||
/* Provide a wrapper around whatever utf8 library we use. */
|
||||
|
||||
/*
|
||||
* Check validity of a UTF8 encoded null-terminated byte string.
|
||||
* Return codes:
|
||||
* NC_NOERR -- string is valid utf8
|
||||
* NC_ENOMEM -- out of memory
|
||||
* NC_EINVAL -- invalid argument or internal error
|
||||
* NC_EBADNAME-- not valid utf8
|
||||
*/
|
||||
|
||||
int
|
||||
nc_utf8_validate(const unsigned char* name)
|
||||
{
|
||||
int ncstat = NC_NOERR;
|
||||
const utf8proc_uint8_t *str;
|
||||
utf8proc_ssize_t strlen = -1;
|
||||
utf8proc_int32_t codepoint;
|
||||
utf8proc_ssize_t count;
|
||||
|
||||
str = (const utf8proc_uint8_t*)name;
|
||||
while(*str) {
|
||||
count = utf8proc_iterate(str,strlen,&codepoint);
|
||||
if(count < 0) {
|
||||
switch (count) {
|
||||
case UTF8PROC_ERROR_NOMEM:
|
||||
case UTF8PROC_ERROR_OVERFLOW:
|
||||
ncstat = NC_ENOMEM;
|
||||
break;
|
||||
case UTF8PROC_ERROR_INVALIDOPTS:
|
||||
ncstat = NC_EINVAL;
|
||||
break;
|
||||
case UTF8PROC_ERROR_INVALIDUTF8:
|
||||
case UTF8PROC_ERROR_NOTASSIGNED:
|
||||
default:
|
||||
ncstat = NC_EBADNAME;
|
||||
break;
|
||||
}
|
||||
goto done;
|
||||
} else { /* move to next char */
|
||||
str += count;
|
||||
}
|
||||
}
|
||||
done:
|
||||
return ncstat;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns a pointer to newly allocated memory of a
|
||||
* normalized version of the null-terminated string 'str'.
|
||||
* Normalized string is returned in normalp argument;
|
||||
* caller must free.
|
||||
* Return codes:
|
||||
* NC_NOERR -- success
|
||||
* NC_ENOMEM -- out of memory
|
||||
* NC_EINVAL -- illegal argument or internal error
|
||||
* NC_EBADNAME -- other failure
|
||||
*/
|
||||
int
|
||||
nc_utf8_normalize(const unsigned char* utf8, unsigned char** normalp)
|
||||
{
|
||||
int ncstat = NC_NOERR;
|
||||
const utf8proc_uint8_t* str = (const utf8proc_uint8_t*)utf8;
|
||||
utf8proc_uint8_t* retval = NULL;
|
||||
utf8proc_ssize_t count;
|
||||
count = utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
|
||||
if(count < 0) {/* error */
|
||||
switch (count) {
|
||||
case UTF8PROC_ERROR_NOMEM:
|
||||
case UTF8PROC_ERROR_OVERFLOW:
|
||||
ncstat = NC_ENOMEM;
|
||||
break;
|
||||
case UTF8PROC_ERROR_INVALIDOPTS:
|
||||
ncstat = NC_EINVAL;
|
||||
break;
|
||||
case UTF8PROC_ERROR_INVALIDUTF8:
|
||||
case UTF8PROC_ERROR_NOTASSIGNED:
|
||||
default:
|
||||
ncstat = NC_EBADNAME;
|
||||
break;
|
||||
}
|
||||
goto done;
|
||||
} else
|
||||
if(normalp) *normalp = (unsigned char*)retval;
|
||||
done:
|
||||
return ncstat;
|
||||
}
|
||||
|
@ -1,590 +0,0 @@
|
||||
/*
|
||||
* Copyright (c) 2006-2007 Jan Behrens, FlexiGuided GmbH, Berlin
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This library contains derived data from a modified version of the
|
||||
* Unicode data files.
|
||||
*
|
||||
* The original data files are available at
|
||||
* http://www.unicode.org/Public/UNIDATA/
|
||||
*
|
||||
* Please notice the copyright statement in the file "utf8proc_data.c".
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* File name: utf8proc.c
|
||||
* Version: 1.1.1
|
||||
* Last changed: 2007-07-22
|
||||
*
|
||||
* Description:
|
||||
* Implementation of libutf8proc.
|
||||
*/
|
||||
|
||||
|
||||
#include "utf8proc.h"
|
||||
#include "utf8proc_data.h"
|
||||
|
||||
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence, giving the
 * total number of bytes in that sequence: 1 for 0x00-0x7F, 2 for 0xC0-0xDF,
 * 3 for 0xE0-0xEF and 4 for 0xF0-0xF7. Bytes that cannot start a sequence
 * (0x80-0xBF and 0xF8-0xFF) map to 0, which utf8proc_iterate() reports as
 * invalid UTF-8.
 */
const int8_t utf8proc_utf8class[256] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
#define UTF8PROC_HANGUL_SBASE 0xAC00
|
||||
#define UTF8PROC_HANGUL_LBASE 0x1100
|
||||
#define UTF8PROC_HANGUL_VBASE 0x1161
|
||||
#define UTF8PROC_HANGUL_TBASE 0x11A7
|
||||
#define UTF8PROC_HANGUL_LCOUNT 19
|
||||
#define UTF8PROC_HANGUL_VCOUNT 21
|
||||
#define UTF8PROC_HANGUL_TCOUNT 28
|
||||
#define UTF8PROC_HANGUL_NCOUNT 588
|
||||
#define UTF8PROC_HANGUL_SCOUNT 11172
|
||||
/*// END is exclusive*/
|
||||
#define UTF8PROC_HANGUL_L_START 0x1100
|
||||
#define UTF8PROC_HANGUL_L_END 0x115A
|
||||
#define UTF8PROC_HANGUL_L_FILLER 0x115F
|
||||
#define UTF8PROC_HANGUL_V_START 0x1160
|
||||
#define UTF8PROC_HANGUL_V_END 0x11A3
|
||||
#define UTF8PROC_HANGUL_T_START 0x11A8
|
||||
#define UTF8PROC_HANGUL_T_END 0x11FA
|
||||
#define UTF8PROC_HANGUL_S_START 0xAC00
|
||||
#define UTF8PROC_HANGUL_S_END 0xD7A4
|
||||
|
||||
|
||||
#define UTF8PROC_BOUNDCLASS_START 0
|
||||
#define UTF8PROC_BOUNDCLASS_OTHER 1
|
||||
#define UTF8PROC_BOUNDCLASS_CR 2
|
||||
#define UTF8PROC_BOUNDCLASS_LF 3
|
||||
#define UTF8PROC_BOUNDCLASS_CONTROL 4
|
||||
#define UTF8PROC_BOUNDCLASS_EXTEND 5
|
||||
#define UTF8PROC_BOUNDCLASS_L 6
|
||||
#define UTF8PROC_BOUNDCLASS_V 7
|
||||
#define UTF8PROC_BOUNDCLASS_T 8
|
||||
#define UTF8PROC_BOUNDCLASS_LV 9
|
||||
#define UTF8PROC_BOUNDCLASS_LVT 10
|
||||
|
||||
|
||||
const char *utf8proc_errmsg(ssize_t errcode) {
|
||||
switch (errcode) {
|
||||
case UTF8PROC_ERROR_NOMEM:
|
||||
return "Memory for processing UTF-8 data could not be allocated.";
|
||||
case UTF8PROC_ERROR_OVERFLOW:
|
||||
return "UTF-8 string is too long to be processed.";
|
||||
case UTF8PROC_ERROR_INVALIDUTF8:
|
||||
return "Invalid UTF-8 string";
|
||||
case UTF8PROC_ERROR_NOTASSIGNED:
|
||||
return "Unassigned Unicode code point found in UTF-8 string.";
|
||||
case UTF8PROC_ERROR_INVALIDOPTS:
|
||||
return "Invalid options for UTF-8 processing chosen.";
|
||||
default:
|
||||
return "An unknown error occurred while processing UTF-8 data.";
|
||||
}
|
||||
}
|
||||
|
||||
ssize_t utf8proc_iterate(
|
||||
const uint8_t *str, ssize_t slen, int32_t *dst
|
||||
) {
|
||||
int length;
|
||||
int i;
|
||||
int32_t uc = -1;
|
||||
*dst = -1;
|
||||
if (!slen) return 0;
|
||||
length = utf8proc_utf8class[str[0]];
|
||||
if (!length) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
if (slen >= 0 && length > slen) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
for (i=1; i<length; i++) {
|
||||
if ((str[i] & 0xC0) != 0x80) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
}
|
||||
switch (length) {
|
||||
case 1:
|
||||
uc = str[0];
|
||||
break;
|
||||
case 2:
|
||||
uc = ((str[0] & 0x1F) << 6) + (str[1] & 0x3F);
|
||||
if (uc < 0x80) uc = -1;
|
||||
break;
|
||||
case 3:
|
||||
uc = ((str[0] & 0x0F) << 12) + ((str[1] & 0x3F) << 6)
|
||||
+ (str[2] & 0x3F);
|
||||
if (uc < 0x800 || (uc >= 0xD800 && uc < 0xE000) ||
|
||||
(uc >= 0xFDD0 && uc < 0xFDF0)) uc = -1;
|
||||
break;
|
||||
case 4:
|
||||
uc = ((str[0] & 0x07) << 18) + ((str[1] & 0x3F) << 12)
|
||||
+ ((str[2] & 0x3F) << 6) + (str[3] & 0x3F);
|
||||
if (uc < 0x10000 || uc >= 0x110000) uc = -1;
|
||||
break;
|
||||
}
|
||||
if (uc < 0 || ((uc & 0xFFFF) >= 0xFFFE))
|
||||
return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
*dst = uc;
|
||||
return length;
|
||||
}
|
||||
|
||||
/* Return true when 'uc' lies in 0..0x10FFFF and is neither a surrogate
   (0xD800..0xDFFF), nor in 0xFDD0..0xFDEF, nor a value whose low 16 bits
   are 0xFFFE or 0xFFFF. */
bool utf8proc_codepoint_valid(int32_t uc) {
  if (uc < 0 || uc >= 0x110000) return false;
  if ((uc & 0xFFFF) >= 0xFFFE) return false;
  if (uc >= 0xD800 && uc < 0xE000) return false;
  if (uc >= 0xFDD0 && uc < 0xFDF0) return false;
  return true;
}
|
||||
|
||||
/* Write the UTF-8 encoding of 'uc' to 'dst' and return the number of bytes
   written (1..4).  Negative values and values >= 0x110000 write nothing and
   return 0.  The two values 0xFFFF and 0xFFFE are special-cased: they are
   emitted as the single bytes 0xFF and 0xFE respectively. */
ssize_t utf8proc_encode_char(int32_t uc, uint8_t *dst) {
  if (uc < 0x00 || uc >= 0x110000) return 0;

  if (uc < 0x80) {
    dst[0] = (uint8_t)uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = (uint8_t)(0xC0 + (uc >> 6));
    dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
    return 2;
  }
  if (uc == 0xFFFF) {
    dst[0] = 0xFF;
    return 1;
  }
  if (uc == 0xFFFE) {
    dst[0] = 0xFE;
    return 1;
  }
  if (uc < 0x10000) {
    dst[0] = (uint8_t)(0xE0 + (uc >> 12));
    dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
    return 3;
  }
  dst[0] = (uint8_t)(0xF0 + (uc >> 18));
  dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
  dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
  dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
  return 4;
}
|
||||
|
||||
const utf8proc_property_t *utf8proc_get_property(int32_t uc) {
|
||||
/* // ASSERT: uc >= 0 && uc < 0x110000*/
|
||||
return utf8proc_properties + (
|
||||
utf8proc_stage2table[
|
||||
utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#define utf8proc_decompose_lump(replacement_uc) \
|
||||
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
||||
options & ~UTF8PROC_LUMP, last_boundclass)
|
||||
|
||||
ssize_t utf8proc_decompose_char(int32_t uc, int32_t *dst, ssize_t bufsize,
|
||||
int options, int *last_boundclass) {
|
||||
/*// ASSERT: uc >= 0 && uc < 0x110000*/
|
||||
const utf8proc_property_t *property;
|
||||
utf8proc_propval_t category;
|
||||
int32_t hangul_sindex;
|
||||
property = utf8proc_get_property(uc);
|
||||
category = property->category;
|
||||
hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
|
||||
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
||||
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
|
||||
int32_t hangul_tindex;
|
||||
if (bufsize >= 1) {
|
||||
dst[0] = UTF8PROC_HANGUL_LBASE +
|
||||
hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
|
||||
if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
|
||||
(hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
|
||||
}
|
||||
hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
|
||||
if (!hangul_tindex) return 2;
|
||||
if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
if (options & UTF8PROC_REJECTNA) {
|
||||
if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
|
||||
}
|
||||
if (options & UTF8PROC_IGNORE) {
|
||||
if (property->ignorable) return 0;
|
||||
}
|
||||
if (options & UTF8PROC_LUMP) {
|
||||
if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
|
||||
if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
|
||||
utf8proc_decompose_lump(0x0027);
|
||||
if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
|
||||
utf8proc_decompose_lump(0x002D);
|
||||
if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
|
||||
if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
|
||||
if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
|
||||
utf8proc_decompose_lump(0x003C);
|
||||
if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
|
||||
utf8proc_decompose_lump(0x003E);
|
||||
if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
|
||||
if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
|
||||
utf8proc_decompose_lump(0x005E);
|
||||
if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
|
||||
utf8proc_decompose_lump(0x005F);
|
||||
if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
|
||||
if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
|
||||
if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
|
||||
if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
|
||||
if (category == UTF8PROC_CATEGORY_ZL ||
|
||||
category == UTF8PROC_CATEGORY_ZP)
|
||||
utf8proc_decompose_lump(0x000A);
|
||||
}
|
||||
}
|
||||
if (options & UTF8PROC_STRIPMARK) {
|
||||
if (category == UTF8PROC_CATEGORY_MN ||
|
||||
category == UTF8PROC_CATEGORY_MC ||
|
||||
category == UTF8PROC_CATEGORY_ME) return 0;
|
||||
}
|
||||
if (options & UTF8PROC_CASEFOLD) {
|
||||
if (property->casefold_mapping) {
|
||||
const int32_t *casefold_entry;
|
||||
ssize_t written = 0;
|
||||
for (casefold_entry = property->casefold_mapping;
|
||||
*casefold_entry >= 0; casefold_entry++) {
|
||||
written += utf8proc_decompose_char(*casefold_entry, dst+written,
|
||||
(bufsize > written) ? (bufsize - written) : 0, options,
|
||||
last_boundclass);
|
||||
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
|
||||
}
|
||||
return written;
|
||||
}
|
||||
}
|
||||
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
||||
if (property->decomp_mapping &&
|
||||
(!property->decomp_type || (options & UTF8PROC_COMPAT))) {
|
||||
const int32_t *decomp_entry;
|
||||
ssize_t written = 0;
|
||||
for (decomp_entry = property->decomp_mapping;
|
||||
*decomp_entry >= 0; decomp_entry++) {
|
||||
written += utf8proc_decompose_char(*decomp_entry, dst+written,
|
||||
(bufsize > written) ? (bufsize - written) : 0, options,
|
||||
last_boundclass);
|
||||
if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
|
||||
}
|
||||
return written;
|
||||
}
|
||||
}
|
||||
if (options & UTF8PROC_CHARBOUND) {
|
||||
bool boundary;
|
||||
int tbc, lbc;
|
||||
tbc =
|
||||
(uc == 0x000D) ? UTF8PROC_BOUNDCLASS_CR :
|
||||
(uc == 0x000A) ? UTF8PROC_BOUNDCLASS_LF :
|
||||
((category == UTF8PROC_CATEGORY_ZL ||
|
||||
category == UTF8PROC_CATEGORY_ZP ||
|
||||
category == UTF8PROC_CATEGORY_CC ||
|
||||
category == UTF8PROC_CATEGORY_CF) &&
|
||||
!(uc == 0x200C || uc == 0x200D)) ? UTF8PROC_BOUNDCLASS_CONTROL :
|
||||
property->extend ? UTF8PROC_BOUNDCLASS_EXTEND :
|
||||
((uc >= UTF8PROC_HANGUL_L_START && uc < UTF8PROC_HANGUL_L_END) ||
|
||||
uc == UTF8PROC_HANGUL_L_FILLER) ? UTF8PROC_BOUNDCLASS_L :
|
||||
(uc >= UTF8PROC_HANGUL_V_START && uc < UTF8PROC_HANGUL_V_END) ?
|
||||
UTF8PROC_BOUNDCLASS_V :
|
||||
(uc >= UTF8PROC_HANGUL_T_START && uc < UTF8PROC_HANGUL_T_END) ?
|
||||
UTF8PROC_BOUNDCLASS_T :
|
||||
(uc >= UTF8PROC_HANGUL_S_START && uc < UTF8PROC_HANGUL_S_END) ? (
|
||||
((uc-UTF8PROC_HANGUL_SBASE) % UTF8PROC_HANGUL_TCOUNT == 0) ?
|
||||
UTF8PROC_BOUNDCLASS_LV : UTF8PROC_BOUNDCLASS_LVT
|
||||
) :
|
||||
UTF8PROC_BOUNDCLASS_OTHER;
|
||||
lbc = *last_boundclass;
|
||||
boundary =
|
||||
(tbc == UTF8PROC_BOUNDCLASS_EXTEND) ? false :
|
||||
(lbc == UTF8PROC_BOUNDCLASS_START) ? true :
|
||||
(lbc == UTF8PROC_BOUNDCLASS_CR &&
|
||||
tbc == UTF8PROC_BOUNDCLASS_LF) ? false :
|
||||
(lbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
||||
(tbc == UTF8PROC_BOUNDCLASS_CONTROL) ? true :
|
||||
(lbc == UTF8PROC_BOUNDCLASS_L &&
|
||||
(tbc == UTF8PROC_BOUNDCLASS_L ||
|
||||
tbc == UTF8PROC_BOUNDCLASS_V ||
|
||||
tbc == UTF8PROC_BOUNDCLASS_LV ||
|
||||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false :
|
||||
((lbc == UTF8PROC_BOUNDCLASS_LV ||
|
||||
lbc == UTF8PROC_BOUNDCLASS_V) &&
|
||||
(tbc == UTF8PROC_BOUNDCLASS_V ||
|
||||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false :
|
||||
((lbc == UTF8PROC_BOUNDCLASS_LVT ||
|
||||
lbc == UTF8PROC_BOUNDCLASS_T) &&
|
||||
tbc == UTF8PROC_BOUNDCLASS_T) ? false :
|
||||
true;
|
||||
*last_boundclass = tbc;
|
||||
if (boundary) {
|
||||
if (bufsize >= 1) dst[0] = 0xFFFF;
|
||||
if (bufsize >= 2) dst[1] = uc;
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
if (bufsize >= 1) *dst = uc;
|
||||
return 1;
|
||||
}
|
||||
|
||||
ssize_t utf8proc_decompose(
|
||||
const uint8_t *str, ssize_t slen,
|
||||
int32_t *buffer, ssize_t bufsize, int options
|
||||
) {
|
||||
/*// slen will be ignored, if UTF8PROC_NULLTERM is set in options*/
|
||||
ssize_t wpos = 0;
|
||||
if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
|
||||
return UTF8PROC_ERROR_INVALIDOPTS;
|
||||
if ((options & UTF8PROC_STRIPMARK) &&
|
||||
!(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
|
||||
return UTF8PROC_ERROR_INVALIDOPTS;
|
||||
{
|
||||
int32_t uc;
|
||||
ssize_t rpos = 0;
|
||||
ssize_t decomp_result;
|
||||
int boundclass = UTF8PROC_BOUNDCLASS_START;
|
||||
while (1) {
|
||||
if (options & UTF8PROC_NULLTERM) {
|
||||
rpos += utf8proc_iterate(str + rpos, -1, &uc);
|
||||
/* checking of return value is not necessary,
|
||||
as 'uc' is < 0 in case of error. */
|
||||
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
|
||||
if (uc == 0) break;
|
||||
} else {
|
||||
if (rpos >= slen) break;
|
||||
rpos += utf8proc_iterate(str + rpos, slen - rpos, &uc);
|
||||
if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
}
|
||||
decomp_result = utf8proc_decompose_char(
|
||||
uc, buffer + wpos, (bufsize > wpos) ? (bufsize - wpos) : 0, options,
|
||||
&boundclass
|
||||
);
|
||||
if (decomp_result < 0) return decomp_result;
|
||||
wpos += decomp_result;
|
||||
/* // prohibiting integer overflows due to too long strings:*/
|
||||
if (wpos < 0 || wpos > SSIZE_MAX/sizeof(int32_t)/2)
|
||||
return UTF8PROC_ERROR_OVERFLOW;
|
||||
}
|
||||
}
|
||||
if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
|
||||
ssize_t pos = 0;
|
||||
while (pos < wpos-1) {
|
||||
int32_t uc1, uc2;
|
||||
const utf8proc_property_t *property1, *property2;
|
||||
uc1 = buffer[pos];
|
||||
uc2 = buffer[pos+1];
|
||||
property1 = utf8proc_get_property(uc1);
|
||||
property2 = utf8proc_get_property(uc2);
|
||||
if (property1->combining_class > property2->combining_class &&
|
||||
property2->combining_class > 0) {
|
||||
buffer[pos] = uc2;
|
||||
buffer[pos+1] = uc1;
|
||||
if (pos > 0) pos--; else pos++;
|
||||
} else {
|
||||
pos++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return wpos;
|
||||
}
|
||||
|
||||
ssize_t utf8proc_reencode(int32_t *buffer, ssize_t length, int options) {
|
||||
/* UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
|
||||
ASSERT: 'buffer' has one spare byte of free space at the end! */
|
||||
if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
|
||||
ssize_t rpos;
|
||||
ssize_t wpos = 0;
|
||||
int32_t uc;
|
||||
for (rpos = 0; rpos < length; rpos++) {
|
||||
uc = buffer[rpos];
|
||||
if (uc == 0x000D && rpos < length-1 && buffer[rpos+1] == 0x000A) rpos++;
|
||||
if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
|
||||
((options & UTF8PROC_STRIPCC) && (uc == 0x000B || uc == 0x000C))) {
|
||||
if (options & UTF8PROC_NLF2LS) {
|
||||
if (options & UTF8PROC_NLF2PS) {
|
||||
buffer[wpos++] = 0x000A;
|
||||
} else {
|
||||
buffer[wpos++] = 0x2028;
|
||||
}
|
||||
} else {
|
||||
if (options & UTF8PROC_NLF2PS) {
|
||||
buffer[wpos++] = 0x2029;
|
||||
} else {
|
||||
buffer[wpos++] = 0x0020;
|
||||
}
|
||||
}
|
||||
} else if ((options & UTF8PROC_STRIPCC) &&
|
||||
(uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
|
||||
if (uc == 0x0009) buffer[wpos++] = 0x0020;
|
||||
} else {
|
||||
buffer[wpos++] = uc;
|
||||
}
|
||||
}
|
||||
length = wpos;
|
||||
}
|
||||
if (options & UTF8PROC_COMPOSE) {
|
||||
int32_t *starter = NULL;
|
||||
int32_t current_char;
|
||||
const utf8proc_property_t *starter_property = NULL, *current_property;
|
||||
utf8proc_propval_t max_combining_class = -1;
|
||||
ssize_t rpos;
|
||||
ssize_t wpos = 0;
|
||||
int32_t composition;
|
||||
for (rpos = 0; rpos < length; rpos++) {
|
||||
current_char = buffer[rpos];
|
||||
current_property = utf8proc_get_property(current_char);
|
||||
if (starter && current_property->combining_class > max_combining_class) {
|
||||
/* // combination perhaps possible*/
|
||||
int32_t hangul_lindex;
|
||||
int32_t hangul_sindex;
|
||||
hangul_lindex = *starter - UTF8PROC_HANGUL_LBASE;
|
||||
if (hangul_lindex >= 0 && hangul_lindex < UTF8PROC_HANGUL_LCOUNT) {
|
||||
int32_t hangul_vindex;
|
||||
hangul_vindex = current_char - UTF8PROC_HANGUL_VBASE;
|
||||
if (hangul_vindex >= 0 && hangul_vindex < UTF8PROC_HANGUL_VCOUNT) {
|
||||
*starter = UTF8PROC_HANGUL_SBASE +
|
||||
(hangul_lindex * UTF8PROC_HANGUL_VCOUNT + hangul_vindex) *
|
||||
UTF8PROC_HANGUL_TCOUNT;
|
||||
starter_property = NULL;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
hangul_sindex = *starter - UTF8PROC_HANGUL_SBASE;
|
||||
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT &&
|
||||
(hangul_sindex % UTF8PROC_HANGUL_TCOUNT) == 0) {
|
||||
int32_t hangul_tindex;
|
||||
hangul_tindex = current_char - UTF8PROC_HANGUL_TBASE;
|
||||
if (hangul_tindex >= 0 && hangul_tindex < UTF8PROC_HANGUL_TCOUNT) {
|
||||
*starter += hangul_tindex;
|
||||
starter_property = NULL;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (!starter_property) {
|
||||
starter_property = utf8proc_get_property(*starter);
|
||||
}
|
||||
if (starter_property->comb1st_index >= 0 &&
|
||||
current_property->comb2nd_index >= 0) {
|
||||
composition = utf8proc_combinations[
|
||||
starter_property->comb1st_index +
|
||||
current_property->comb2nd_index
|
||||
];
|
||||
if (composition >= 0 && (!(options & UTF8PROC_STABLE) ||
|
||||
!(utf8proc_get_property(composition)->comp_exclusion))) {
|
||||
*starter = composition;
|
||||
starter_property = NULL;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
buffer[wpos] = current_char;
|
||||
if (current_property->combining_class) {
|
||||
if (current_property->combining_class > max_combining_class) {
|
||||
max_combining_class = current_property->combining_class;
|
||||
}
|
||||
} else {
|
||||
starter = buffer + wpos;
|
||||
starter_property = NULL;
|
||||
max_combining_class = -1;
|
||||
}
|
||||
wpos++;
|
||||
}
|
||||
length = wpos;
|
||||
}
|
||||
{
|
||||
ssize_t rpos, wpos = 0;
|
||||
int32_t uc;
|
||||
for (rpos = 0; rpos < length; rpos++) {
|
||||
uc = buffer[rpos];
|
||||
wpos += utf8proc_encode_char(uc, ((uint8_t *)buffer) + wpos);
|
||||
}
|
||||
((uint8_t *)buffer)[wpos] = 0;
|
||||
return wpos;
|
||||
}
|
||||
}
|
||||
|
||||
ssize_t utf8proc_map(
|
||||
const uint8_t *str, ssize_t slen, uint8_t **dstptr, int options
|
||||
) {
|
||||
int32_t *buffer;
|
||||
ssize_t result;
|
||||
*dstptr = NULL;
|
||||
result = utf8proc_decompose(str, slen, NULL, 0, options);
|
||||
if (result < 0) return result;
|
||||
buffer = (int32_t*)malloc(((size_t)result) * sizeof(int32_t) + 1);
|
||||
if (!buffer) return UTF8PROC_ERROR_NOMEM;
|
||||
result = utf8proc_decompose(str, slen, buffer, result, options);
|
||||
if (result < 0) {
|
||||
free(buffer);
|
||||
return result;
|
||||
}
|
||||
result = utf8proc_reencode(buffer, result, options);
|
||||
if (result < 0) {
|
||||
free(buffer);
|
||||
return result;
|
||||
}
|
||||
{
|
||||
int32_t *newptr;
|
||||
newptr = realloc(buffer, result+1);
|
||||
if (newptr) buffer = newptr;
|
||||
}
|
||||
*dstptr = (uint8_t *)buffer;
|
||||
return result;
|
||||
}
|
||||
|
||||
uint8_t *utf8proc_NFD(const uint8_t *str) {
|
||||
uint8_t *retval;
|
||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||
UTF8PROC_DECOMPOSE);
|
||||
return retval;
|
||||
}
|
||||
|
||||
uint8_t *utf8proc_NFC(const uint8_t *str) {
|
||||
uint8_t *retval;
|
||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||
UTF8PROC_COMPOSE);
|
||||
return retval;
|
||||
}
|
||||
|
||||
uint8_t *utf8proc_NFKD(const uint8_t *str) {
|
||||
uint8_t *retval;
|
||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||
UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
|
||||
return retval;
|
||||
}
|
||||
|
||||
uint8_t *utf8proc_NFKC(const uint8_t *str) {
|
||||
uint8_t *retval;
|
||||
utf8proc_map(str, 0, &retval, UTF8PROC_NULLTERM | UTF8PROC_STABLE |
|
||||
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
|
||||
return retval;
|
||||
}
|
||||
|
||||
ssize_t utf8proc_check(const uint8_t *str) {
|
||||
ssize_t result;
|
||||
result = utf8proc_decompose(str, 0, NULL, 0,
|
||||
UTF8PROC_NULLTERM | UTF8PROC_STABLE);
|
||||
return result;
|
||||
}
|
755
libdispatch/u/utf8proc.c
Normal file
755
libdispatch/u/utf8proc.c
Normal file
@ -0,0 +1,755 @@
|
||||
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
|
||||
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This library contains derived data from a modified version of the
|
||||
* Unicode data files.
|
||||
*
|
||||
* The original data files are available at
|
||||
* http://www.unicode.org/Public/UNIDATA/
|
||||
*
|
||||
* Please notice the copyright statement in the file "utf8proc_data.c".
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* File name: utf8proc.c
|
||||
*
|
||||
* Description:
|
||||
* Implementation of libutf8proc.
|
||||
*/
|
||||
|
||||
|
||||
#include "utf8proc.h"
|
||||
#include "utf8proc_data.c"
|
||||
|
||||
|
||||
/* Table indexed by byte value (16 entries per row).  The entries are 1 for
   0x00-0x7F, 0 for 0x80-0xBF and 0xF8-0xFF, 2 for 0xC0-0xDF, 3 for 0xE0-0xEF
   and 4 for 0xF0-0xF7 -- presumably the length of the UTF-8 sequence that a
   lead byte announces, with 0 for bytes that cannot start one. */
UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
  /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
/* Hangul syllable arithmetic: base code points of the precomposed syllable
   block (S) and of the L, V and T jamo, followed by the element counts. */
#define UTF8PROC_HANGUL_SBASE    0xAC00
#define UTF8PROC_HANGUL_LBASE    0x1100
#define UTF8PROC_HANGUL_VBASE    0x1161
#define UTF8PROC_HANGUL_TBASE    0x11A7
#define UTF8PROC_HANGUL_LCOUNT   19
#define UTF8PROC_HANGUL_VCOUNT   21
#define UTF8PROC_HANGUL_TCOUNT   28
#define UTF8PROC_HANGUL_NCOUNT   588    /* == VCOUNT * TCOUNT */
#define UTF8PROC_HANGUL_SCOUNT   11172  /* == LCOUNT * NCOUNT */

/* Code point ranges; every *_END value is exclusive. */
#define UTF8PROC_HANGUL_L_START  0x1100
#define UTF8PROC_HANGUL_L_END    0x115A
#define UTF8PROC_HANGUL_L_FILLER 0x115F
#define UTF8PROC_HANGUL_V_START  0x1160
#define UTF8PROC_HANGUL_V_END    0x11A3
#define UTF8PROC_HANGUL_T_START  0x11A8
#define UTF8PROC_HANGUL_T_END    0x11FA
#define UTF8PROC_HANGUL_S_START  0xAC00
#define UTF8PROC_HANGUL_S_END    0xD7A4
|
||||
|
||||
/* Should follow semantic-versioning rules (semver.org) based on API
|
||||
compatibility. (Note that the shared-library version number will
|
||||
be different, being based on ABI compatibility.): */
|
||||
#define STRINGIZEx(x) #x
|
||||
#define STRINGIZE(x) STRINGIZEx(x)
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
|
||||
return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
|
||||
}
|
||||
|
||||
/* Map a utf8proc error code to a human-readable English message.
   The returned pointer refers to a string literal and must not be freed;
   unrecognised codes yield a generic message. */
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
  const char *msg;

  switch (errcode) {
    case UTF8PROC_ERROR_NOMEM:
      msg = "Memory for processing UTF-8 data could not be allocated.";
      break;
    case UTF8PROC_ERROR_OVERFLOW:
      msg = "UTF-8 string is too long to be processed.";
      break;
    case UTF8PROC_ERROR_INVALIDUTF8:
      msg = "Invalid UTF-8 string";
      break;
    case UTF8PROC_ERROR_NOTASSIGNED:
      msg = "Unassigned Unicode code point found in UTF-8 string.";
      break;
    case UTF8PROC_ERROR_INVALIDOPTS:
      msg = "Invalid options for UTF-8 processing chosen.";
      break;
    default:
      msg = "An unknown error occurred while processing UTF-8 data.";
      break;
  }
  return msg;
}
|
||||
|
||||
#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
|
||||
) {
|
||||
utf8proc_uint32_t uc;
|
||||
const utf8proc_uint8_t *end;
|
||||
|
||||
*dst = -1;
|
||||
if (!strlen) return 0;
|
||||
end = str + ((strlen < 0) ? 4 : strlen);
|
||||
uc = *str++;
|
||||
if (uc < 0x80) {
|
||||
*dst = uc;
|
||||
return 1;
|
||||
}
|
||||
// Must be between 0xc2 and 0xf4 inclusive to be valid
|
||||
if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
if (uc < 0xe0) { // 2-byte sequence
|
||||
// Must have valid continuation character
|
||||
if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
*dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
|
||||
return 2;
|
||||
}
|
||||
if (uc < 0xf0) { // 3-byte sequence
|
||||
if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
|
||||
return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
// Check for surrogate chars
|
||||
if (uc == 0xed && *str > 0x9f)
|
||||
return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
|
||||
if (uc < 0x800)
|
||||
return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
*dst = uc;
|
||||
return 3;
|
||||
}
|
||||
// 4-byte sequence
|
||||
// Must have 3 valid continuation characters
|
||||
if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
|
||||
return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
// Make sure in correct range (0x10000 - 0x10ffff)
|
||||
if (uc == 0xf0) {
|
||||
if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
} else if (uc == 0xf4) {
|
||||
if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
}
|
||||
*dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
|
||||
return 4;
|
||||
}
|
||||
|
||||
/* Return non-zero when 'uc' lies in 0..0x10FFFF and is not a surrogate
   (0xD800..0xDFFF).  Negative inputs become large unsigned values and are
   therefore rejected by the upper-bound test. */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
  const utf8proc_uint32_t cp = (utf8proc_uint32_t)uc;
  return (cp < 0x110000) && ((cp - 0xd800) > 0x07ff);
}
|
||||
|
||||
/* Write the UTF-8 encoding of 'uc' to 'dst' and return the number of bytes
   written (1..4).  Negative values and values >= 0x110000 write nothing and
   return 0. */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
  if (uc < 0x00 || uc >= 0x110000) return 0;

  if (uc < 0x80) {
    dst[0] = (utf8proc_uint8_t) uc;
    return 1;
  }
  if (uc < 0x800) {
    dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
    dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 2;
  }
  if (uc < 0x10000) {
    /* Note: we allow encoding 0xd800-0xdfff here, so as not to change
       the API, however, these are actually invalid in UTF-8 */
    dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 3;
  }
  dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
  dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
  dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
  dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
  return 4;
}
|
||||
|
||||
/* internal "unsafe" version that does not check whether uc is in range */
|
||||
static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
|
||||
if (uc < 0x00) {
|
||||
return 0;
|
||||
} else if (uc < 0x80) {
|
||||
dst[0] = (utf8proc_uint8_t)uc;
|
||||
return 1;
|
||||
} else if (uc < 0x800) {
|
||||
dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
|
||||
dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||
return 2;
|
||||
} else if (uc == 0xFFFF) {
|
||||
dst[0] = (utf8proc_uint8_t)0xFF;
|
||||
return 1;
|
||||
} else if (uc == 0xFFFE) {
|
||||
dst[0] = (utf8proc_uint8_t)0xFE;
|
||||
return 1;
|
||||
} else if (uc < 0x10000) {
|
||||
dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
|
||||
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
||||
dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||
return 3;
|
||||
} else if (uc < 0x110000) {
|
||||
dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
|
||||
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
|
||||
dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
||||
dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||
return 4;
|
||||
} else return 0;
|
||||
}
|
||||
|
||||
/* internal "unsafe" version that does not check whether uc is in range */
|
||||
static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
|
||||
/* ASSERT: uc >= 0 && uc < 0x110000 */
|
||||
return utf8proc_properties + (
|
||||
utf8proc_stage2table[
|
||||
utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
/* Range-checked property lookup: code points outside 0..0x10FFFF are
   answered with the first entry of utf8proc_properties, all others are
   looked up through unsafe_get_property(). */
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
  if (uc < 0 || uc >= 0x110000) {
    return utf8proc_properties;
  }
  return unsafe_get_property(uc);
}
|
||||
|
||||
/* return whether there is a grapheme break between boundclasses lbc and tbc
|
||||
(according to the definition of extended grapheme clusters)
|
||||
|
||||
Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
|
||||
http://www.unicode.org/reports/tr29/tr29-29.html
|
||||
|
||||
CAVEATS:
|
||||
Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
|
||||
and GB 12/13 (regional indicator code points) require knowledge of previous characters
|
||||
and are thus not handled by this function. This may result in an incorrect break before
|
||||
an E_Modifier class codepoint and an incorrectly missing break between two
|
||||
REGIONAL_INDICATOR class code points if such support does not exist in the caller.
|
||||
|
||||
See the special support in grapheme_break_extended, for required bookkeeping by the caller.
|
||||
*/
|
||||
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
|
||||
return
|
||||
(lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
|
||||
(lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
|
||||
tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
|
||||
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
|
||||
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
|
||||
(lbc == UTF8PROC_BOUNDCLASS_L && // GB6
|
||||
(tbc == UTF8PROC_BOUNDCLASS_L || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_V || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_LV || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
|
||||
((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
|
||||
lbc == UTF8PROC_BOUNDCLASS_V) && // ---
|
||||
(tbc == UTF8PROC_BOUNDCLASS_V || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
|
||||
((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
|
||||
lbc == UTF8PROC_BOUNDCLASS_T) && // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
|
||||
(tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
|
||||
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
|
||||
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
|
||||
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
|
||||
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
|
||||
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
|
||||
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
|
||||
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
|
||||
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
|
||||
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
|
||||
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
|
||||
true; // GB999
|
||||
}
|
||||
|
||||
/* Stateful wrapper around grapheme_break_simple().

   When 'state' is non-NULL and holds something other than
   UTF8PROC_BOUNDCLASS_START, the stored class replaces 'lbc' for the rule
   evaluation.  Afterwards '*state' is updated for the next call.  With a
   NULL 'state' this is exactly grapheme_break_simple(lbc, tbc). */
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
{
  utf8proc_bool break_permitted;
  int effective_lbc = lbc;

  if (state && *state != UTF8PROC_BOUNDCLASS_START)
    effective_lbc = *state;
  break_permitted = grapheme_break_simple(effective_lbc, tbc);

  if (!state)
    return break_permitted;

  if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) {
    /* Special support for GB 12/13 made possible by GB999. After two RI
       class codepoints we want to force a break. Do this by resetting the
       second RI's bound class to UTF8PROC_BOUNDCLASS_OTHER, to force a break
       after that character according to GB999 (unless of course such a break
       is forbidden by a different rule such as GB9). */
    *state = UTF8PROC_BOUNDCLASS_OTHER;
  } else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
             tbc == UTF8PROC_BOUNDCLASS_EXTEND) {
    /* Special support for GB10. Fold any EXTEND codepoints into the previous
       boundclass if we're dealing with an emoji base boundclass. */
    *state = UTF8PROC_BOUNDCLASS_E_BASE;
  } else {
    *state = tbc;
  }
  return break_permitted;
}
|
||||
|
||||
/*
 * Tests whether a grapheme break is permitted between codepoints c1 and c2.
 * The bound class of each codepoint is looked up and handed, together with
 * the caller-supplied 'state' (may be NULL), to grapheme_break_extended().
 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
  const int left_class = utf8proc_get_property(c1)->boundclass;
  const int right_class = utf8proc_get_property(c2)->boundclass;

  return grapheme_break_extended(left_class, right_class, state);
}
|
||||
|
||||
|
||||
/*
 * Stateless variant of utf8proc_grapheme_break_stateful(): no state pointer
 * is supplied, so only the pairwise rules are evaluated.
 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2) {
  utf8proc_int32_t *const no_state = NULL;

  return utf8proc_grapheme_break_stateful(c1, c2, no_state);
}
|
||||
|
||||
/*
 * Decodes one codepoint from a table of 16-bit units.
 *
 * A unit in the range 0xD800..0xDFFF is combined with the unit that follows
 * it (low 10 bits of each, plus 0x10000); in that case '*entry' is advanced
 * by one so that it points at the second unit.  Otherwise the unit itself is
 * returned and '*entry' is left unchanged.
 */
static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
{
  const utf8proc_uint16_t *cursor = *entry;
  const utf8proc_int32_t lead = cursor[0];
  utf8proc_int32_t trail;

  if ((lead & 0xF800) != 0xD800)
    return lead;

  trail = cursor[1];
  *entry = cursor + 1;
  return 0x10000 + (((lead & 0x03FF) << 10) | (trail & 0x03FF));
}
|
||||
|
||||
static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
|
||||
{
|
||||
const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
|
||||
return seqindex_decode_entry(&entry);
|
||||
}
|
||||
|
||||
/*
 * Expands the sequence referenced by 'seqindex' and decomposes each of its
 * codepoints into 'dst'.
 *
 * The low 13 bits of 'seqindex' select the first entry in
 * utf8proc_sequences.  The top 3 bits hold the loop count (the loop body
 * runs count + 1 times); a value of 7 means the count is instead stored in
 * the first table entry, which is then skipped.
 *
 * Returns the total number of codepoints produced (which may exceed
 * 'bufsize'; nothing is written past 'bufsize'), or a negative error code.
 * An error reported by utf8proc_decompose_char() is returned unchanged
 * rather than being summed into the running count, where it could be masked
 * or misreported as an overflow.
 */
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
  utf8proc_ssize_t written = 0;
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
  int len = seqindex >> 13;

  if (len >= 7) {
    len = *entry;
    entry++;
  }

  for (; len >= 0; entry++, len--) {
    /* May advance 'entry' by one extra unit for a two-unit codepoint. */
    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
    utf8proc_ssize_t step;

    step = utf8proc_decompose_char(entry_cp, dst + written,
      (bufsize > written) ? (bufsize - written) : 0, options,
      last_boundclass);
    if (step < 0) return step;   /* propagate the callee's error code */

    written += step;
    if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
  }
  return written;
}
|
||||
|
||||
/* Returns the lower-case mapping of codepoint 'c', or 'c' itself when its
   property record has no lower-case mapping (index UINT16_MAX). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
  const utf8proc_uint16_t seq = utf8proc_get_property(c)->lowercase_seqindex;

  if (seq == UINT16_MAX)
    return c;
  return seqindex_decode_index(seq);
}
|
||||
|
||||
/* Returns the upper-case mapping of codepoint 'c', or 'c' itself when its
   property record has no upper-case mapping (index UINT16_MAX). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
  const utf8proc_uint16_t seq = utf8proc_get_property(c)->uppercase_seqindex;

  if (seq == UINT16_MAX)
    return c;
  return seqindex_decode_index(seq);
}
|
||||
|
||||
/* Returns the title-case mapping of codepoint 'c', or 'c' itself when its
   property record has no title-case mapping (index UINT16_MAX). */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
{
  const utf8proc_uint16_t seq = utf8proc_get_property(c)->titlecase_seqindex;

  if (seq == UINT16_MAX)
    return c;
  return seqindex_decode_index(seq);
}
|
||||
|
||||
/* return a character width analogous to wcwidth (except portable and
|
||||
hopefully less buggy than most system wcwidth functions). */
|
||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
|
||||
return utf8proc_get_property(c)->charwidth;
|
||||
}
|
||||
|
||||
/* Returns the Unicode general category stored in the property record of
   codepoint 'c'. */
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
  const utf8proc_property_t *prop = utf8proc_get_property(c);

  return prop->category;
}
|
||||
|
||||
/* Returns the two-letter abbreviation of the general category of codepoint
   'c'.  The table is indexed by the utf8proc_category_t value, so its order
   must match that enumeration. */
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
  static const char abbreviations[][3] = {
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co"
  };

  return abbreviations[utf8proc_category(c)];
}
|
||||
|
||||
/*
 * Helper for utf8proc_decompose_char(): returns from the enclosing function
 * with the decomposition of 'replacement_uc'.  UTF8PROC_LUMP is cleared for
 * the nested call, so the replacement itself is not lumped again.
 *
 * The expansion is a bare 'return' statement that refers to the enclosing
 * function's 'dst', 'bufsize', 'options' and 'last_boundclass'.
 */
#define utf8proc_decompose_lump(replacement_uc) \
  return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
                                 options & ~UTF8PROC_LUMP, last_boundclass)
|
||||
|
||||
/*
 * Decomposes a single codepoint 'uc' according to 'options' and stores the
 * result in 'dst' (at most 'bufsize' entries are written).
 *
 * Returns the number of codepoints the result consists of, which may be
 * larger than 'bufsize', or a negative error code.  'last_boundclass' is
 * read and updated only when UTF8PROC_CHARBOUND is set; in that case a
 * 0xFFFF marker is emitted in front of a codepoint that starts a new
 * grapheme cluster.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
  const int normalizing = (options & (UTF8PROC_COMPOSE | UTF8PROC_DECOMPOSE)) != 0;
  const utf8proc_property_t *prop;
  utf8proc_propval_t cat;
  utf8proc_int32_t s_index;

  if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;

  prop = unsafe_get_property(uc);
  cat = prop->category;
  s_index = uc - UTF8PROC_HANGUL_SBASE;

  /* Hangul syllables are decomposed arithmetically into two or three parts. */
  if (normalizing && s_index >= 0 && s_index < UTF8PROC_HANGUL_SCOUNT) {
    const utf8proc_int32_t t_index = s_index % UTF8PROC_HANGUL_TCOUNT;

    if (bufsize >= 1)
      dst[0] = UTF8PROC_HANGUL_LBASE + s_index / UTF8PROC_HANGUL_NCOUNT;
    if (bufsize >= 2)
      dst[1] = UTF8PROC_HANGUL_VBASE +
               (s_index % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
    if (!t_index) return 2;
    if (bufsize >= 3)
      dst[2] = UTF8PROC_HANGUL_TBASE + t_index;
    return 3;
  }

  if ((options & UTF8PROC_REJECTNA) && !cat)
    return UTF8PROC_ERROR_NOTASSIGNED;

  if ((options & UTF8PROC_IGNORE) && prop->ignorable)
    return 0;

  if (options & UTF8PROC_LUMP) {
    /* Each utf8proc_decompose_lump() returns from this function.  The
       order of the tests is kept exactly as in the original chain. */
    if (cat == UTF8PROC_CATEGORY_ZS) {
      utf8proc_decompose_lump(0x0020);
    }
    if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8) {
      utf8proc_decompose_lump(0x0027);
    }
    if (cat == UTF8PROC_CATEGORY_PD || uc == 0x2212) {
      utf8proc_decompose_lump(0x002D);
    }
    if (uc == 0x2044 || uc == 0x2215) {
      utf8proc_decompose_lump(0x002F);
    }
    if (uc == 0x2236) {
      utf8proc_decompose_lump(0x003A);
    }
    if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008) {
      utf8proc_decompose_lump(0x003C);
    }
    if (uc == 0x203A || uc == 0x232A || uc == 0x3009) {
      utf8proc_decompose_lump(0x003E);
    }
    if (uc == 0x2216) {
      utf8proc_decompose_lump(0x005C);
    }
    if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303) {
      utf8proc_decompose_lump(0x005E);
    }
    if (cat == UTF8PROC_CATEGORY_PC || uc == 0x02CD) {
      utf8proc_decompose_lump(0x005F);
    }
    if (uc == 0x02CB) {
      utf8proc_decompose_lump(0x0060);
    }
    if (uc == 0x2223) {
      utf8proc_decompose_lump(0x007C);
    }
    if (uc == 0x223C) {
      utf8proc_decompose_lump(0x007E);
    }
    /* Line and paragraph separators become LF only when both NLF2LS and
       NLF2PS are requested. */
    if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS) &&
        (cat == UTF8PROC_CATEGORY_ZL || cat == UTF8PROC_CATEGORY_ZP)) {
      utf8proc_decompose_lump(0x000A);
    }
  }

  if ((options & UTF8PROC_STRIPMARK) &&
      (cat == UTF8PROC_CATEGORY_MN ||
       cat == UTF8PROC_CATEGORY_MC ||
       cat == UTF8PROC_CATEGORY_ME))
    return 0;

  if ((options & UTF8PROC_CASEFOLD) &&
      prop->casefold_seqindex != UINT16_MAX)
    return seqindex_write_char_decomposed(prop->casefold_seqindex, dst, bufsize, options, last_boundclass);

  /* Compatibility decompositions (non-zero decomp_type) are applied only
     when UTF8PROC_COMPAT is set. */
  if (normalizing &&
      prop->decomp_seqindex != UINT16_MAX &&
      (!prop->decomp_type || (options & UTF8PROC_COMPAT)))
    return seqindex_write_char_decomposed(prop->decomp_seqindex, dst, bufsize, options, last_boundclass);

  if (options & UTF8PROC_CHARBOUND) {
    const int tbc = prop->boundclass;
    const utf8proc_bool at_boundary =
        grapheme_break_extended(*last_boundclass, tbc, last_boundclass);

    if (at_boundary) {
      if (bufsize >= 1) dst[0] = 0xFFFF;
      if (bufsize >= 2) dst[1] = uc;
      return 2;
    }
  }

  if (bufsize >= 1) *dst = uc;
  return 1;
}
|
||||
|
||||
/* Same as utf8proc_decompose_custom() with no custom mapping function. */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
) {
  return utf8proc_decompose_custom(str, strlen, buffer, bufsize, options,
                                   NULL,   /* custom_func */
                                   NULL);  /* custom_data */
}
|
||||
|
||||
/*
 * Decodes the UTF-8 string 'str', optionally maps every codepoint through
 * 'custom_func', decomposes it with utf8proc_decompose_char() and stores
 * the codepoints in 'buffer' (at most 'bufsize' entries are written).
 *
 * 'strlen' is ignored when UTF8PROC_NULLTERM is set in 'options'.
 * Returns the number of codepoints of the full result, which may exceed
 * 'bufsize', or a negative error code.  When composing or decomposing and
 * the result fits into the buffer, adjacent codepoints are reordered by
 * combining class.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
) {
  const int normalizing = (options & (UTF8PROC_COMPOSE | UTF8PROC_DECOMPOSE)) != 0;
  utf8proc_ssize_t in_pos = 0;
  utf8proc_ssize_t out_len = 0;
  int boundclass = UTF8PROC_BOUNDCLASS_START;

  /* COMPOSE and DECOMPOSE exclude each other; STRIPMARK needs one of them. */
  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
    return UTF8PROC_ERROR_INVALIDOPTS;
  if ((options & UTF8PROC_STRIPMARK) && !normalizing)
    return UTF8PROC_ERROR_INVALIDOPTS;

  for (;;) {
    utf8proc_int32_t uc;
    utf8proc_ssize_t produced;

    if (options & UTF8PROC_NULLTERM) {
      in_pos += utf8proc_iterate(str + in_pos, -1, &uc);
      /* checking of return value is not necessary,
         as 'uc' is < 0 in case of error */
      if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
      if (in_pos < 0) return UTF8PROC_ERROR_OVERFLOW;
      if (uc == 0) break;
    } else {
      if (in_pos >= strlen) break;
      in_pos += utf8proc_iterate(str + in_pos, strlen - in_pos, &uc);
      if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
    }

    if (custom_func != NULL)
      uc = custom_func(uc, custom_data);   /* user-specified custom mapping */

    produced = utf8proc_decompose_char(
      uc, buffer + out_len, (bufsize > out_len) ? (bufsize - out_len) : 0,
      options, &boundclass
    );
    if (produced < 0) return produced;
    out_len += produced;

    /* prohibiting integer overflows due to too long strings: */
    if (out_len < 0 ||
        out_len > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
      return UTF8PROC_ERROR_OVERFLOW;
  }

  if (normalizing && bufsize >= out_len) {
    /* Swap adjacent codepoints until no codepoint with a non-zero combining
       class is preceded by one with a larger class; after a swap, step back
       one position to re-check the earlier pair. */
    utf8proc_ssize_t pos = 0;

    while (pos < out_len - 1) {
      const utf8proc_int32_t first = buffer[pos];
      const utf8proc_int32_t second = buffer[pos + 1];
      const utf8proc_property_t *first_prop = unsafe_get_property(first);
      const utf8proc_property_t *second_prop = unsafe_get_property(second);

      if (first_prop->combining_class > second_prop->combining_class &&
          second_prop->combining_class > 0) {
        buffer[pos] = second;
        buffer[pos + 1] = first;
        if (pos > 0) pos--; else pos++;
      } else {
        pos++;
      }
    }
  }

  return out_len;
}
|
||||
|
||||
/*
 * Post-processes 'length' codepoints in 'buffer' in place and returns the
 * new length.
 *
 * Pass 1 (NLF2LS / NLF2PS / STRIPCC): converts newline sequences and strips
 * or converts control characters.
 * Pass 2 (COMPOSE): combines a starter with following codepoints, using
 * arithmetic composition for Hangul and utf8proc_combinations otherwise.
 *
 * UTF8PROC_NULLTERM is ignored here; 'length' is never ignored.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
    const int strip_cc = (options & UTF8PROC_STRIPCC) != 0;
    utf8proc_int32_t newline;
    utf8proc_ssize_t in;
    utf8proc_ssize_t out = 0;

    /* Replacement for a newline sequence: LS+PS -> LF, LS -> U+2028,
       PS -> U+2029, neither -> space. */
    if (options & UTF8PROC_NLF2LS)
      newline = (options & UTF8PROC_NLF2PS) ? 0x000A : 0x2028;
    else
      newline = (options & UTF8PROC_NLF2PS) ? 0x2029 : 0x0020;

    for (in = 0; in < length; in++) {
      const utf8proc_int32_t uc = buffer[in];

      /* CR LF counts as one newline: skip the LF. */
      if (uc == 0x000D && in < length - 1 && buffer[in + 1] == 0x000A) in++;

      if (uc == 0x000A || uc == 0x000D || uc == 0x0085 ||
          (strip_cc && (uc == 0x000B || uc == 0x000C))) {
        buffer[out++] = newline;
      } else if (strip_cc &&
                 (uc < 0x0020 || (uc >= 0x007F && uc < 0x00A0))) {
        /* Control characters are dropped, except TAB which becomes space. */
        if (uc == 0x0009) buffer[out++] = 0x0020;
      } else {
        buffer[out++] = uc;
      }
    }
    length = out;
  }

  if (options & UTF8PROC_COMPOSE) {
    utf8proc_int32_t *starter = NULL;
    const utf8proc_property_t *starter_prop = NULL;
    utf8proc_propval_t max_cc = -1;
    utf8proc_ssize_t in;
    utf8proc_ssize_t out = 0;

    for (in = 0; in < length; in++) {
      const utf8proc_int32_t cur = buffer[in];
      const utf8proc_property_t *cur_prop = unsafe_get_property(cur);

      if (starter && cur_prop->combining_class > max_cc) {
        /* combination perhaps possible */
        const utf8proc_int32_t l_index = *starter - UTF8PROC_HANGUL_LBASE;
        utf8proc_int32_t s_index;

        /* Hangul L + V -> LV syllable */
        if (l_index >= 0 && l_index < UTF8PROC_HANGUL_LCOUNT) {
          const utf8proc_int32_t v_index = cur - UTF8PROC_HANGUL_VBASE;

          if (v_index >= 0 && v_index < UTF8PROC_HANGUL_VCOUNT) {
            *starter = UTF8PROC_HANGUL_SBASE +
              (l_index * UTF8PROC_HANGUL_VCOUNT + v_index) *
              UTF8PROC_HANGUL_TCOUNT;
            starter_prop = NULL;
            continue;
          }
        }

        /* Hangul LV + T -> LVT syllable */
        s_index = *starter - UTF8PROC_HANGUL_SBASE;
        if (s_index >= 0 && s_index < UTF8PROC_HANGUL_SCOUNT &&
            (s_index % UTF8PROC_HANGUL_TCOUNT) == 0) {
          const utf8proc_int32_t t_index = cur - UTF8PROC_HANGUL_TBASE;

          if (t_index >= 0 && t_index < UTF8PROC_HANGUL_TCOUNT) {
            *starter += t_index;
            starter_prop = NULL;
            continue;
          }
        }

        /* Property of the starter is looked up lazily and dropped again
           whenever the starter changes. */
        if (!starter_prop)
          starter_prop = unsafe_get_property(*starter);

        if (starter_prop->comb_index < 0x8000 &&
            cur_prop->comb_index != UINT16_MAX &&
            cur_prop->comb_index >= 0x8000) {
          const int first_base = starter_prop->comb_index;
          int slot = (cur_prop->comb_index & 0x3FFF) -
                     utf8proc_combinations[first_base];

          if (slot >= 0 && slot <= utf8proc_combinations[first_base + 1]) {
            utf8proc_int32_t composed;

            slot += first_base + 2;
            /* Bit 0x4000 selects a result stored as two 16-bit halves. */
            if (cur_prop->comb_index & 0x4000)
              composed = (utf8proc_combinations[slot] << 16) |
                         utf8proc_combinations[slot + 1];
            else
              composed = utf8proc_combinations[slot];

            if (composed > 0 && (!(options & UTF8PROC_STABLE) ||
                !(unsafe_get_property(composed)->comp_exclusion))) {
              *starter = composed;
              starter_prop = NULL;
              continue;
            }
          }
        }
      }

      /* No composition happened: keep the codepoint. */
      buffer[out] = cur;
      if (cur_prop->combining_class) {
        if (cur_prop->combining_class > max_cc)
          max_cc = cur_prop->combining_class;
      } else {
        /* Combining class 0: this codepoint becomes the new starter. */
        starter = buffer + out;
        starter_prop = NULL;
        max_cc = -1;
      }
      out++;
    }
    length = out;
  }

  return length;
}
|
||||
|
||||
/*
 * Normalizes 'length' codepoints in 'buffer' with utf8proc_normalize_utf32()
 * and then re-encodes them as UTF-8 in place, terminating the result with a
 * zero byte.  Returns the number of bytes written (not counting the
 * terminator) or a negative error code.
 *
 * UTF8PROC_NULLTERM is ignored here; 'length' is never ignored.
 * ASSERT: 'buffer' has one spare byte of free space at the end!
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
  utf8proc_uint8_t *const bytes = (utf8proc_uint8_t *)buffer;
  const int with_boundaries = (options & UTF8PROC_CHARBOUND) != 0;
  utf8proc_ssize_t in;
  utf8proc_ssize_t out = 0;

  length = utf8proc_normalize_utf32(buffer, length, options);
  if (length < 0) return length;

  for (in = 0; in < length; in++) {
    /* Read the codepoint before its slot may be overwritten by output. */
    const utf8proc_int32_t uc = buffer[in];

    /* With CHARBOUND the unsafe encoder is used for every codepoint. */
    if (with_boundaries)
      out += unsafe_encode_char(uc, bytes + out);
    else
      out += utf8proc_encode_char(uc, bytes + out);
  }

  bytes[out] = 0;
  return out;
}
|
||||
|
||||
/* Same as utf8proc_map_custom() with no custom mapping function. */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) {
  return utf8proc_map_custom(str, strlen, dstptr, options,
                             NULL,   /* custom_func */
                             NULL);  /* custom_data */
}
|
||||
|
||||
/*
 * Maps the UTF-8 string 'str' to a newly allocated, zero-terminated UTF-8
 * string according to 'options'.
 *
 * On success '*dstptr' receives the malloc'ed result (the caller frees it)
 * and the length in bytes is returned.  On failure '*dstptr' is NULL and a
 * negative error code is returned.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
) {
  utf8proc_int32_t *work;
  utf8proc_int32_t *shrunk;
  utf8proc_ssize_t count;

  *dstptr = NULL;

  /* First pass without a buffer: only determines the required size. */
  count = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
  if (count < 0) return count;

  /* One extra byte for the terminator written by utf8proc_reencode(). */
  work = (utf8proc_int32_t *) malloc(count * sizeof(utf8proc_int32_t) + 1);
  if (!work) return UTF8PROC_ERROR_NOMEM;

  count = utf8proc_decompose_custom(str, strlen, work, count, options, custom_func, custom_data);
  if (count >= 0)
    count = utf8proc_reencode(work, count, options);
  if (count < 0) {
    free(work);
    return count;
  }

  /* Shrink to the encoded size; keep the larger block if realloc fails. */
  shrunk = (utf8proc_int32_t *) realloc(work, (size_t)count+1);
  if (shrunk) work = shrunk;

  *dstptr = (utf8proc_uint8_t *)work;
  return count;
}
|
||||
|
||||
/* Returns a newly allocated NFD-normalized copy of the zero-terminated
   UTF-8 string 'str', or NULL when utf8proc_map() fails. */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
  utf8proc_uint8_t *normalized = NULL;

  utf8proc_map(str, 0, &normalized,
               UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
  return normalized;
}
|
||||
|
||||
/* Returns a newly allocated NFC-normalized copy of the zero-terminated
   UTF-8 string 'str', or NULL when utf8proc_map() fails. */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
  utf8proc_uint8_t *normalized = NULL;

  utf8proc_map(str, 0, &normalized,
               UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
  return normalized;
}
|
||||
|
||||
/* Returns a newly allocated NFKD-normalized copy of the zero-terminated
   UTF-8 string 'str', or NULL when utf8proc_map() fails. */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
  utf8proc_uint8_t *normalized = NULL;

  utf8proc_map(str, 0, &normalized,
               UTF8PROC_NULLTERM | UTF8PROC_STABLE |
               UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
  return normalized;
}
|
||||
|
||||
/* Returns a newly allocated NFKC-normalized copy of the zero-terminated
   UTF-8 string 'str', or NULL when utf8proc_map() fails. */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
  utf8proc_uint8_t *normalized = NULL;

  utf8proc_map(str, 0, &normalized,
               UTF8PROC_NULLTERM | UTF8PROC_STABLE |
               UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  return normalized;
}
|
699
libdispatch/u/utf8proc.h
Normal file
699
libdispatch/u/utf8proc.h
Normal file
@ -0,0 +1,699 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
|
||||
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @mainpage
|
||||
*
|
||||
* utf8proc is a free/open-source (MIT/expat licensed) C library
|
||||
* providing Unicode normalization, case-folding, and other operations
|
||||
* for strings in the UTF-8 encoding, supporting Unicode version
|
||||
* 8.0.0. See the utf8proc home page (http://julialang.org/utf8proc/)
|
||||
* for downloads and other information, or the source code on github
|
||||
* (https://github.com/JuliaLang/utf8proc).
|
||||
*
|
||||
* For the utf8proc API documentation, see: @ref utf8proc.h
|
||||
*
|
||||
* The features of utf8proc include:
|
||||
*
|
||||
* - Transformation of strings (@ref utf8proc_map) to:
|
||||
* - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)
|
||||
* - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)
|
||||
* - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK)
|
||||
* - case-folding (@ref UTF8PROC_CASEFOLD)
|
||||
* - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC
|
||||
* - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)
|
||||
* - Character-width computation: @ref utf8proc_charwidth
|
||||
* - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string
|
||||
* - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8.
|
||||
*/
|
||||
|
||||
/** @file */
|
||||
|
||||
#ifndef UTF8PROC_H
|
||||
#define UTF8PROC_H
|
||||
|
||||
/** @name API version
 *
 * The utf8proc API version MAJOR.MINOR.PATCH, following
 * semantic-versioning rules (http://semver.org) based on API
 * compatibility.
 *
 * This is also returned at runtime by @ref utf8proc_version; however, the
 * runtime version may append a string like "-dev" to the version number
 * for prerelease versions.
 *
 * @note The shared-library version number in the Makefile
 *       (and CMakeLists.txt, and MANIFEST) may be different,
 *       being based on ABI compatibility rather than API compatibility.
 */
/** @{ */
/** MAJOR version: increased when backwards API compatibility is broken. */
#define UTF8PROC_VERSION_MAJOR 2
/** MINOR version: increased when new functionality is added in a backwards-compatible manner. */
#define UTF8PROC_VERSION_MINOR 1
/** PATCH version: increased for fixes that do not change the API. */
#define UTF8PROC_VERSION_PATCH 0
/** @} */
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
 * Fixed-width integer, size and boolean types used throughout the API.
 * MSVC prior to 2013 lacked stdbool.h and inttypes.h, so equivalent types
 * are declared by hand for it; every other compiler uses the C99 headers.
 */
#if defined(_MSC_VER) && _MSC_VER < 1800
typedef signed char utf8proc_int8_t;
typedef unsigned char utf8proc_uint8_t;
typedef short utf8proc_int16_t;
typedef unsigned short utf8proc_uint16_t;
typedef int utf8proc_int32_t;
typedef unsigned int utf8proc_uint32_t;
#  ifdef _WIN64
typedef __int64 utf8proc_ssize_t;
typedef unsigned __int64 utf8proc_size_t;
#  else
typedef int utf8proc_ssize_t;
typedef unsigned int utf8proc_size_t;
#  endif
#  ifndef __cplusplus
/* emulate C99 bool */
typedef unsigned char utf8proc_bool;
#    ifndef __bool_true_false_are_defined
#      define false 0
#      define true 1
#      define __bool_true_false_are_defined 1
#    endif
#  else
typedef bool utf8proc_bool;
#  endif
#else
#  include <stddef.h>
#  include <stdbool.h>
#  include <inttypes.h>
typedef int8_t utf8proc_int8_t;
typedef uint8_t utf8proc_uint8_t;
typedef int16_t utf8proc_int16_t;
typedef uint16_t utf8proc_uint16_t;
typedef int32_t utf8proc_int32_t;
typedef uint32_t utf8proc_uint32_t;
typedef size_t utf8proc_size_t;
typedef ptrdiff_t utf8proc_ssize_t;
typedef bool utf8proc_bool;
#endif
|
||||
#include <limits.h>
|
||||
|
||||
/*
 * UTF8PROC_DLLEXPORT decorates every public function.
 *  - Windows: dllexport while building the library (UTF8PROC_EXPORTS is
 *    defined), dllimport otherwise.
 *  - GCC-compatible compilers, version 4 or later: default symbol visibility.
 *  - Anything else: expands to nothing.
 * __GNUC__ is tested with defined() first so that other compilers do not
 * evaluate an undefined macro (which triggers -Wundef).
 */
#ifdef _WIN32
#  ifdef UTF8PROC_EXPORTS
#    define UTF8PROC_DLLEXPORT __declspec(dllexport)
#  else
#    define UTF8PROC_DLLEXPORT __declspec(dllimport)
#  endif
#elif defined(__GNUC__) && __GNUC__ >= 4
#  define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
#else
#  define UTF8PROC_DLLEXPORT
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* Fallback for platforms whose headers do not provide SSIZE_MAX.
   Note that this fallback has type size_t, not a signed type. */
#ifndef SSIZE_MAX
#  define SSIZE_MAX ((size_t)SIZE_MAX/2)
#endif

/* Fallback for platforms whose headers do not provide UINT16_MAX.  The
   library uses this value to mark "no entry" in 16-bit index fields. */
#ifndef UINT16_MAX
#  define UINT16_MAX 65535U
#endif
|
||||
|
||||
/**
 * Option flags used by several functions in the library.
 */
typedef enum {
  /** The given UTF-8 input is NULL terminated. */
  UTF8PROC_NULLTERM  = (1<<0),
  /** Unicode Versioning Stability has to be respected. */
  UTF8PROC_STABLE    = (1<<1),
  /** Compatibility decomposition (i.e. formatting information is lost). */
  UTF8PROC_COMPAT    = (1<<2),
  /** Return a result with composed characters. */
  UTF8PROC_COMPOSE   = (1<<3),
  /** Return a result with decomposed characters. */
  UTF8PROC_DECOMPOSE = (1<<4),
  /** Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. */
  UTF8PROC_IGNORE    = (1<<5),
  /** Return an error, if the input contains unassigned codepoints. */
  UTF8PROC_REJECTNA  = (1<<6),
  /**
   * Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a
   * line break, and should be converted to the codepoint for line
   * separation (LS).
   */
  UTF8PROC_NLF2LS    = (1<<7),
  /**
   * Indicating that NLF-sequences are representing a paragraph break, and
   * should be converted to the codepoint for paragraph separation
   * (PS).
   */
  UTF8PROC_NLF2PS    = (1<<8),
  /**
   * Indicating that the meaning of NLF-sequences is unknown; they are
   * converted to line-feed (LF).
   */
  UTF8PROC_NLF2LF    = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS),
  /** Strips and/or converts control characters.
   *
   * NLF-sequences are transformed into space, except if one of the
   * NLF2LS/PS/LF options is given. VerticalTab (VT, U+000B) and FormFeed
   * (FF, U+000C) are treated as a NLF-sequence in this case.
   * HorizontalTab (HT, U+0009) is converted to a space. All other control
   * characters are simply removed.
   */
  UTF8PROC_STRIPCC   = (1<<9),
  /**
   * Performs unicode case folding, to be able to do a case-insensitive
   * string comparison.
   */
  UTF8PROC_CASEFOLD  = (1<<10),
  /**
   * Inserts 0xFF bytes at the beginning of each sequence which is
   * representing a single grapheme cluster (see UAX#29).
   */
  UTF8PROC_CHARBOUND = (1<<11),
  /** Lumps certain characters together.
   *
   * E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details.
   *
   * If NLF2LF is set, this includes a transformation of paragraph and
   * line separators to ASCII line-feed (LF).
   */
  UTF8PROC_LUMP      = (1<<12),
  /** Strips all character markings.
   *
   * This includes non-spacing, spacing and enclosing (i.e. accents).
   * @note This option works only with @ref UTF8PROC_COMPOSE or
   *       @ref UTF8PROC_DECOMPOSE
   */
  UTF8PROC_STRIPMARK = (1<<13),
} utf8proc_option_t;
|
||||
|
||||
/** @name Error codes
 * Error codes being returned by almost all functions.
 * All values are negative, so they can be told apart from a valid
 * (non-negative) length result.  The expansions are parenthesized so that
 * they behave as a single operand inside larger expressions.
 */
/** @{ */
/** Memory could not be allocated. */
#define UTF8PROC_ERROR_NOMEM (-1)
/** The given string is too long to be processed. */
#define UTF8PROC_ERROR_OVERFLOW (-2)
/** The given string is not a legal UTF-8 string. */
#define UTF8PROC_ERROR_INVALIDUTF8 (-3)
/**
 * The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was
 * found, or a codepoint outside the range 0 .. 0x10FFFF was passed to
 * @ref utf8proc_decompose_char.
 */
#define UTF8PROC_ERROR_NOTASSIGNED (-4)
/** Invalid options have been used. */
#define UTF8PROC_ERROR_INVALIDOPTS (-5)
/** @} */
|
||||
|
||||
/* @name Types */
|
||||
|
||||
/** Holds the value of a property. */
|
||||
typedef utf8proc_int16_t utf8proc_propval_t;
|
||||
|
||||
/** Struct containing information about a codepoint. */
|
||||
typedef struct utf8proc_property_struct {
|
||||
/**
|
||||
* Unicode category.
|
||||
* @see utf8proc_category_t.
|
||||
*/
|
||||
utf8proc_propval_t category;
|
||||
utf8proc_propval_t combining_class;
|
||||
/**
|
||||
* Bidirectional class.
|
||||
* @see utf8proc_bidi_class_t.
|
||||
*/
|
||||
utf8proc_propval_t bidi_class;
|
||||
/**
|
||||
* @anchor Decomposition type.
|
||||
* @see utf8proc_decomp_type_t.
|
||||
*/
|
||||
utf8proc_propval_t decomp_type;
|
||||
utf8proc_uint16_t decomp_seqindex;
|
||||
utf8proc_uint16_t casefold_seqindex;
|
||||
utf8proc_uint16_t uppercase_seqindex;
|
||||
utf8proc_uint16_t lowercase_seqindex;
|
||||
utf8proc_uint16_t titlecase_seqindex;
|
||||
utf8proc_uint16_t comb_index;
|
||||
unsigned bidi_mirrored:1;
|
||||
unsigned comp_exclusion:1;
|
||||
/**
|
||||
* Can this codepoint be ignored?
|
||||
*
|
||||
* Used by @ref utf8proc_decompose_char when @ref UTF8PROC_IGNORE is
|
||||
* passed as an option.
|
||||
*/
|
||||
unsigned ignorable:1;
|
||||
unsigned control_boundary:1;
|
||||
/** The width of the codepoint. */
|
||||
unsigned charwidth:2;
|
||||
unsigned pad:2;
|
||||
/**
|
||||
* Boundclass.
|
||||
* @see utf8proc_boundclass_t.
|
||||
*/
|
||||
unsigned boundclass:8;
|
||||
} utf8proc_property_t;
|
||||
|
||||
/** Unicode categories. */
|
||||
typedef enum {
|
||||
UTF8PROC_CATEGORY_CN = 0, /**< Other, not assigned */
|
||||
UTF8PROC_CATEGORY_LU = 1, /**< Letter, uppercase */
|
||||
UTF8PROC_CATEGORY_LL = 2, /**< Letter, lowercase */
|
||||
UTF8PROC_CATEGORY_LT = 3, /**< Letter, titlecase */
|
||||
UTF8PROC_CATEGORY_LM = 4, /**< Letter, modifier */
|
||||
UTF8PROC_CATEGORY_LO = 5, /**< Letter, other */
|
||||
UTF8PROC_CATEGORY_MN = 6, /**< Mark, nonspacing */
|
||||
UTF8PROC_CATEGORY_MC = 7, /**< Mark, spacing combining */
|
||||
UTF8PROC_CATEGORY_ME = 8, /**< Mark, enclosing */
|
||||
UTF8PROC_CATEGORY_ND = 9, /**< Number, decimal digit */
|
||||
UTF8PROC_CATEGORY_NL = 10, /**< Number, letter */
|
||||
UTF8PROC_CATEGORY_NO = 11, /**< Number, other */
|
||||
UTF8PROC_CATEGORY_PC = 12, /**< Punctuation, connector */
|
||||
UTF8PROC_CATEGORY_PD = 13, /**< Punctuation, dash */
|
||||
UTF8PROC_CATEGORY_PS = 14, /**< Punctuation, open */
|
||||
UTF8PROC_CATEGORY_PE = 15, /**< Punctuation, close */
|
||||
UTF8PROC_CATEGORY_PI = 16, /**< Punctuation, initial quote */
|
||||
UTF8PROC_CATEGORY_PF = 17, /**< Punctuation, final quote */
|
||||
UTF8PROC_CATEGORY_PO = 18, /**< Punctuation, other */
|
||||
UTF8PROC_CATEGORY_SM = 19, /**< Symbol, math */
|
||||
UTF8PROC_CATEGORY_SC = 20, /**< Symbol, currency */
|
||||
UTF8PROC_CATEGORY_SK = 21, /**< Symbol, modifier */
|
||||
UTF8PROC_CATEGORY_SO = 22, /**< Symbol, other */
|
||||
UTF8PROC_CATEGORY_ZS = 23, /**< Separator, space */
|
||||
UTF8PROC_CATEGORY_ZL = 24, /**< Separator, line */
|
||||
UTF8PROC_CATEGORY_ZP = 25, /**< Separator, paragraph */
|
||||
UTF8PROC_CATEGORY_CC = 26, /**< Other, control */
|
||||
UTF8PROC_CATEGORY_CF = 27, /**< Other, format */
|
||||
UTF8PROC_CATEGORY_CS = 28, /**< Other, surrogate */
|
||||
UTF8PROC_CATEGORY_CO = 29, /**< Other, private use */
|
||||
} utf8proc_category_t;
|
||||
|
||||
/** Bidirectional character classes. */
|
||||
typedef enum {
|
||||
UTF8PROC_BIDI_CLASS_L = 1, /**< Left-to-Right */
|
||||
UTF8PROC_BIDI_CLASS_LRE = 2, /**< Left-to-Right Embedding */
|
||||
UTF8PROC_BIDI_CLASS_LRO = 3, /**< Left-to-Right Override */
|
||||
UTF8PROC_BIDI_CLASS_R = 4, /**< Right-to-Left */
|
||||
UTF8PROC_BIDI_CLASS_AL = 5, /**< Right-to-Left Arabic */
|
||||
UTF8PROC_BIDI_CLASS_RLE = 6, /**< Right-to-Left Embedding */
|
||||
UTF8PROC_BIDI_CLASS_RLO = 7, /**< Right-to-Left Override */
|
||||
UTF8PROC_BIDI_CLASS_PDF = 8, /**< Pop Directional Format */
|
||||
UTF8PROC_BIDI_CLASS_EN = 9, /**< European Number */
|
||||
UTF8PROC_BIDI_CLASS_ES = 10, /**< European Separator */
|
||||
UTF8PROC_BIDI_CLASS_ET = 11, /**< European Number Terminator */
|
||||
UTF8PROC_BIDI_CLASS_AN = 12, /**< Arabic Number */
|
||||
UTF8PROC_BIDI_CLASS_CS = 13, /**< Common Number Separator */
|
||||
UTF8PROC_BIDI_CLASS_NSM = 14, /**< Nonspacing Mark */
|
||||
UTF8PROC_BIDI_CLASS_BN = 15, /**< Boundary Neutral */
|
||||
UTF8PROC_BIDI_CLASS_B = 16, /**< Paragraph Separator */
|
||||
UTF8PROC_BIDI_CLASS_S = 17, /**< Segment Separator */
|
||||
UTF8PROC_BIDI_CLASS_WS = 18, /**< Whitespace */
|
||||
UTF8PROC_BIDI_CLASS_ON = 19, /**< Other Neutrals */
|
||||
UTF8PROC_BIDI_CLASS_LRI = 20, /**< Left-to-Right Isolate */
|
||||
UTF8PROC_BIDI_CLASS_RLI = 21, /**< Right-to-Left Isolate */
|
||||
UTF8PROC_BIDI_CLASS_FSI = 22, /**< First Strong Isolate */
|
||||
UTF8PROC_BIDI_CLASS_PDI = 23, /**< Pop Directional Isolate */
|
||||
} utf8proc_bidi_class_t;
|
||||
|
||||
/** Decomposition type. */
|
||||
typedef enum {
|
||||
UTF8PROC_DECOMP_TYPE_FONT = 1, /**< Font */
|
||||
UTF8PROC_DECOMP_TYPE_NOBREAK = 2, /**< Nobreak */
|
||||
UTF8PROC_DECOMP_TYPE_INITIAL = 3, /**< Initial */
|
||||
UTF8PROC_DECOMP_TYPE_MEDIAL = 4, /**< Medial */
|
||||
UTF8PROC_DECOMP_TYPE_FINAL = 5, /**< Final */
|
||||
UTF8PROC_DECOMP_TYPE_ISOLATED = 6, /**< Isolated */
|
||||
UTF8PROC_DECOMP_TYPE_CIRCLE = 7, /**< Circle */
|
||||
UTF8PROC_DECOMP_TYPE_SUPER = 8, /**< Super */
|
||||
UTF8PROC_DECOMP_TYPE_SUB = 9, /**< Sub */
|
||||
UTF8PROC_DECOMP_TYPE_VERTICAL = 10, /**< Vertical */
|
||||
UTF8PROC_DECOMP_TYPE_WIDE = 11, /**< Wide */
|
||||
UTF8PROC_DECOMP_TYPE_NARROW = 12, /**< Narrow */
|
||||
UTF8PROC_DECOMP_TYPE_SMALL = 13, /**< Small */
|
||||
UTF8PROC_DECOMP_TYPE_SQUARE = 14, /**< Square */
|
||||
UTF8PROC_DECOMP_TYPE_FRACTION = 15, /**< Fraction */
|
||||
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
|
||||
} utf8proc_decomp_type_t;
|
||||
|
||||
/** Boundclass property. (TR29) */
|
||||
typedef enum {
|
||||
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
|
||||
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
|
||||
UTF8PROC_BOUNDCLASS_CR = 2, /**< Cr */
|
||||
UTF8PROC_BOUNDCLASS_LF = 3, /**< Lf */
|
||||
UTF8PROC_BOUNDCLASS_CONTROL = 4, /**< Control */
|
||||
UTF8PROC_BOUNDCLASS_EXTEND = 5, /**< Extend */
|
||||
UTF8PROC_BOUNDCLASS_L = 6, /**< L */
|
||||
UTF8PROC_BOUNDCLASS_V = 7, /**< V */
|
||||
UTF8PROC_BOUNDCLASS_T = 8, /**< T */
|
||||
UTF8PROC_BOUNDCLASS_LV = 9, /**< Lv */
|
||||
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
|
||||
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
|
||||
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
||||
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
|
||||
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
|
||||
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
|
||||
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
|
||||
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
|
||||
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
|
||||
} utf8proc_boundclass_t;
|
||||
|
||||
/**
|
||||
* Function pointer type passed to @ref utf8proc_map_custom and
|
||||
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
|
||||
* mapping of codepoints to be applied in conjunction with other mappings.
|
||||
*/
|
||||
typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
|
||||
|
||||
/**
|
||||
* Array containing the byte lengths of a UTF-8 encoded codepoint based
|
||||
* on the first byte.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256];
|
||||
|
||||
/**
|
||||
* Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
|
||||
* (http://semver.org format), possibly with a "-dev" suffix for
|
||||
* development versions.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_version(void);
|
||||
|
||||
/**
|
||||
* Returns an informative error string for the given utf8proc error code
|
||||
* (e.g. the error codes returned by @ref utf8proc_map).
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
|
||||
|
||||
/**
|
||||
* Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
|
||||
* The maximum number of bytes read is `strlen`, unless `strlen` is
|
||||
* negative (in which case up to 4 bytes are read).
|
||||
*
|
||||
* If a valid codepoint could be read, it is stored in the variable
|
||||
* pointed to by `codepoint_ref`, otherwise that variable will be set to -1.
|
||||
* In case of success, the number of bytes read is returned; otherwise, a
|
||||
* negative error code is returned.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
|
||||
|
||||
/**
|
||||
* Check if a codepoint is valid (regardless of whether it has been
|
||||
* assigned a value by the current Unicode standard).
|
||||
*
|
||||
* @return 1 if the given `codepoint` is valid and otherwise return 0.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Encodes the codepoint as an UTF-8 string in the byte array pointed
|
||||
* to by `dst`. This array must be at least 4 bytes long.
|
||||
*
|
||||
* In case of success the number of bytes written is returned, and
|
||||
* otherwise 0 is returned.
|
||||
*
|
||||
* This function does not check whether `codepoint` is valid Unicode.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst);
|
||||
|
||||
/**
|
||||
* Look up the properties for a given codepoint.
|
||||
*
|
||||
* @param codepoint The Unicode codepoint.
|
||||
*
|
||||
* @returns
|
||||
* A pointer to a (constant) struct containing information about
|
||||
* the codepoint.
|
||||
* @par
|
||||
* If the codepoint is unassigned or invalid, a pointer to a special struct is
|
||||
* returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint);
|
||||
|
||||
/** Decompose a codepoint into an array of codepoints.
|
||||
*
|
||||
* @param codepoint the codepoint.
|
||||
* @param dst the destination buffer.
|
||||
* @param bufsize the size of the destination buffer.
|
||||
* @param options one or more of the following flags:
|
||||
* - @ref UTF8PROC_REJECTNA - return an error if `codepoint` is unassigned
|
||||
* - @ref UTF8PROC_IGNORE - strip "default ignorable" codepoints
|
||||
* - @ref UTF8PROC_CASEFOLD - apply Unicode casefolding
|
||||
* - @ref UTF8PROC_COMPAT - replace certain codepoints with their
|
||||
* compatibility decomposition
|
||||
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
|
||||
* - @ref UTF8PROC_LUMP - lump certain different codepoints together
|
||||
* - @ref UTF8PROC_STRIPMARK - remove all character marks
|
||||
* @param last_boundclass
|
||||
* Pointer to an integer variable containing
|
||||
* the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
|
||||
* option is used. Otherwise, this parameter is ignored.
|
||||
*
|
||||
* @return
|
||||
* In case of success, the number of codepoints written is returned; in case
|
||||
* of an error, a negative error code is returned (@ref utf8proc_errmsg).
|
||||
* @par
|
||||
* If the number of written codepoints would be bigger than `bufsize`, the
|
||||
* required buffer size is returned, while the buffer will be overwritten with
|
||||
* undefined data.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
|
||||
utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize,
|
||||
utf8proc_option_t options, int *last_boundclass
|
||||
);
|
||||
|
||||
/**
|
||||
* The same as @ref utf8proc_decompose_char, but acts on a whole UTF-8
|
||||
* string and orders the decomposed sequences correctly.
|
||||
*
|
||||
* If the @ref UTF8PROC_NULLTERM flag in `options` is set, processing
|
||||
* will be stopped when a NULL byte is encountered, otherwise `strlen`
|
||||
* bytes are processed. The result (in the form of 32-bit unicode
|
||||
* codepoints) is written into the buffer being pointed to by
|
||||
* `buffer` (which must contain at least `bufsize` entries). In case of
|
||||
* success, the number of codepoints written is returned; in case of an
|
||||
* error, a negative error code is returned (@ref utf8proc_errmsg).
|
||||
* See @ref utf8proc_decompose_custom to supply additional transformations.
|
||||
*
|
||||
* If the number of written codepoints would be bigger than `bufsize`, the
|
||||
* required buffer size is returned, while the buffer will be overwritten with
|
||||
* undefined data.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||
);
|
||||
|
||||
/**
|
||||
* The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
|
||||
* that is called on each codepoint in `str` before any other transformations
|
||||
* (along with a `custom_data` pointer that is passed through to `custom_func`).
|
||||
* The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
|
||||
utf8proc_custom_func custom_func, void *custom_data
|
||||
);
|
||||
|
||||
/**
|
||||
* Normalizes the sequence of `length` codepoints pointed to by `buffer`
|
||||
* in-place (i.e., the result is also stored in `buffer`).
|
||||
*
|
||||
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
|
||||
* @param length the length (in codepoints) of the buffer.
|
||||
* @param options a bitwise or (`|`) of one or more of the following flags:
|
||||
* - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
|
||||
* - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
|
||||
* - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
|
||||
* - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
|
||||
* - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
|
||||
* codepoints
|
||||
* - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
|
||||
* the unicode versioning stability
|
||||
*
|
||||
* @return
|
||||
* In case of success, the length (in codepoints) of the normalized UTF-32 string is
|
||||
* returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
|
||||
*
|
||||
* @warning The entries of the array pointed to by `buffer` have to be in the
|
||||
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
||||
|
||||
/**
|
||||
* Reencodes the sequence of `length` codepoints pointed to by `buffer` as
|
||||
* UTF-8 data in-place (i.e., the result is also stored in `buffer`).
|
||||
* Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
|
||||
*
|
||||
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
|
||||
* @param length the length (in codepoints) of the buffer.
|
||||
* @param options a bitwise or (`|`) of one or more of the following flags:
|
||||
* - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
|
||||
* - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
|
||||
* - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
|
||||
* - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
|
||||
* - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
|
||||
* codepoints
|
||||
* - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
|
||||
* the unicode versioning stability
|
||||
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
|
||||
*
|
||||
* @return
|
||||
* In case of success, the length (in bytes) of the resulting nul-terminated
|
||||
* UTF-8 string is returned; otherwise, a negative error code is returned
|
||||
* (@ref utf8proc_errmsg).
|
||||
*
|
||||
* @warning The amount of free space pointed to by `buffer` must
|
||||
* exceed the amount of the input data by one byte, and the
|
||||
* entries of the array pointed to by `buffer` have to be in the
|
||||
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
||||
|
||||
/**
|
||||
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
||||
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
||||
*
|
||||
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
|
||||
* state to break graphemes. This state can be passed in as a pointer
|
||||
* in the `state` argument and should initially be set to 0. If the
|
||||
* state is not passed in (i.e. a null pointer is passed), UAX#29 rules
|
||||
* GB10/12/13 which require this state will not be applied, essentially
|
||||
* matching the rules in Unicode 8.0.0.
|
||||
*
|
||||
* @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
|
||||
* be called IN ORDER on ALL potential breaks in a string.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
|
||||
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
|
||||
|
||||
/**
|
||||
* Same as @ref utf8proc_grapheme_break_stateful, except without support for the
|
||||
* Unicode 9 additions to the algorithm. Supported for legacy reasons.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
|
||||
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
|
||||
|
||||
|
||||
/**
|
||||
* Given a codepoint `c`, return the codepoint of the corresponding
|
||||
* lower-case character, if any; otherwise (if there is no lower-case
|
||||
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
|
||||
|
||||
/**
|
||||
* Given a codepoint `c`, return the codepoint of the corresponding
|
||||
* upper-case character, if any; otherwise (if there is no upper-case
|
||||
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
||||
|
||||
/**
|
||||
* Given a codepoint `c`, return the codepoint of the corresponding
|
||||
* title-case character, if any; otherwise (if there is no title-case
|
||||
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
|
||||
|
||||
/**
|
||||
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
||||
* except that a width of 0 is returned for non-printable codepoints
|
||||
* instead of -1 as in `wcwidth`.
|
||||
*
|
||||
* @note
|
||||
* If you want to check for particular types of non-printable characters,
|
||||
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
|
||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Return the Unicode category for the codepoint (one of the
|
||||
* @ref utf8proc_category_t constants.)
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Return the two-letter (nul-terminated) Unicode category string for
|
||||
* the codepoint (e.g. `"Lu"` or `"Co"`).
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Maps the given UTF-8 string pointed to by `str` to a new UTF-8
|
||||
* string, allocated dynamically by `malloc` and returned via `dstptr`.
|
||||
*
|
||||
* If the @ref UTF8PROC_NULLTERM flag in the `options` field is set,
|
||||
* the length is determined by a NULL terminator, otherwise the
|
||||
* parameter `strlen` is evaluated to determine the string length, but
|
||||
* in any case the result will be NULL terminated (though it might
|
||||
* contain NULL characters within the string if `str` contained NULL
|
||||
* characters). Other flags in the `options` field are passed to the
|
||||
* functions defined above, and regarded as described. See also
|
||||
* @ref utf8proc_map_custom to supply a custom codepoint transformation.
|
||||
*
|
||||
* In case of success the length of the new string is returned,
|
||||
* otherwise a negative error code is returned.
|
||||
*
|
||||
* @note The memory of the new UTF-8 string will have been allocated
|
||||
* with `malloc`, and should therefore be deallocated with `free`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
||||
);
|
||||
|
||||
/**
|
||||
* Like @ref utf8proc_map, but also takes a `custom_func` mapping function
|
||||
* that is called on each codepoint in `str` before any other transformations
|
||||
* (along with a `custom_data` pointer that is passed through to `custom_func`).
|
||||
* The `custom_func` argument is ignored if it is `NULL`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
|
||||
utf8proc_custom_func custom_func, void *custom_data
|
||||
);
|
||||
|
||||
/** @name Unicode normalization
|
||||
*
|
||||
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
||||
* normalized version of the null-terminated string `str`. These
|
||||
* are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
|
||||
* combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
|
||||
*/
|
||||
/** @{ */
|
||||
/** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
|
||||
/** NFC normalization (@ref UTF8PROC_COMPOSE). */
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
|
||||
/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
|
||||
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
||||
/** @} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
13048
libdispatch/u8.c
Normal file
13048
libdispatch/u8.c
Normal file
File diff suppressed because it is too large
Load Diff
718
libdispatch/u8.h
Normal file
718
libdispatch/u8.h
Normal file
@ -0,0 +1,718 @@
|
||||
/*
|
||||
Copyright (C) 2014-2016 Quinten Lansu
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation
|
||||
files (the "Software"), to deal in the Software without
|
||||
restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following
|
||||
conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* This is the concatenation of
|
||||
|
||||
base.h casemapping.h codepoint.h database.h streaming.h
|
||||
composition.h decomposition.h unicodedatabase.h utf8rewind.h
|
||||
(order is important)
|
||||
|
||||
with some modifications to simplify
|
||||
*/
|
||||
|
||||
#ifndef U8_H
|
||||
#define U8_H 1
|
||||
|
||||
#ifndef _UTF8REWIND_H_
|
||||
#define _UTF8REWIND_H_
|
||||
|
||||
|
||||
|
||||
|
||||
/*
 * Packs a three-part version number into a single comparable integer:
 * major * 10000 + minor * 100 + bugfix.
 *
 * The whole expansion is parenthesized so the result behaves as one operand
 * when the macro is embedded in a larger expression (for example after a
 * multiplication or a unary operator).
 */
#define UTF8_VERSION_MAKE(_major, _minor, _bugfix) \
	(((_major) * 10000) + ((_minor) * 100) + (_bugfix))
|
||||
|
||||
#define UTF8_VERSION_MAJOR 1
|
||||
|
||||
#define UTF8_VERSION_MINOR 5
|
||||
|
||||
#define UTF8_VERSION_BUGFIX 1
|
||||
|
||||
#define UTF8_VERSION \
|
||||
UTF8_VERSION_MAKE(UTF8_VERSION_MAJOR, UTF8_VERSION_MINOR, UTF8_VERSION_BUGFIX)
|
||||
|
||||
#define UTF8_VERSION_STRING "1.5.1"
|
||||
|
||||
#define UTF8_VERSION_GUARD(_major, _minor, _bugfix) \
|
||||
(UTF8_VERSION >= UTF8_VERSION_MAKE(_major, _minor, _bugfix))
|
||||
|
||||
|
||||
|
||||
#define UTF8_ERR_NONE (0)
|
||||
|
||||
#define UTF8_ERR_INVALID_DATA (-1)
|
||||
|
||||
#define UTF8_ERR_INVALID_FLAG (-2)
|
||||
|
||||
#define UTF8_ERR_NOT_ENOUGH_SPACE (-3)
|
||||
|
||||
#define UTF8_ERR_OVERLAPPING_PARAMETERS (-4)
|
||||
|
||||
#define UTF8_ERR_INVALID_LOCALE (-5)
|
||||
|
||||
|
||||
|
||||
#define UTF8_LOCALE_DEFAULT 0
|
||||
|
||||
#define UTF8_LOCALE_LITHUANIAN 1
|
||||
|
||||
#define UTF8_LOCALE_TURKISH_AND_AZERI_LATIN 2
|
||||
|
||||
#define UTF8_LOCALE_MAXIMUM 3
|
||||
|
||||
|
||||
|
||||
#define UTF8_NORMALIZE_COMPOSE 0x00000001
|
||||
|
||||
#define UTF8_NORMALIZE_DECOMPOSE 0x00000002
|
||||
|
||||
#define UTF8_NORMALIZE_COMPATIBILITY 0x00000004
|
||||
|
||||
#define UTF8_NORMALIZATION_RESULT_YES (0)
|
||||
|
||||
#define UTF8_NORMALIZATION_RESULT_MAYBE (1)
|
||||
|
||||
#define UTF8_NORMALIZATION_RESULT_NO (2)
|
||||
|
||||
|
||||
|
||||
#define UTF8_CATEGORY_LETTER_UPPERCASE 0x00000001
|
||||
|
||||
#define UTF8_CATEGORY_LETTER_LOWERCASE 0x00000002
|
||||
|
||||
#define UTF8_CATEGORY_LETTER_TITLECASE 0x00000004
|
||||
|
||||
#define UTF8_CATEGORY_LETTER_MODIFIER 0x00000008
|
||||
|
||||
#define UTF8_CATEGORY_LETTER_OTHER 0x00000010
|
||||
|
||||
#define UTF8_CATEGORY_LETTER \
|
||||
(UTF8_CATEGORY_LETTER_UPPERCASE | UTF8_CATEGORY_LETTER_LOWERCASE | \
|
||||
UTF8_CATEGORY_LETTER_TITLECASE | UTF8_CATEGORY_LETTER_MODIFIER | \
|
||||
UTF8_CATEGORY_LETTER_OTHER)
|
||||
|
||||
#define UTF8_CATEGORY_CASE_MAPPED \
|
||||
(UTF8_CATEGORY_LETTER_UPPERCASE | UTF8_CATEGORY_LETTER_LOWERCASE | \
|
||||
UTF8_CATEGORY_LETTER_TITLECASE)
|
||||
|
||||
#define UTF8_CATEGORY_MARK_NON_SPACING 0x00000020
|
||||
|
||||
#define UTF8_CATEGORY_MARK_SPACING 0x00000040
|
||||
|
||||
#define UTF8_CATEGORY_MARK_ENCLOSING 0x00000080
|
||||
|
||||
#define UTF8_CATEGORY_MARK \
|
||||
(UTF8_CATEGORY_MARK_NON_SPACING | UTF8_CATEGORY_MARK_SPACING | \
|
||||
UTF8_CATEGORY_MARK_ENCLOSING)
|
||||
|
||||
#define UTF8_CATEGORY_NUMBER_DECIMAL 0x00000100
|
||||
|
||||
#define UTF8_CATEGORY_NUMBER_LETTER 0x00000200
|
||||
|
||||
#define UTF8_CATEGORY_NUMBER_OTHER 0x00000400
|
||||
|
||||
#define UTF8_CATEGORY_NUMBER \
|
||||
(UTF8_CATEGORY_NUMBER_DECIMAL | UTF8_CATEGORY_NUMBER_LETTER | \
|
||||
UTF8_CATEGORY_NUMBER_OTHER)
|
||||
|
||||
#define UTF8_CATEGORY_PUNCTUATION_CONNECTOR 0x00000800
|
||||
|
||||
#define UTF8_CATEGORY_PUNCTUATION_DASH 0x00001000
|
||||
|
||||
#define UTF8_CATEGORY_PUNCTUATION_OPEN 0x00002000
|
||||
|
||||
#define UTF8_CATEGORY_PUNCTUATION_CLOSE 0x00004000
|
||||
|
||||
#define UTF8_CATEGORY_PUNCTUATION_INITIAL 0x00008000
|
||||
|
||||
#define UTF8_CATEGORY_PUNCTUATION_FINAL 0x00010000
|
||||
|
||||
#define UTF8_CATEGORY_PUNCTUATION_OTHER 0x00020000
|
||||
|
||||
#define UTF8_CATEGORY_PUNCTUATION \
|
||||
(UTF8_CATEGORY_PUNCTUATION_CONNECTOR | UTF8_CATEGORY_PUNCTUATION_DASH | \
|
||||
UTF8_CATEGORY_PUNCTUATION_OPEN | UTF8_CATEGORY_PUNCTUATION_CLOSE | \
|
||||
UTF8_CATEGORY_PUNCTUATION_INITIAL | UTF8_CATEGORY_PUNCTUATION_FINAL | \
|
||||
UTF8_CATEGORY_PUNCTUATION_OTHER)
|
||||
|
||||
#define UTF8_CATEGORY_SYMBOL_MATH 0x00040000
|
||||
|
||||
#define UTF8_CATEGORY_SYMBOL_CURRENCY 0x00080000
|
||||
|
||||
#define UTF8_CATEGORY_SYMBOL_MODIFIER 0x00100000
|
||||
|
||||
#define UTF8_CATEGORY_SYMBOL_OTHER 0x00200000
|
||||
|
||||
#define UTF8_CATEGORY_SYMBOL \
|
||||
(UTF8_CATEGORY_SYMBOL_MATH | UTF8_CATEGORY_SYMBOL_CURRENCY | \
|
||||
UTF8_CATEGORY_SYMBOL_MODIFIER | UTF8_CATEGORY_SYMBOL_OTHER)
|
||||
|
||||
#define UTF8_CATEGORY_SEPARATOR_SPACE 0x00400000
|
||||
|
||||
#define UTF8_CATEGORY_SEPARATOR_LINE 0x00800000
|
||||
|
||||
#define UTF8_CATEGORY_SEPARATOR_PARAGRAPH 0x01000000
|
||||
|
||||
#define UTF8_CATEGORY_SEPARATOR \
|
||||
(UTF8_CATEGORY_SEPARATOR_SPACE | UTF8_CATEGORY_SEPARATOR_LINE | \
|
||||
UTF8_CATEGORY_SEPARATOR_PARAGRAPH)
|
||||
|
||||
#define UTF8_CATEGORY_CONTROL 0x02000000
|
||||
|
||||
#define UTF8_CATEGORY_FORMAT 0x04000000
|
||||
|
||||
#define UTF8_CATEGORY_SURROGATE 0x08000000
|
||||
|
||||
#define UTF8_CATEGORY_PRIVATE_USE 0x10000000
|
||||
|
||||
#define UTF8_CATEGORY_UNASSIGNED 0x20000000
|
||||
|
||||
#define UTF8_CATEGORY_COMPATIBILITY 0x40000000
|
||||
|
||||
#define UTF8_CATEGORY_IGNORE_GRAPHEME_CLUSTER 0x80000000
|
||||
|
||||
#define UTF8_CATEGORY_ISCNTRL \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_CONTROL)
|
||||
|
||||
#define UTF8_CATEGORY_ISPRINT \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_LETTER | UTF8_CATEGORY_NUMBER | \
|
||||
UTF8_CATEGORY_PUNCTUATION | UTF8_CATEGORY_SYMBOL | \
|
||||
UTF8_CATEGORY_SEPARATOR)
|
||||
|
||||
#define UTF8_CATEGORY_ISSPACE \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_SEPARATOR_SPACE)
|
||||
|
||||
#define UTF8_CATEGORY_ISBLANK \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_SEPARATOR_SPACE | UTF8_CATEGORY_PRIVATE_USE)
|
||||
|
||||
#define UTF8_CATEGORY_ISGRAPH \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_LETTER | UTF8_CATEGORY_NUMBER | \
|
||||
UTF8_CATEGORY_PUNCTUATION | UTF8_CATEGORY_SYMBOL)
|
||||
|
||||
#define UTF8_CATEGORY_ISPUNCT \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_PUNCTUATION | UTF8_CATEGORY_SYMBOL)
|
||||
|
||||
#define UTF8_CATEGORY_ISALNUM \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_LETTER | UTF8_CATEGORY_NUMBER)
|
||||
|
||||
#define UTF8_CATEGORY_ISALPHA \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_LETTER)
|
||||
|
||||
#define UTF8_CATEGORY_ISUPPER \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_LETTER_UPPERCASE)
|
||||
|
||||
#define UTF8_CATEGORY_ISLOWER \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_LETTER_LOWERCASE)
|
||||
|
||||
#define UTF8_CATEGORY_ISDIGIT \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_NUMBER)
|
||||
|
||||
#define UTF8_CATEGORY_ISXDIGIT \
|
||||
(UTF8_CATEGORY_COMPATIBILITY | \
|
||||
UTF8_CATEGORY_NUMBER | UTF8_CATEGORY_PRIVATE_USE)
|
||||
|
||||
|
||||
|
||||
|
||||
#ifndef UTF8_WCHAR_SIZE
|
||||
#if (__SIZEOF_WCHAR_T__ == 4) || (WCHAR_MAX > UINT16_MAX) || (__WCHAR_MAX__ > UINT16_MAX)
|
||||
#define UTF8_WCHAR_SIZE (4)
|
||||
#else
|
||||
#define UTF8_WCHAR_SIZE (2)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if (UTF8_WCHAR_SIZE == 4)
|
||||
|
||||
#define UTF8_WCHAR_UTF32 (1)
|
||||
#elif (UTF8_WCHAR_SIZE == 2)
|
||||
|
||||
#define UTF8_WCHAR_UTF16 (1)
|
||||
#else
|
||||
#error Invalid size for wchar_t type.
|
||||
#endif
|
||||
|
||||
#ifndef UTF8_API
|
||||
#ifdef __cplusplus
|
||||
#define UTF8_API extern "C"
|
||||
#else
|
||||
#define UTF8_API
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
 * Fixed-width integer aliases declared by hand instead of via <stdint.h>.
 *
 * NOTE(review): these reuse the names that <stdint.h> defines. If any file
 * that includes this header also pulls in <stdint.h>, and the platform maps
 * one of these names to a different underlying type (e.g. uint64_t as
 * unsigned long), the declarations conflict -- confirm against the
 * includers. The names also presume int is 32 bits and short is 16 bits.
 */
typedef int int32_t;
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
|
||||
/* Aliases layered on the integer types declared above; the names suggest a
   UTF-16 code unit and a Unicode code point respectively. */
typedef uint16_t utf16_t;
|
||||
typedef uint32_t unicode_t;
|
||||
|
||||
#endif /* _UTF8REWIND_H_ */
|
||||
|
||||
|
||||
#ifndef _UTF8REWIND_INTERNAL_BASE_H_
|
||||
#define _UTF8REWIND_INTERNAL_BASE_H_
|
||||
|
||||
|
||||
|
||||
|
||||
#if defined(__GNUC__) && !defined(COMPILER_ICC)
|
||||
#define UTF8_UNUSED(_parameter) _parameter __attribute__ ((unused))
|
||||
#else
|
||||
#define UTF8_UNUSED(_parameter) _parameter
|
||||
#endif
|
||||
|
||||
/*
 * Stores the code UTF8_ERR_<_error> through the `errors` pointer when that
 * pointer is non-null. The macro reads a variable named `errors` from the
 * scope in which it is expanded.
 *
 * The body is wrapped in do { } while (0) so that `UTF8_SET_ERROR(X);` is a
 * single statement; a bare `if { }` followed by `;` would detach a
 * following `else` when used in an unbraced if/else.
 */
#define UTF8_SET_ERROR(_error) \
	do { if (errors != 0) { *errors = UTF8_ERR_ ## _error; } } while (0)
|
||||
|
||||
/* Validates input before transforming */
|
||||
/* Check for parameter overlap using the separating axis theorem */
|
||||
|
||||
#define UTF8_VALIDATE_PARAMETERS_CHAR(_inputType, _result) \
|
||||
if (input == 0) { \
|
||||
UTF8_SET_ERROR(INVALID_DATA); \
|
||||
return _result; \
|
||||
} \
|
||||
else if (inputSize < sizeof(_inputType)) { \
|
||||
if (target != 0) { \
|
||||
if (targetSize < 3) { \
|
||||
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
|
||||
return _result; \
|
||||
} \
|
||||
memcpy(target, REPLACEMENT_CHARACTER_STRING, REPLACEMENT_CHARACTER_STRING_LENGTH); \
|
||||
} \
|
||||
UTF8_SET_ERROR(INVALID_DATA); \
|
||||
return _result + REPLACEMENT_CHARACTER_STRING_LENGTH; \
|
||||
} \
|
||||
if (target != 0 && targetSize == 0) { \
|
||||
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
|
||||
return _result; \
|
||||
} \
|
||||
if ((char*)input == target) { \
|
||||
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
|
||||
return _result; \
|
||||
} \
|
||||
{ \
|
||||
char* input_center = (char*)input + (inputSize / 2); \
|
||||
char* target_center = target + (targetSize / 2); \
|
||||
size_t delta = (size_t)((input_center > target_center) ? (input_center - target_center) : (target_center - input_center)); \
|
||||
if (delta < (inputSize + targetSize) / 2) { \
|
||||
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
|
||||
return _result; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define UTF8_VALIDATE_PARAMETERS(_inputType, _outputType, _result) \
|
||||
if (input == 0) { \
|
||||
UTF8_SET_ERROR(INVALID_DATA); \
|
||||
return _result; \
|
||||
} \
|
||||
else if (inputSize < sizeof(_inputType)) { \
|
||||
if (target != 0) { \
|
||||
if (targetSize < sizeof(_outputType)) { \
|
||||
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
|
||||
return _result; \
|
||||
} \
|
||||
*target = REPLACEMENT_CHARACTER; \
|
||||
} \
|
||||
UTF8_SET_ERROR(INVALID_DATA); \
|
||||
return _result + sizeof(_outputType); \
|
||||
} \
|
||||
if (target != 0 && targetSize < sizeof(_outputType)) { \
|
||||
UTF8_SET_ERROR(NOT_ENOUGH_SPACE); \
|
||||
return _result; \
|
||||
} \
|
||||
if ((char*)input == (char*)target) { \
|
||||
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
|
||||
return _result; \
|
||||
} \
|
||||
{ \
|
||||
char* input_center = (char*)input + (inputSize / 2); \
|
||||
char* target_center = (char*)target + (targetSize / 2); \
|
||||
size_t delta = (size_t)((input_center > target_center) ? (input_center - target_center) : (target_center - input_center)); \
|
||||
if (delta < (inputSize + targetSize) / 2) { \
|
||||
UTF8_SET_ERROR(OVERLAPPING_PARAMETERS); \
|
||||
return _result; \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
|
||||
#endif /* _UTF8REWIND_INTERNAL_BASE_H_ */
|
||||
|
||||
#ifndef _UTF8REWIND_INTERNAL_CASEMAPPING_H_
|
||||
#define _UTF8REWIND_INTERNAL_CASEMAPPING_H_
|
||||
|
||||
|
||||
|
||||
|
||||
typedef struct {
|
||||
const char* src;
|
||||
char* dst;
|
||||
size_t src_size;
|
||||
size_t dst_size;
|
||||
size_t total_bytes_needed;
|
||||
unicode_t last_code_point;
|
||||
size_t locale;
|
||||
const uint32_t* property_index1;
|
||||
const uint32_t* property_index2;
|
||||
const uint32_t* property_data;
|
||||
uint32_t last_general_category;
|
||||
uint8_t last_code_point_size;
|
||||
uint8_t last_canonical_combining_class;
|
||||
uint8_t quickcheck_flags;
|
||||
} CaseMappingState;
|
||||
|
||||
uint8_t casemapping_initialize(
|
||||
CaseMappingState* state,
|
||||
const char* input, size_t inputSize,
|
||||
char* target, size_t targetSize,
|
||||
const uint32_t* propertyIndex1, const uint32_t* propertyIndex2, const uint32_t* propertyData,
|
||||
uint8_t quickCheck, size_t locale,
|
||||
int32_t* errors);
|
||||
|
||||
size_t casemapping_execute(CaseMappingState* state, int32_t* errors);
|
||||
|
||||
|
||||
|
||||
#endif /* _UTF8REWIND_INTERNAL_CASEMAPPING_H_ */
|
||||
#ifndef _UTF8REWIND_INTERNAL_CODEPOINT_H_
|
||||
#define _UTF8REWIND_INTERNAL_CODEPOINT_H_
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#define MAX_BASIC_LATIN 0x007F
|
||||
|
||||
|
||||
#define MAX_LATIN_1 0x00FF
|
||||
|
||||
|
||||
#define MAX_BASIC_MULTILINGUAL_PLANE 0xFFFF
|
||||
|
||||
|
||||
#define MAX_LEGAL_UNICODE 0x10FFFF
|
||||
|
||||
|
||||
#define REPLACEMENT_CHARACTER 0xFFFD
|
||||
|
||||
|
||||
#define REPLACEMENT_CHARACTER_STRING "\xEF\xBF\xBD"
|
||||
|
||||
|
||||
#define REPLACEMENT_CHARACTER_STRING_LENGTH 3
|
||||
|
||||
|
||||
#define SURROGATE_HIGH_START 0xD800
|
||||
|
||||
|
||||
#define SURROGATE_HIGH_END 0xDBFF
|
||||
|
||||
|
||||
#define SURROGATE_LOW_START 0xDC00
|
||||
|
||||
|
||||
#define SURROGATE_LOW_END 0xDFFF
|
||||
|
||||
|
||||
#define HANGUL_JAMO_FIRST 0x1100
|
||||
|
||||
|
||||
#define HANGUL_JAMO_LAST 0x11FF
|
||||
|
||||
|
||||
#define HANGUL_L_FIRST 0x1100
|
||||
|
||||
|
||||
#define HANGUL_L_LAST 0x1112
|
||||
|
||||
|
||||
#define HANGUL_L_COUNT 19
|
||||
|
||||
|
||||
#define HANGUL_V_FIRST 0x1161
|
||||
|
||||
|
||||
#define HANGUL_V_LAST 0x1175
|
||||
|
||||
|
||||
#define HANGUL_V_COUNT 21
|
||||
|
||||
|
||||
#define HANGUL_T_FIRST 0x11A7
|
||||
|
||||
|
||||
#define HANGUL_T_LAST 0x11C2
|
||||
|
||||
|
||||
#define HANGUL_T_COUNT 28
|
||||
|
||||
|
||||
#define HANGUL_N_COUNT 588 /* VCount * TCount */
|
||||
|
||||
|
||||
#define HANGUL_S_FIRST 0xAC00
|
||||
|
||||
|
||||
#define HANGUL_S_LAST 0xD7A3
|
||||
|
||||
|
||||
#define HANGUL_S_COUNT 11172 /* LCount * NCount */
|
||||
|
||||
#define CP_LATIN_CAPITAL_LETTER_I 0x0049
|
||||
#define CP_LATIN_CAPITAL_LETTER_J 0x004A
|
||||
#define CP_LATIN_SMALL_LETTER_I 0x0069
|
||||
#define CP_LATIN_SMALL_LETTER_J 0x006A
|
||||
#define CP_LATIN_CAPITAL_LETTER_I_WITH_GRAVE 0x00CC
|
||||
#define CP_LATIN_CAPITAL_LETTER_I_WITH_ACUTE 0x00CD
|
||||
#define CP_LATIN_CAPITAL_LETTER_I_WITH_TILDE 0x0128
|
||||
#define CP_LATIN_CAPITAL_LETTER_I_WITH_OGONEK 0x012E
|
||||
#define CP_LATIN_SMALL_LETTER_I_WITH_OGONEK 0x012F
|
||||
#define CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x0130
|
||||
#define CP_LATIN_SMALL_LETTER_DOTLESS_I 0x0131
|
||||
#define CP_COMBINING_GRAVE_ACCENT 0x0300
|
||||
#define CP_COMBINING_ACUTE_ACCENT 0x0301
|
||||
#define CP_COMBINING_TILDE_ACCENT 0x0303
|
||||
#define CP_COMBINING_DOT_ABOVE 0x0307
|
||||
#define CP_COMBINING_GREEK_YPOGEGRAMMENI 0x0345
|
||||
#define CP_COMBINING_GRAPHEME_JOINER 0x034F
|
||||
#define CP_GREEK_CAPITAL_LETTER_SIGMA 0x03A3
|
||||
|
||||
#define CCC_NOT_REORDERED 0
|
||||
#define CCC_OVERLAY 1
|
||||
#define CCC_NUKTA 7
|
||||
#define CCC_KANA_VOICING 8
|
||||
#define CCC_VIRAMA 9
|
||||
#define CCC_FIXED_POSITION_START 10
|
||||
#define CCC_FIXED_POSITION_END 199
|
||||
#define CCC_ATTACHED_BELOW_LEFT 200
|
||||
#define CCC_ATTACHED_BELOW 202
|
||||
#define CCC_ATTACHED_BOTTOM_RIGHT 204
|
||||
#define CCC_ATTACHED_LEFT 208
|
||||
#define CCC_ATTACHED_RIGHT 210
|
||||
#define CCC_ATTACHED_TOP_LEFT 212
|
||||
#define CCC_ATTACHED_ABOVE 214
|
||||
#define CCC_ATTACHED_ABOVE_RIGHT 216
|
||||
#define CCC_BELOW_LEFT 218
|
||||
#define CCC_BELOW 220
|
||||
#define CCC_BELOW_RIGHT 222
|
||||
#define CCC_LEFT 224
|
||||
#define CCC_RIGHT 226
|
||||
#define CCC_ABOVE_LEFT 228
|
||||
#define CCC_ABOVE 230
|
||||
#define CCC_ABOVE_RIGHT 232
|
||||
#define CCC_DOUBLE_BELOW 233
|
||||
#define CCC_DOUBLE_ABOVE 234
|
||||
#define CCC_IOTA_SUBSCRIPT 240
|
||||
#define CCC_INVALID 255
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* _UTF8REWIND_INTERNAL_CODEPOINT_H_ */
|
||||
|
||||
#ifndef _UTF8REWIND_INTERNAL_DATABASE_H_
|
||||
#define _UTF8REWIND_INTERNAL_DATABASE_H_
|
||||
|
||||
|
||||
|
||||
|
||||
typedef enum QuickCheckCaseMapped
|
||||
{
|
||||
QuickCheckCaseMapped_Uppercase = 0x01,
|
||||
QuickCheckCaseMapped_Lowercase = 0x02,
|
||||
QuickCheckCaseMapped_Titlecase = 0x04,
|
||||
QuickCheckCaseMapped_Casefolded = 0x08,
|
||||
} QuickCheckCaseMapped;
|
||||
|
||||
typedef enum QuickCheckResult
|
||||
{
|
||||
QuickCheckResult_Yes,
|
||||
QuickCheckResult_Maybe,
|
||||
QuickCheckResult_No,
|
||||
} QuickCheckResult;
|
||||
|
||||
#define PROPERTY_INDEX_SHIFT (5)
|
||||
|
||||
static const unicode_t PROPERTY_DATA_MASK = (1 << PROPERTY_INDEX_SHIFT) - 1;
|
||||
|
||||
#define PROPERTY_GET(_indexArray, _dataArray, _cp) \
|
||||
(_dataArray)[ \
|
||||
(_indexArray)[(_cp) >> PROPERTY_INDEX_SHIFT] + \
|
||||
((_cp) & PROPERTY_DATA_MASK)]
|
||||
|
||||
#define PROPERTY_GET_GC(_cp) \
|
||||
PROPERTY_GET(GeneralCategoryIndexPtr, GeneralCategoryDataPtr, _cp)
|
||||
|
||||
#define PROPERTY_GET_CCC(_cp) \
|
||||
PROPERTY_GET(CanonicalCombiningClassIndexPtr, CanonicalCombiningClassDataPtr, _cp)
|
||||
|
||||
#define PROPERTY_GET_CM(_cp) \
|
||||
PROPERTY_GET(QuickCheckCaseMappedIndexPtr, QuickCheckCaseMappedDataPtr, _cp)
|
||||
|
||||
#define PROPERTY_GET_NFC(_cp) \
|
||||
PROPERTY_GET(QuickCheckNFCIndexPtr, QuickCheckNFCDataPtr, _cp)
|
||||
|
||||
#define PROPERTY_GET_NFD(_cp) \
|
||||
PROPERTY_GET(QuickCheckNFDIndexPtr, QuickCheckNFDDataPtr, _cp)
|
||||
|
||||
#define PROPERTY_GET_NFKC(_cp) \
|
||||
PROPERTY_GET(QuickCheckNFKCIndexPtr, QuickCheckNFKCDataPtr, _cp)
|
||||
|
||||
#define PROPERTY_GET_NFKD(_cp) \
|
||||
PROPERTY_GET(QuickCheckNFKDIndexPtr, QuickCheckNFKDDataPtr, _cp)
|
||||
|
||||
|
||||
|
||||
#endif /* _UTF8REWIND_INTERNAL_DATABASE_H_ */
|
||||
|
||||
|
||||
#ifndef _UTF8REWIND_INTERNAL_STREAMING_H_
|
||||
#define _UTF8REWIND_INTERNAL_STREAMING_H_
|
||||
|
||||
|
||||
|
||||
|
||||
/*
|
||||
UAX15-D4. Stream-Safe Text Process
|
||||
|
||||
This is the process of producing a Unicode string in Stream-Safe Text Format by processing that string
|
||||
from start to finish, inserting U+034F COMBINING GRAPHEME JOINER (CGJ) within long sequences of
|
||||
non-starters. The exact position of the inserted CGJs are determined according to the following algorithm,
|
||||
which describes the generation of an output string from an input string:
|
||||
|
||||
* If the input string is empty, return an empty output string.
|
||||
* Set nonStarterCount to zero.
|
||||
* For each code point C in the input string:
|
||||
* Produce the NFKD decomposition S.
|
||||
* If nonStarterCount plus the number of initial non-starters in S is greater than 30, append a CGJ to
|
||||
the output string and set the nonStarterCount to zero.
|
||||
* Append C to the output string.
|
||||
* If there are no starters in S, increment nonStarterCount by the number of code points in S; otherwise,
|
||||
set nonStarterCount to the number of trailing non-starters in S (which may be zero).
|
||||
* Return the output string.
|
||||
*/
|
||||
|
||||
#define STREAM_SAFE_MAX 30
|
||||
#define STREAM_BUFFER_MAX 32
|
||||
|
||||
typedef struct {
|
||||
const char* src;
|
||||
size_t src_size;
|
||||
uint8_t index;
|
||||
uint8_t current;
|
||||
uint8_t filled;
|
||||
uint8_t stable;
|
||||
uint8_t last_length;
|
||||
unicode_t codepoint[STREAM_BUFFER_MAX];
|
||||
uint8_t quick_check[STREAM_BUFFER_MAX];
|
||||
uint8_t canonical_combining_class[STREAM_BUFFER_MAX];
|
||||
} StreamState;
|
||||
|
||||
|
||||
|
||||
|
||||
#endif /* _UTF8REWIND_INTERNAL_STREAMING_H_ */
|
||||
|
||||
#ifndef _UTF8REWIND_INTERNAL_COMPOSITION_H_
|
||||
#define _UTF8REWIND_INTERNAL_COMPOSITION_H_
|
||||
|
||||
|
||||
|
||||
|
||||
typedef struct {
|
||||
StreamState* input;
|
||||
StreamState* output;
|
||||
const size_t* qc_index;
|
||||
const uint8_t* qc_data;
|
||||
} ComposeState;
|
||||
|
||||
|
||||
|
||||
#endif /* _UTF8REWIND_INTERNAL_COMPOSITION_H_ */
|
||||
|
||||
#ifndef _UTF8REWIND_INTERNAL_DECOMPOSITION_H_
|
||||
#define _UTF8REWIND_INTERNAL_DECOMPOSITION_H_
|
||||
|
||||
|
||||
|
||||
|
||||
typedef struct {
|
||||
StreamState* input;
|
||||
StreamState* output;
|
||||
const size_t* qc_index;
|
||||
const uint8_t* qc_data;
|
||||
const uint32_t* property_index1;
|
||||
const uint32_t* property_index2;
|
||||
const uint32_t* property_data;
|
||||
unicode_t cache_codepoint[STREAM_BUFFER_MAX];
|
||||
uint8_t cache_canonical_combining_class[STREAM_BUFFER_MAX];
|
||||
uint8_t cache_current;
|
||||
uint8_t cache_filled;
|
||||
} DecomposeState;
|
||||
|
||||
|
||||
|
||||
#endif /* _UTF8REWIND_INTERNAL_DECOMPOSITION_H_ */
|
||||
|
||||
#ifndef _UTF8REWIND_UNICODEDATABASE_H_
|
||||
#define _UTF8REWIND_UNICODEDATABASE_H_
|
||||
|
||||
|
||||
|
||||
|
||||
typedef struct {
|
||||
unicode_t codepoint;
|
||||
uint32_t length_and_offset;
|
||||
} DecompositionRecord;
|
||||
|
||||
typedef struct {
|
||||
uint64_t key;
|
||||
unicode_t value;
|
||||
} CompositionRecord;
|
||||
|
||||
|
||||
|
||||
#endif /* _UTF8REWIND_UNICODEDATABASE_H_ */
|
||||
|
||||
|
||||
#endif /*U8_H*/
|
755
libdispatch/utf8proc.c
Normal file
755
libdispatch/utf8proc.c
Normal file
@ -0,0 +1,755 @@
|
||||
/* -*- mode: c; c-basic-offset: 2; tab-width: 2; indent-tabs-mode: nil -*- */
|
||||
/*
|
||||
* Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
|
||||
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
* This library contains derived data from a modified version of the
|
||||
* Unicode data files.
|
||||
*
|
||||
* The original data files are available at
|
||||
* http://www.unicode.org/Public/UNIDATA/
|
||||
*
|
||||
* Please notice the copyright statement in the file "utf8proc_data.c".
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* File name: utf8proc.c
|
||||
*
|
||||
* Description:
|
||||
* Implementation of libutf8proc.
|
||||
*/
|
||||
|
||||
|
||||
#include "utf8proc.h"
|
||||
#include "utf8proc_data.c"
|
||||
|
||||
|
||||
UTF8PROC_DLLEXPORT const utf8proc_int8_t utf8proc_utf8class[256] = {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
|
||||
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
|
||||
4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
#define UTF8PROC_HANGUL_SBASE 0xAC00
|
||||
#define UTF8PROC_HANGUL_LBASE 0x1100
|
||||
#define UTF8PROC_HANGUL_VBASE 0x1161
|
||||
#define UTF8PROC_HANGUL_TBASE 0x11A7
|
||||
#define UTF8PROC_HANGUL_LCOUNT 19
|
||||
#define UTF8PROC_HANGUL_VCOUNT 21
|
||||
#define UTF8PROC_HANGUL_TCOUNT 28
|
||||
#define UTF8PROC_HANGUL_NCOUNT 588
|
||||
#define UTF8PROC_HANGUL_SCOUNT 11172
|
||||
/* END is exclusive */
|
||||
#define UTF8PROC_HANGUL_L_START 0x1100
|
||||
#define UTF8PROC_HANGUL_L_END 0x115A
|
||||
#define UTF8PROC_HANGUL_L_FILLER 0x115F
|
||||
#define UTF8PROC_HANGUL_V_START 0x1160
|
||||
#define UTF8PROC_HANGUL_V_END 0x11A3
|
||||
#define UTF8PROC_HANGUL_T_START 0x11A8
|
||||
#define UTF8PROC_HANGUL_T_END 0x11FA
|
||||
#define UTF8PROC_HANGUL_S_START 0xAC00
|
||||
#define UTF8PROC_HANGUL_S_END 0xD7A4
|
||||
|
||||
/* Should follow semantic-versioning rules (semver.org) based on API
|
||||
compatibility. (Note that the shared-library version number will
|
||||
be different, being based on ABI compatibility.): */
|
||||
#define STRINGIZEx(x) #x
|
||||
#define STRINGIZE(x) STRINGIZEx(x)
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_version(void) {
|
||||
return STRINGIZE(UTF8PROC_VERSION_MAJOR) "." STRINGIZE(UTF8PROC_VERSION_MINOR) "." STRINGIZE(UTF8PROC_VERSION_PATCH) "";
|
||||
}
|
||||
|
||||
/*
 * Translate a negative utf8proc error code into a human-readable message.
 *
 * errcode: one of the UTF8PROC_ERROR_* constants.
 * Returns a pointer to a static string; unrecognised codes map to a generic
 * "unknown error" message.
 */
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode) {
  if (errcode == UTF8PROC_ERROR_NOMEM)
    return "Memory for processing UTF-8 data could not be allocated.";
  if (errcode == UTF8PROC_ERROR_OVERFLOW)
    return "UTF-8 string is too long to be processed.";
  if (errcode == UTF8PROC_ERROR_INVALIDUTF8)
    return "Invalid UTF-8 string";
  if (errcode == UTF8PROC_ERROR_NOTASSIGNED)
    return "Unassigned Unicode code point found in UTF-8 string.";
  if (errcode == UTF8PROC_ERROR_INVALIDOPTS)
    return "Invalid options for UTF-8 processing chosen.";

  return "An unknown error occurred while processing UTF-8 data.";
}
|
||||
|
||||
#define utf_cont(ch) (((ch) & 0xc0) == 0x80)
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
|
||||
) {
|
||||
utf8proc_uint32_t uc;
|
||||
const utf8proc_uint8_t *end;
|
||||
|
||||
*dst = -1;
|
||||
if (!strlen) return 0;
|
||||
end = str + ((strlen < 0) ? 4 : strlen);
|
||||
uc = *str++;
|
||||
if (uc < 0x80) {
|
||||
*dst = uc;
|
||||
return 1;
|
||||
}
|
||||
// Must be between 0xc2 and 0xf4 inclusive to be valid
|
||||
if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
if (uc < 0xe0) { // 2-byte sequence
|
||||
// Must have valid continuation character
|
||||
if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
*dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
|
||||
return 2;
|
||||
}
|
||||
if (uc < 0xf0) { // 3-byte sequence
|
||||
if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
|
||||
return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
// Check for surrogate chars
|
||||
if (uc == 0xed && *str > 0x9f)
|
||||
return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
|
||||
if (uc < 0x800)
|
||||
return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
*dst = uc;
|
||||
return 3;
|
||||
}
|
||||
// 4-byte sequence
|
||||
// Must have 3 valid continuation characters
|
||||
if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
|
||||
return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
// Make sure in correct range (0x10000 - 0x10ffff)
|
||||
if (uc == 0xf0) {
|
||||
if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
} else if (uc == 0xf4) {
|
||||
if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
|
||||
}
|
||||
*dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
|
||||
return 4;
|
||||
}
|
||||
|
||||
/*
 * Report whether `uc` is a valid Unicode scalar value: below 0x110000 and
 * outside the surrogate range 0xd800..0xdfff. Negative inputs become large
 * unsigned values and are rejected by the upper-bound test.
 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
  const utf8proc_uint32_t value = (utf8proc_uint32_t)uc;

  /* value - 0xd800 wraps for anything below 0xd800, so a single unsigned
     comparison excludes exactly the 0x800 surrogate code points. */
  return (value - 0xd800 > 0x07ff) && (value < 0x110000);
}
|
||||
|
||||
/*
 * Encode code point `uc` as UTF-8 into `dst`.
 *
 * dst must have room for up to 4 bytes; no terminator is written.
 * Returns the number of bytes stored (1..4), or 0 when `uc` is negative or
 * not below 0x110000.
 *
 * Note: 0xd800-0xdfff are still encoded (as three bytes) so as not to change
 * the API, although they are not valid in UTF-8.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
  if (uc < 0x00 || uc >= 0x110000) return 0;

  if (uc < 0x80) {
    dst[0] = (utf8proc_uint8_t) uc;
    return 1;
  }

  if (uc < 0x800) {
    dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
    dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 2;
  }

  if (uc < 0x10000) {
    dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
    dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
    dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
    return 3;
  }

  dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
  dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
  dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
  dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
  return 4;
}
|
||||
|
||||
/* internal "unsafe" version that does not check whether uc is in range */
|
||||
static utf8proc_ssize_t unsafe_encode_char(utf8proc_int32_t uc, utf8proc_uint8_t *dst) {
|
||||
if (uc < 0x00) {
|
||||
return 0;
|
||||
} else if (uc < 0x80) {
|
||||
dst[0] = (utf8proc_uint8_t)uc;
|
||||
return 1;
|
||||
} else if (uc < 0x800) {
|
||||
dst[0] = (utf8proc_uint8_t)(0xC0 + (uc >> 6));
|
||||
dst[1] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||
return 2;
|
||||
} else if (uc == 0xFFFF) {
|
||||
dst[0] = (utf8proc_uint8_t)0xFF;
|
||||
return 1;
|
||||
} else if (uc == 0xFFFE) {
|
||||
dst[0] = (utf8proc_uint8_t)0xFE;
|
||||
return 1;
|
||||
} else if (uc < 0x10000) {
|
||||
dst[0] = (utf8proc_uint8_t)(0xE0 + (uc >> 12));
|
||||
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
||||
dst[2] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||
return 3;
|
||||
} else if (uc < 0x110000) {
|
||||
dst[0] = (utf8proc_uint8_t)(0xF0 + (uc >> 18));
|
||||
dst[1] = (utf8proc_uint8_t)(0x80 + ((uc >> 12) & 0x3F));
|
||||
dst[2] = (utf8proc_uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
||||
dst[3] = (utf8proc_uint8_t)(0x80 + (uc & 0x3F));
|
||||
return 4;
|
||||
} else return 0;
|
||||
}
|
||||
|
||||
/* internal "unsafe" version that does not check whether uc is in range */
|
||||
static const utf8proc_property_t *unsafe_get_property(utf8proc_int32_t uc) {
|
||||
/* ASSERT: uc >= 0 && uc < 0x110000 */
|
||||
return utf8proc_properties + (
|
||||
utf8proc_stage2table[
|
||||
utf8proc_stage1table[uc >> 8] + (uc & 0xFF)
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
/*
 * Return the property record for code point `uc`.
 * Values outside 0..0x10FFFF yield the first entry of utf8proc_properties
 * instead of indexing the lookup tables out of range.
 */
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t uc) {
  if (uc < 0 || uc >= 0x110000) {
    return utf8proc_properties;
  }
  return unsafe_get_property(uc);
}
|
||||
|
||||
/* return whether there is a grapheme break between boundclasses lbc and tbc
|
||||
(according to the definition of extended grapheme clusters)
|
||||
|
||||
Rule numbering refers to TR29 Version 29 (Unicode 9.0.0):
|
||||
http://www.unicode.org/reports/tr29/tr29-29.html
|
||||
|
||||
CAVEATS:
|
||||
Please note that evaluation of GB10 (grapheme breaks between emoji zwj sequences)
|
||||
and GB 12/13 (regional indicator code points) require knowledge of previous characters
|
||||
and are thus not handled by this function. This may result in an incorrect break before
|
||||
an E_Modifier class codepoint and an incorrectly missing break between two
|
||||
REGIONAL_INDICATOR class code points if such support does not exist in the caller.
|
||||
|
||||
See the special support in grapheme_break_extended, for required bookkeeping by the caller.
|
||||
*/
|
||||
static utf8proc_bool grapheme_break_simple(int lbc, int tbc) {
|
||||
return
|
||||
(lbc == UTF8PROC_BOUNDCLASS_START) ? true : // GB1
|
||||
(lbc == UTF8PROC_BOUNDCLASS_CR && // GB3
|
||||
tbc == UTF8PROC_BOUNDCLASS_LF) ? false : // ---
|
||||
(lbc >= UTF8PROC_BOUNDCLASS_CR && lbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB4
|
||||
(tbc >= UTF8PROC_BOUNDCLASS_CR && tbc <= UTF8PROC_BOUNDCLASS_CONTROL) ? true : // GB5
|
||||
(lbc == UTF8PROC_BOUNDCLASS_L && // GB6
|
||||
(tbc == UTF8PROC_BOUNDCLASS_L || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_V || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_LV || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_LVT)) ? false : // ---
|
||||
((lbc == UTF8PROC_BOUNDCLASS_LV || // GB7
|
||||
lbc == UTF8PROC_BOUNDCLASS_V) && // ---
|
||||
(tbc == UTF8PROC_BOUNDCLASS_V || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_T)) ? false : // ---
|
||||
((lbc == UTF8PROC_BOUNDCLASS_LVT || // GB8
|
||||
lbc == UTF8PROC_BOUNDCLASS_T) && // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_T) ? false : // ---
|
||||
(tbc == UTF8PROC_BOUNDCLASS_EXTEND || // GB9
|
||||
tbc == UTF8PROC_BOUNDCLASS_ZWJ || // ---
|
||||
tbc == UTF8PROC_BOUNDCLASS_SPACINGMARK || // GB9a
|
||||
lbc == UTF8PROC_BOUNDCLASS_PREPEND) ? false : // GB9b
|
||||
((lbc == UTF8PROC_BOUNDCLASS_E_BASE || // GB10 (requires additional handling below)
|
||||
lbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) && // ----
|
||||
tbc == UTF8PROC_BOUNDCLASS_E_MODIFIER) ? false : // ----
|
||||
(lbc == UTF8PROC_BOUNDCLASS_ZWJ && // GB11
|
||||
(tbc == UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ || // ----
|
||||
tbc == UTF8PROC_BOUNDCLASS_E_BASE_GAZ)) ? false : // ----
|
||||
(lbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR && // GB12/13 (requires additional handling below)
|
||||
tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) ? false : // ----
|
||||
true; // GB999
|
||||
}
|
||||
|
||||
/*
 * Stateful wrapper around grapheme_break_simple.
 *
 * lbc, tbc: bound classes of the left and right code points.
 * state:    optional carry-over between calls. When non-NULL and not equal
 *           to UTF8PROC_BOUNDCLASS_START, *state replaces `lbc` for the
 *           decision; it is then updated for the next call. When NULL the
 *           result is exactly grapheme_break_simple(lbc, tbc).
 *
 * Returns true when a break is permitted between the two code points.
 */
static utf8proc_bool grapheme_break_extended(int lbc, int tbc, utf8proc_int32_t *state)
{
  utf8proc_bool may_break;

  if (!state)
    return grapheme_break_simple(lbc, tbc);

  may_break = grapheme_break_simple(
    (*state != UTF8PROC_BOUNDCLASS_START) ? *state : lbc, tbc);

  if (*state == tbc && tbc == UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR) {
    /* GB12/13 via GB999: after two RI code points a break must follow, so
       remember the second RI as OTHER. The break is then forced after it
       unless another rule (such as GB9) forbids it. */
    *state = UTF8PROC_BOUNDCLASS_OTHER;
  } else if ((*state == UTF8PROC_BOUNDCLASS_E_BASE ||
              *state == UTF8PROC_BOUNDCLASS_E_BASE_GAZ) &&
             tbc == UTF8PROC_BOUNDCLASS_EXTEND) {
    /* GB10: fold EXTEND code points into a preceding emoji base class. */
    *state = UTF8PROC_BOUNDCLASS_E_BASE;
  } else {
    *state = tbc;
  }

  return may_break;
}
|
||||
|
||||
/*
 * Report whether a grapheme break is permitted between code points `c1`
 * and `c2`. `state` is the optional carry-over passed to
 * grapheme_break_extended; it may be NULL.
 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
    utf8proc_int32_t c1, utf8proc_int32_t c2, utf8proc_int32_t *state) {
  const int left_class = utf8proc_get_property(c1)->boundclass;
  const int right_class = utf8proc_get_property(c2)->boundclass;

  return grapheme_break_extended(left_class, right_class, state);
}
|
||||
|
||||
|
||||
/*
 * Stateless form of utf8proc_grapheme_break_stateful: decides from the two
 * code points alone, with no carry-over between calls.
 */
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
    utf8proc_int32_t c1, utf8proc_int32_t c2) {
  utf8proc_int32_t *no_state = NULL;

  return utf8proc_grapheme_break_stateful(c1, c2, no_state);
}
|
||||
|
||||
/*
 * Decode one code point from the 16-bit sequence table at *entry.
 *
 * A unit whose top five bits match 0xD800 starts a two-unit pair: *entry is
 * advanced to the second unit and the two 10-bit payloads are combined and
 * offset by 0x10000. Otherwise the unit is the code point and *entry is left
 * unchanged. In both cases *entry ends on the last unit consumed.
 */
static utf8proc_int32_t seqindex_decode_entry(const utf8proc_uint16_t **entry)
{
  const utf8proc_uint16_t *units = *entry;
  utf8proc_int32_t cp = units[0];

  if ((cp & 0xF800) != 0xD800)
    return cp;

  *entry = units + 1;
  cp = ((cp & 0x03FF) << 10) | (units[1] & 0x03FF);
  return cp + 0x10000;
}
|
||||
|
||||
static utf8proc_int32_t seqindex_decode_index(const utf8proc_uint32_t seqindex)
|
||||
{
|
||||
const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex];
|
||||
return seqindex_decode_entry(&entry);
|
||||
}
|
||||
|
||||
/*
 * Expand the code-point sequence referenced by `seqindex`, decomposing each
 * member with utf8proc_decompose_char and appending the results to `dst`.
 *
 * seqindex: the low 13 bits are the offset into utf8proc_sequences; the top
 *           3 bits hold (number of code points - 1). A value of 7 in that
 *           field means the count is instead stored in the first table
 *           entry.
 * dst, bufsize: output buffer and its capacity in code points. Once the
 *           buffer is full the remaining capacity passed down is 0, so
 *           nothing further is stored but counting continues.
 * options, last_boundclass: forwarded unchanged to utf8proc_decompose_char.
 *
 * Returns the total number of code points produced, or a negative
 * UTF8PROC_ERROR_* code. An error from utf8proc_decompose_char is returned
 * as-is; previously it was added into the running count, where it was either
 * masked or reported as UTF8PROC_ERROR_OVERFLOW.
 */
static utf8proc_ssize_t seqindex_write_char_decomposed(utf8proc_uint16_t seqindex, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
  utf8proc_ssize_t written = 0;
  const utf8proc_uint16_t *entry = &utf8proc_sequences[seqindex & 0x1FFF];
  int len = seqindex >> 13;

  if (len >= 7) {
    len = *entry;
    entry++;
  }

  /* `len` is count-1, hence the inclusive loop bound. */
  for (; len >= 0; entry++, len--) {
    utf8proc_int32_t entry_cp = seqindex_decode_entry(&entry);
    utf8proc_ssize_t produced = utf8proc_decompose_char(entry_cp, dst + written,
      (bufsize > written) ? (bufsize - written) : 0, options,
      last_boundclass);

    /* Propagate the callee's own error code unchanged. */
    if (produced < 0) return produced;

    written += produced;
    if (written < 0) return UTF8PROC_ERROR_OVERFLOW;
  }
  return written;
}
|
||||
|
||||
/*
 * Return the lower-case mapping of code point `c`, or `c` itself when its
 * property record has no mapping (lowercase_seqindex == UINT16_MAX).
 */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
{
  const utf8proc_int32_t mapping = utf8proc_get_property(c)->lowercase_seqindex;

  if (mapping == UINT16_MAX)
    return c;
  return seqindex_decode_index(mapping);
}
|
||||
|
||||
/*
 * Return the upper-case mapping of code point `c`, or `c` itself when its
 * property record has no mapping (uppercase_seqindex == UINT16_MAX).
 */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
{
  const utf8proc_int32_t mapping = utf8proc_get_property(c)->uppercase_seqindex;

  if (mapping == UINT16_MAX)
    return c;
  return seqindex_decode_index(mapping);
}
|
||||
|
||||
/*
 * Return the title-case mapping of code point `c`, or `c` itself when its
 * property record has no mapping (titlecase_seqindex == UINT16_MAX).
 */
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c)
{
  const utf8proc_int32_t mapping = utf8proc_get_property(c)->titlecase_seqindex;

  if (mapping == UINT16_MAX)
    return c;
  return seqindex_decode_index(mapping);
}
|
||||
|
||||
/* return a character width analogous to wcwidth (except portable and
|
||||
hopefully less buggy than most system wcwidth functions). */
|
||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
|
||||
return utf8proc_get_property(c)->charwidth;
|
||||
}
|
||||
|
||||
/* Return the general category stored in the property record of `c`. */
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t c) {
  const utf8proc_property_t *record = utf8proc_get_property(c);

  return record->category;
}
|
||||
|
||||
/*
 * Return the two-letter abbreviation of the general category of `c`.
 * The table is indexed directly by the value utf8proc_category returns.
 */
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t c) {
  static const char abbreviations[][3] = {
    "Cn",
    "Lu", "Ll", "Lt", "Lm", "Lo",
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co"
  };

  return abbreviations[utf8proc_category(c)];
}
|
||||
|
||||
#define utf8proc_decompose_lump(replacement_uc) \
|
||||
return utf8proc_decompose_char((replacement_uc), dst, bufsize, \
|
||||
options & ~UTF8PROC_LUMP, last_boundclass)
|
||||
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(utf8proc_int32_t uc, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize, utf8proc_option_t options, int *last_boundclass) {
|
||||
const utf8proc_property_t *property;
|
||||
utf8proc_propval_t category;
|
||||
utf8proc_int32_t hangul_sindex;
|
||||
if (uc < 0 || uc >= 0x110000) return UTF8PROC_ERROR_NOTASSIGNED;
|
||||
property = unsafe_get_property(uc);
|
||||
category = property->category;
|
||||
hangul_sindex = uc - UTF8PROC_HANGUL_SBASE;
|
||||
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
||||
if (hangul_sindex >= 0 && hangul_sindex < UTF8PROC_HANGUL_SCOUNT) {
|
||||
utf8proc_int32_t hangul_tindex;
|
||||
if (bufsize >= 1) {
|
||||
dst[0] = UTF8PROC_HANGUL_LBASE +
|
||||
hangul_sindex / UTF8PROC_HANGUL_NCOUNT;
|
||||
if (bufsize >= 2) dst[1] = UTF8PROC_HANGUL_VBASE +
|
||||
(hangul_sindex % UTF8PROC_HANGUL_NCOUNT) / UTF8PROC_HANGUL_TCOUNT;
|
||||
}
|
||||
hangul_tindex = hangul_sindex % UTF8PROC_HANGUL_TCOUNT;
|
||||
if (!hangul_tindex) return 2;
|
||||
if (bufsize >= 3) dst[2] = UTF8PROC_HANGUL_TBASE + hangul_tindex;
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
if (options & UTF8PROC_REJECTNA) {
|
||||
if (!category) return UTF8PROC_ERROR_NOTASSIGNED;
|
||||
}
|
||||
if (options & UTF8PROC_IGNORE) {
|
||||
if (property->ignorable) return 0;
|
||||
}
|
||||
if (options & UTF8PROC_LUMP) {
|
||||
if (category == UTF8PROC_CATEGORY_ZS) utf8proc_decompose_lump(0x0020);
|
||||
if (uc == 0x2018 || uc == 0x2019 || uc == 0x02BC || uc == 0x02C8)
|
||||
utf8proc_decompose_lump(0x0027);
|
||||
if (category == UTF8PROC_CATEGORY_PD || uc == 0x2212)
|
||||
utf8proc_decompose_lump(0x002D);
|
||||
if (uc == 0x2044 || uc == 0x2215) utf8proc_decompose_lump(0x002F);
|
||||
if (uc == 0x2236) utf8proc_decompose_lump(0x003A);
|
||||
if (uc == 0x2039 || uc == 0x2329 || uc == 0x3008)
|
||||
utf8proc_decompose_lump(0x003C);
|
||||
if (uc == 0x203A || uc == 0x232A || uc == 0x3009)
|
||||
utf8proc_decompose_lump(0x003E);
|
||||
if (uc == 0x2216) utf8proc_decompose_lump(0x005C);
|
||||
if (uc == 0x02C4 || uc == 0x02C6 || uc == 0x2038 || uc == 0x2303)
|
||||
utf8proc_decompose_lump(0x005E);
|
||||
if (category == UTF8PROC_CATEGORY_PC || uc == 0x02CD)
|
||||
utf8proc_decompose_lump(0x005F);
|
||||
if (uc == 0x02CB) utf8proc_decompose_lump(0x0060);
|
||||
if (uc == 0x2223) utf8proc_decompose_lump(0x007C);
|
||||
if (uc == 0x223C) utf8proc_decompose_lump(0x007E);
|
||||
if ((options & UTF8PROC_NLF2LS) && (options & UTF8PROC_NLF2PS)) {
|
||||
if (category == UTF8PROC_CATEGORY_ZL ||
|
||||
category == UTF8PROC_CATEGORY_ZP)
|
||||
utf8proc_decompose_lump(0x000A);
|
||||
}
|
||||
}
|
||||
if (options & UTF8PROC_STRIPMARK) {
|
||||
if (category == UTF8PROC_CATEGORY_MN ||
|
||||
category == UTF8PROC_CATEGORY_MC ||
|
||||
category == UTF8PROC_CATEGORY_ME) return 0;
|
||||
}
|
||||
if (options & UTF8PROC_CASEFOLD) {
|
||||
if (property->casefold_seqindex != UINT16_MAX) {
|
||||
return seqindex_write_char_decomposed(property->casefold_seqindex, dst, bufsize, options, last_boundclass);
|
||||
}
|
||||
}
|
||||
if (options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) {
|
||||
if (property->decomp_seqindex != UINT16_MAX &&
|
||||
(!property->decomp_type || (options & UTF8PROC_COMPAT))) {
|
||||
return seqindex_write_char_decomposed(property->decomp_seqindex, dst, bufsize, options, last_boundclass);
|
||||
}
|
||||
}
|
||||
if (options & UTF8PROC_CHARBOUND) {
|
||||
utf8proc_bool boundary;
|
||||
int tbc = property->boundclass;
|
||||
boundary = grapheme_break_extended(*last_boundclass, tbc, last_boundclass);
|
||||
if (boundary) {
|
||||
if (bufsize >= 1) dst[0] = 0xFFFF;
|
||||
if (bufsize >= 2) dst[1] = uc;
|
||||
return 2;
|
||||
}
|
||||
}
|
||||
if (bufsize >= 1) *dst = uc;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
 * Decompose a UTF-8 string into UTF-32 codepoints with no user-supplied
 * mapping.  This simply forwards to utf8proc_decompose_custom with a NULL
 * custom function and NULL custom data, and returns whatever it returns.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
) {
  utf8proc_ssize_t outcome;

  outcome = utf8proc_decompose_custom(str, strlen, buffer, bufsize, options,
                                      NULL, NULL);
  return outcome;
}
|
||||
|
||||
/*
 * Decode the UTF-8 string `str`, run every codepoint through the optional
 * `custom_func` mapping and then through utf8proc_decompose_char, and store
 * the resulting UTF-32 codepoints in `buffer` (capacity `bufsize` entries).
 *
 * `strlen` is ignored when UTF8PROC_NULLTERM is set in `options`; the input
 * is then consumed up to, but not including, the first NUL codepoint.
 *
 * Returns the number of codepoints the complete result needs.  That count may
 * be larger than `bufsize`, so a caller can pass buffer == NULL and
 * bufsize == 0 to measure the output first.  When COMPOSE or DECOMPOSE is
 * requested and the result fits, adjacent codepoints are additionally
 * reordered by combining class.
 *
 * On failure a negative UTF8PROC_ERROR_* code is returned: INVALIDOPTS for a
 * contradictory option set, INVALIDUTF8 for malformed input, OVERFLOW when a
 * position counter would overflow, or the error reported by
 * utf8proc_decompose_char.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
  utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
) {
  /* strlen will be ignored, if UTF8PROC_NULLTERM is set in options */
  utf8proc_ssize_t wpos = 0;
  /* COMPOSE and DECOMPOSE are mutually exclusive */
  if ((options & UTF8PROC_COMPOSE) && (options & UTF8PROC_DECOMPOSE))
    return UTF8PROC_ERROR_INVALIDOPTS;
  /* STRIPMARK only makes sense together with one of them */
  if ((options & UTF8PROC_STRIPMARK) &&
      !(options & UTF8PROC_COMPOSE) && !(options & UTF8PROC_DECOMPOSE))
    return UTF8PROC_ERROR_INVALIDOPTS;
  {
    utf8proc_int32_t uc;
    utf8proc_ssize_t rpos = 0;
    utf8proc_ssize_t decomp_result;
    utf8proc_ssize_t room;
    utf8proc_int32_t *dst;
    int boundclass = UTF8PROC_BOUNDCLASS_START;
    while (1) {
      if (options & UTF8PROC_NULLTERM) {
        rpos += utf8proc_iterate(str + rpos, -1, &uc);
        /* checking of return value is not necessary,
           as 'uc' is < 0 in case of error */
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
        if (rpos < 0) return UTF8PROC_ERROR_OVERFLOW;
        if (uc == 0) break;
      } else {
        if (rpos >= strlen) break;
        rpos += utf8proc_iterate(str + rpos, strlen - rpos, &uc);
        if (uc < 0) return UTF8PROC_ERROR_INVALIDUTF8;
      }
      if (custom_func != NULL) {
        uc = custom_func(uc, custom_data);   /* user-specified custom mapping */
      }
      /* Only form `buffer + wpos` when it points inside the buffer.  During a
         sizing pass (buffer == NULL) or once the output has outgrown the
         buffer, that addition would be undefined pointer arithmetic; the
         callee is told it has no room, so it gets NULL instead. */
      room = (bufsize > wpos) ? (bufsize - wpos) : 0;
      dst = (buffer != NULL && room > 0) ? buffer + wpos : NULL;
      decomp_result = utf8proc_decompose_char(uc, dst, room, options,
                                              &boundclass);
      if (decomp_result < 0) return decomp_result;
      wpos += decomp_result;
      /* prohibiting integer overflows due to too long strings: */
      if (wpos < 0 ||
          wpos > (utf8proc_ssize_t)(SSIZE_MAX/sizeof(utf8proc_int32_t)/2))
        return UTF8PROC_ERROR_OVERFLOW;
    }
  }
  /* Reorder only when everything was actually written (bufsize >= wpos). */
  if ((options & (UTF8PROC_COMPOSE|UTF8PROC_DECOMPOSE)) && bufsize >= wpos) {
    utf8proc_ssize_t pos = 0;
    while (pos < wpos-1) {
      utf8proc_int32_t uc1, uc2;
      const utf8proc_property_t *property1, *property2;
      uc1 = buffer[pos];
      uc2 = buffer[pos+1];
      property1 = unsafe_get_property(uc1);
      property2 = unsafe_get_property(uc2);
      /* Swap a pair that is out of combining-class order; a class of 0 on
         the second codepoint blocks the swap. */
      if (property1->combining_class > property2->combining_class &&
          property2->combining_class > 0) {
        buffer[pos] = uc2;
        buffer[pos+1] = uc1;
        /* step back so the moved codepoint is compared with its new
           predecessor */
        if (pos > 0) pos--; else pos++;
      } else {
        pos++;
      }
    }
  }
  return wpos;
}
|
||||
|
||||
/*
 * Normalize `length` UTF-32 codepoints in `buffer` in place and return the new
 * length.  Two independent passes run, depending on `options`:
 *
 *  1. With any of NLF2LS / NLF2PS / STRIPCC: newline sequences are rewritten
 *     and, with STRIPCC, control characters are dropped (TAB becomes a space).
 *  2. With COMPOSE: each codepoint is combined with the preceding starter
 *     where possible.
 *
 * UTF8PROC_NULLTERM is ignored; `length` is always honoured.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
  /* ---- pass 1: newline conversion / control-character stripping ---- */
  if (options & (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS | UTF8PROC_STRIPCC)) {
    utf8proc_ssize_t src;
    utf8proc_ssize_t dst = 0;
    for (src = 0; src < length; src++) {
      utf8proc_int32_t cp = buffer[src];
      int is_newline;
      /* CR LF counts as a single newline: skip the LF, keep cp == CR */
      if (cp == 0x000D && src < length-1 && buffer[src+1] == 0x000A) src++;
      is_newline = (cp == 0x000A || cp == 0x000D || cp == 0x0085 ||
                    ((options & UTF8PROC_STRIPCC) &&
                     (cp == 0x000B || cp == 0x000C)));
      if (is_newline) {
        utf8proc_int32_t substitute;
        if (options & UTF8PROC_NLF2LS) {
          /* LS and PS together mean "convert to LF" */
          substitute = (options & UTF8PROC_NLF2PS) ? 0x000A : 0x2028;
        } else {
          substitute = (options & UTF8PROC_NLF2PS) ? 0x2029 : 0x0020;
        }
        buffer[dst++] = substitute;
      } else if ((options & UTF8PROC_STRIPCC) &&
                 (cp < 0x0020 || (cp >= 0x007F && cp < 0x00A0))) {
        /* control character: TAB turns into a space, the rest vanish */
        if (cp == 0x0009) buffer[dst++] = 0x0020;
      } else {
        buffer[dst++] = cp;
      }
    }
    length = dst;
  }

  /* ---- pass 2: composition ---- */
  if (options & UTF8PROC_COMPOSE) {
    utf8proc_int32_t *starter = NULL;
    const utf8proc_property_t *starter_property = NULL;
    utf8proc_propval_t max_combining_class = -1;
    utf8proc_ssize_t src;
    utf8proc_ssize_t dst = 0;
    for (src = 0; src < length; src++) {
      utf8proc_int32_t cur = buffer[src];
      const utf8proc_property_t *cur_property = unsafe_get_property(cur);
      if (starter && cur_property->combining_class > max_combining_class) {
        /* combination perhaps possible */
        utf8proc_int32_t l_index = *starter - UTF8PROC_HANGUL_LBASE;
        utf8proc_int32_t s_index;
        /* Hangul: starter in the L range followed by a V-range codepoint */
        if (l_index >= 0 && l_index < UTF8PROC_HANGUL_LCOUNT) {
          utf8proc_int32_t v_index = cur - UTF8PROC_HANGUL_VBASE;
          if (v_index >= 0 && v_index < UTF8PROC_HANGUL_VCOUNT) {
            *starter = UTF8PROC_HANGUL_SBASE +
              (l_index * UTF8PROC_HANGUL_VCOUNT + v_index) *
              UTF8PROC_HANGUL_TCOUNT;
            starter_property = NULL;
            continue;
          }
        }
        /* Hangul: starter is an S-range syllable on a TCOUNT boundary,
           followed by a T-range codepoint */
        s_index = *starter - UTF8PROC_HANGUL_SBASE;
        if (s_index >= 0 && s_index < UTF8PROC_HANGUL_SCOUNT &&
            (s_index % UTF8PROC_HANGUL_TCOUNT) == 0) {
          utf8proc_int32_t t_index = cur - UTF8PROC_HANGUL_TBASE;
          if (t_index >= 0 && t_index < UTF8PROC_HANGUL_TCOUNT) {
            *starter += t_index;
            starter_property = NULL;
            continue;
          }
        }
        /* table-driven composition; the starter's properties are fetched
           lazily and cached until the starter changes */
        if (!starter_property) {
          starter_property = unsafe_get_property(*starter);
        }
        if (starter_property->comb_index < 0x8000 &&
            cur_property->comb_index != UINT16_MAX &&
            cur_property->comb_index >= 0x8000) {
          int sidx = starter_property->comb_index;
          int idx = (cur_property->comb_index & 0x3FFF) - utf8proc_combinations[sidx];
          if (idx >= 0 && idx <= utf8proc_combinations[sidx + 1]) {
            utf8proc_int32_t composition;
            idx += sidx + 2;
            if (cur_property->comb_index & 0x4000) {
              /* result is split over two 16-bit table entries */
              composition = (utf8proc_combinations[idx] << 16) | utf8proc_combinations[idx+1];
            } else {
              composition = utf8proc_combinations[idx];
            }
            if (composition > 0 && (!(options & UTF8PROC_STABLE) ||
                !(unsafe_get_property(composition)->comp_exclusion))) {
              *starter = composition;
              starter_property = NULL;
              continue;
            }
          }
        }
      }
      /* no composition happened: keep the codepoint */
      buffer[dst] = cur;
      if (cur_property->combining_class == 0) {
        /* combining class 0: this codepoint becomes the new starter */
        starter = buffer + dst;
        starter_property = NULL;
        max_combining_class = -1;
      } else if (cur_property->combining_class > max_combining_class) {
        max_combining_class = cur_property->combining_class;
      }
      dst++;
    }
    length = dst;
  }
  return length;
}
|
||||
|
||||
/*
 * Normalize the UTF-32 codepoints in `buffer` (see utf8proc_normalize_utf32)
 * and then overwrite the same memory with their UTF-8 encoding, followed by a
 * terminating 0 byte.  Returns the number of UTF-8 bytes written (excluding
 * the terminator), or the negative error code from normalization.
 *
 * UTF8PROC_NULLTERM option will be ignored, 'length' is never ignored
 * ASSERT: 'buffer' has one spare byte of free space at the end!
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options) {
  utf8proc_uint8_t *bytes = (utf8proc_uint8_t *)buffer;
  utf8proc_ssize_t nbytes = 0;
  utf8proc_ssize_t i;

  length = utf8proc_normalize_utf32(buffer, length, options);
  if (length < 0) return length;

  for (i = 0; i < length; i++) {
    utf8proc_int32_t uc = buffer[i];
    if (options & UTF8PROC_CHARBOUND) {
      /* CHARBOUND output goes through the unchecked encoder */
      nbytes += unsafe_encode_char(uc, bytes + nbytes);
    } else {
      nbytes += utf8proc_encode_char(uc, bytes + nbytes);
    }
  }
  bytes[nbytes] = 0;
  return nbytes;
}
|
||||
|
||||
/*
 * Map a UTF-8 string to a newly allocated UTF-8 string according to
 * `options`, without a user-supplied mapping.  Forwards to
 * utf8proc_map_custom with a NULL custom function and NULL custom data.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
) {
  utf8proc_ssize_t outcome;

  outcome = utf8proc_map_custom(str, strlen, dstptr, options, NULL, NULL);
  return outcome;
}
|
||||
|
||||
/*
 * Map the UTF-8 string `str` to a newly malloc'ed, 0-terminated UTF-8 string.
 *
 * The input is decomposed twice: once with no buffer to learn the required
 * size, then again into the allocated buffer.  The codepoints are then
 * re-encoded as UTF-8 in place and the allocation is shrunk to fit.
 *
 * On success `*dstptr` receives the result and the byte length (without the
 * terminator) is returned.  On failure `*dstptr` is NULL and a negative
 * UTF8PROC_ERROR_* code is returned.
 */
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
  const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
  utf8proc_custom_func custom_func, void *custom_data
) {
  utf8proc_int32_t *work;
  utf8proc_int32_t *shrunk;
  utf8proc_ssize_t count;

  *dstptr = NULL;

  /* sizing pass: nothing is written, only the codepoint count comes back */
  count = utf8proc_decompose_custom(str, strlen, NULL, 0, options, custom_func, custom_data);
  if (count < 0) return count;

  /* one extra byte for the terminator that utf8proc_reencode appends */
  work = (utf8proc_int32_t *) malloc(count * sizeof(utf8proc_int32_t) + 1);
  if (!work) return UTF8PROC_ERROR_NOMEM;

  count = utf8proc_decompose_custom(str, strlen, work, count, options, custom_func, custom_data);
  if (count >= 0) {
    count = utf8proc_reencode(work, count, options);
  }
  if (count < 0) {
    free(work);
    return count;
  }

  /* give back the unused tail; on realloc failure keep the original block */
  shrunk = (utf8proc_int32_t *) realloc(work, (size_t)count+1);
  if (shrunk) work = shrunk;

  *dstptr = (utf8proc_uint8_t *)work;
  return count;
}
|
||||
|
||||
/*
 * Return a newly allocated copy of the NUL-terminated UTF-8 string `str`,
 * mapped with the STABLE and DECOMPOSE options.  The result is NULL if
 * utf8proc_map fails; otherwise the caller releases it with free().
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str) {
  const utf8proc_option_t nfd_options = (utf8proc_option_t)
    (UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE);
  utf8proc_uint8_t *normalized;

  /* the length argument is unused because NULLTERM is set */
  utf8proc_map(str, 0, &normalized, nfd_options);
  return normalized;
}
|
||||
|
||||
/*
 * Return a newly allocated copy of the NUL-terminated UTF-8 string `str`,
 * mapped with the STABLE and COMPOSE options.  The result is NULL if
 * utf8proc_map fails; otherwise the caller releases it with free().
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str) {
  const utf8proc_option_t nfc_options = (utf8proc_option_t)
    (UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE);
  utf8proc_uint8_t *normalized;

  /* the length argument is unused because NULLTERM is set */
  utf8proc_map(str, 0, &normalized, nfc_options);
  return normalized;
}
|
||||
|
||||
/*
 * Return a newly allocated copy of the NUL-terminated UTF-8 string `str`,
 * mapped with the STABLE, DECOMPOSE and COMPAT options.  The result is NULL
 * if utf8proc_map fails; otherwise the caller releases it with free().
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str) {
  const utf8proc_option_t nfkd_options = (utf8proc_option_t)
    (UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT);
  utf8proc_uint8_t *normalized;

  /* the length argument is unused because NULLTERM is set */
  utf8proc_map(str, 0, &normalized, nfkd_options);
  return normalized;
}
|
||||
|
||||
/*
 * Return a newly allocated copy of the NUL-terminated UTF-8 string `str`,
 * mapped with the STABLE, COMPOSE and COMPAT options.  The result is NULL if
 * utf8proc_map fails; otherwise the caller releases it with free().
 */
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
  const utf8proc_option_t nfkc_options = (utf8proc_option_t)
    (UTF8PROC_NULLTERM | UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
  utf8proc_uint8_t *normalized;

  /* the length argument is unused because NULLTERM is set */
  utf8proc_map(str, 0, &normalized, nfkc_options);
  return normalized;
}
|
699
libdispatch/utf8proc.h
Normal file
699
libdispatch/utf8proc.h
Normal file
@ -0,0 +1,699 @@
|
||||
/*
|
||||
* Copyright (c) 2015 Steven G. Johnson, Jiahao Chen, Peter Colberg, Tony Kelman, Scott P. Jones, and other contributors.
|
||||
* Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/**
|
||||
* @mainpage
|
||||
*
|
||||
* utf8proc is a free/open-source (MIT/expat licensed) C library
|
||||
* providing Unicode normalization, case-folding, and other operations
|
||||
* for strings in the UTF-8 encoding, supporting Unicode version
|
||||
* 8.0.0. See the utf8proc home page (http://julialang.org/utf8proc/)
|
||||
* for downloads and other information, or the source code on github
|
||||
* (https://github.com/JuliaLang/utf8proc).
|
||||
*
|
||||
* For the utf8proc API documentation, see: @ref utf8proc.h
|
||||
*
|
||||
* The features of utf8proc include:
|
||||
*
|
||||
* - Transformation of strings (@ref utf8proc_map) to:
|
||||
* - decompose (@ref UTF8PROC_DECOMPOSE) or compose (@ref UTF8PROC_COMPOSE) Unicode combining characters (http://en.wikipedia.org/wiki/Combining_character)
|
||||
* - canonicalize Unicode compatibility characters (@ref UTF8PROC_COMPAT)
|
||||
* - strip "ignorable" (@ref UTF8PROC_IGNORE) characters, control characters (@ref UTF8PROC_STRIPCC), or combining characters such as accents (@ref UTF8PROC_STRIPMARK)
|
||||
* - case-folding (@ref UTF8PROC_CASEFOLD)
|
||||
* - Unicode normalization: @ref utf8proc_NFD, @ref utf8proc_NFC, @ref utf8proc_NFKD, @ref utf8proc_NFKC
|
||||
* - Detecting grapheme boundaries (@ref utf8proc_grapheme_break and @ref UTF8PROC_CHARBOUND)
|
||||
* - Character-width computation: @ref utf8proc_charwidth
|
||||
* - Classification of characters by Unicode category: @ref utf8proc_category and @ref utf8proc_category_string
|
||||
* - Encode (@ref utf8proc_encode_char) and decode (@ref utf8proc_iterate) Unicode codepoints to/from UTF-8.
|
||||
*/
|
||||
|
||||
/** @file */
|
||||
|
||||
#ifndef UTF8PROC_H
|
||||
#define UTF8PROC_H
|
||||
|
||||
/** @name API version
|
||||
*
|
||||
* The utf8proc API version MAJOR.MINOR.PATCH, following
|
||||
* semantic-versioning rules (http://semver.org) based on API
|
||||
* compatibility.
|
||||
*
|
||||
* This is also returned at runtime by @ref utf8proc_version; however, the
|
||||
* runtime version may append a string like "-dev" to the version number
|
||||
* for prerelease versions.
|
||||
*
|
||||
* @note The shared-library version number in the Makefile
|
||||
* (and CMakeLists.txt, and MANIFEST) may be different,
|
||||
* being based on ABI compatibility rather than API compatibility.
|
||||
*/
|
||||
/** @{ */
|
||||
/** The MAJOR version number (increased when backwards API compatibility is broken). */
|
||||
#define UTF8PROC_VERSION_MAJOR 2
|
||||
/** The MINOR version number (increased when new functionality is added in a backwards-compatible manner). */
|
||||
#define UTF8PROC_VERSION_MINOR 1
|
||||
/** The PATCH version (increased for fixes that do not change the API). */
|
||||
#define UTF8PROC_VERSION_PATCH 0
|
||||
/** @} */
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
#if defined(_MSC_VER) && _MSC_VER < 1800
|
||||
// MSVC prior to 2013 lacked stdbool.h and inttypes.h
|
||||
typedef signed char utf8proc_int8_t;
|
||||
typedef unsigned char utf8proc_uint8_t;
|
||||
typedef short utf8proc_int16_t;
|
||||
typedef unsigned short utf8proc_uint16_t;
|
||||
typedef int utf8proc_int32_t;
|
||||
typedef unsigned int utf8proc_uint32_t;
|
||||
# ifdef _WIN64
|
||||
typedef __int64 utf8proc_ssize_t;
|
||||
typedef unsigned __int64 utf8proc_size_t;
|
||||
# else
|
||||
typedef int utf8proc_ssize_t;
|
||||
typedef unsigned int utf8proc_size_t;
|
||||
# endif
|
||||
# ifndef __cplusplus
|
||||
// emulate C99 bool
|
||||
typedef unsigned char utf8proc_bool;
|
||||
# ifndef __bool_true_false_are_defined
|
||||
# define false 0
|
||||
# define true 1
|
||||
# define __bool_true_false_are_defined 1
|
||||
# endif
|
||||
# else
|
||||
typedef bool utf8proc_bool;
|
||||
# endif
|
||||
#else
|
||||
# include <stddef.h>
|
||||
# include <stdbool.h>
|
||||
# include <inttypes.h>
|
||||
typedef int8_t utf8proc_int8_t;
|
||||
typedef uint8_t utf8proc_uint8_t;
|
||||
typedef int16_t utf8proc_int16_t;
|
||||
typedef uint16_t utf8proc_uint16_t;
|
||||
typedef int32_t utf8proc_int32_t;
|
||||
typedef uint32_t utf8proc_uint32_t;
|
||||
typedef size_t utf8proc_size_t;
|
||||
typedef ptrdiff_t utf8proc_ssize_t;
|
||||
typedef bool utf8proc_bool;
|
||||
#endif
|
||||
#include <limits.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
# ifdef UTF8PROC_EXPORTS
|
||||
# define UTF8PROC_DLLEXPORT __declspec(dllexport)
|
||||
# else
|
||||
# define UTF8PROC_DLLEXPORT __declspec(dllimport)
|
||||
# endif
|
||||
#elif __GNUC__ >= 4
|
||||
# define UTF8PROC_DLLEXPORT __attribute__ ((visibility("default")))
|
||||
#else
|
||||
# define UTF8PROC_DLLEXPORT
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifndef SSIZE_MAX
|
||||
#define SSIZE_MAX ((size_t)SIZE_MAX/2)
|
||||
#endif
|
||||
|
||||
#ifndef UINT16_MAX
|
||||
# define UINT16_MAX 65535U
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Option flags used by several functions in the library.
|
||||
*/
|
||||
typedef enum {
|
||||
/** The given UTF-8 input is NULL terminated. */
|
||||
UTF8PROC_NULLTERM = (1<<0),
|
||||
/** Unicode Versioning Stability has to be respected. */
|
||||
UTF8PROC_STABLE = (1<<1),
|
||||
/** Compatibility decomposition (i.e. formatting information is lost). */
|
||||
UTF8PROC_COMPAT = (1<<2),
|
||||
/** Return a result with composed characters. */
|
||||
UTF8PROC_COMPOSE = (1<<3),
|
||||
/** Return a result with decomposed characters. */
|
||||
UTF8PROC_DECOMPOSE = (1<<4),
|
||||
/** Strip "default ignorable characters" such as SOFT-HYPHEN or ZERO-WIDTH-SPACE. */
|
||||
UTF8PROC_IGNORE = (1<<5),
|
||||
/** Return an error, if the input contains unassigned codepoints. */
|
||||
UTF8PROC_REJECTNA = (1<<6),
|
||||
/**
|
||||
* Indicating that NLF-sequences (LF, CRLF, CR, NEL) are representing a
|
||||
* line break, and should be converted to the codepoint for line
|
||||
* separation (LS).
|
||||
*/
|
||||
UTF8PROC_NLF2LS = (1<<7),
|
||||
/**
|
||||
* Indicating that NLF-sequences are representing a paragraph break, and
|
||||
* should be converted to the codepoint for paragraph separation
|
||||
* (PS).
|
||||
*/
|
||||
UTF8PROC_NLF2PS = (1<<8),
|
||||
/** Indicating that the meaning of NLF-sequences is unknown. */
|
||||
UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS),
|
||||
/** Strips and/or converts control characters.
|
||||
*
|
||||
* NLF-sequences are transformed into space, except if one of the
|
||||
* NLF2LS/PS/LF options is given. VerticalTab (VT) and FormFeed (FF)
|
||||
* are treated as a NLF-sequence in this case. HorizontalTab (HT) becomes
|
||||
* a space; all other control characters are simply removed.
|
||||
*/
|
||||
UTF8PROC_STRIPCC = (1<<9),
|
||||
/**
|
||||
* Performs unicode case folding, to be able to do a case-insensitive
|
||||
* string comparison.
|
||||
*/
|
||||
UTF8PROC_CASEFOLD = (1<<10),
|
||||
/**
|
||||
* Inserts 0xFF bytes at the beginning of each sequence which is
|
||||
* representing a single grapheme cluster (see UAX#29).
|
||||
*/
|
||||
UTF8PROC_CHARBOUND = (1<<11),
|
||||
/** Lumps certain characters together.
|
||||
*
|
||||
* E.g. HYPHEN U+2010 and MINUS U+2212 to ASCII "-". See lump.md for details.
|
||||
*
|
||||
* If NLF2LF is set, this includes a transformation of paragraph and
|
||||
* line separators to ASCII line-feed (LF).
|
||||
*/
|
||||
UTF8PROC_LUMP = (1<<12),
|
||||
/** Strips all character markings.
|
||||
*
|
||||
* This includes non-spacing, spacing and enclosing (i.e. accents).
|
||||
* @note This option works only with @ref UTF8PROC_COMPOSE or
|
||||
* @ref UTF8PROC_DECOMPOSE
|
||||
*/
|
||||
UTF8PROC_STRIPMARK = (1<<13),
|
||||
} utf8proc_option_t;
|
||||
|
||||
/** @name Error codes
|
||||
* Error codes being returned by almost all functions.
|
||||
*/
|
||||
/** @{ */
|
||||
/** Memory could not be allocated. */
|
||||
#define UTF8PROC_ERROR_NOMEM -1
|
||||
/** The given string is too long to be processed. */
|
||||
#define UTF8PROC_ERROR_OVERFLOW -2
|
||||
/** The given string is not a legal UTF-8 string. */
|
||||
#define UTF8PROC_ERROR_INVALIDUTF8 -3
|
||||
/** The @ref UTF8PROC_REJECTNA flag was set and an unassigned codepoint was found. */
|
||||
#define UTF8PROC_ERROR_NOTASSIGNED -4
|
||||
/** Invalid options have been used. */
|
||||
#define UTF8PROC_ERROR_INVALIDOPTS -5
|
||||
/** @} */
|
||||
|
||||
/* @name Types */
|
||||
|
||||
/** Holds the value of a property. */
|
||||
typedef utf8proc_int16_t utf8proc_propval_t;
|
||||
|
||||
/** Struct containing information about a codepoint. */
|
||||
typedef struct utf8proc_property_struct {
|
||||
/**
|
||||
* Unicode category.
|
||||
* @see utf8proc_category_t.
|
||||
*/
|
||||
utf8proc_propval_t category;
|
||||
/**
 * Combining class. utf8proc_decompose_custom reorders adjacent codepoints
 * by this value, and utf8proc_normalize_utf32 treats a value of 0 as the
 * start of a new composition starter.
 */
utf8proc_propval_t combining_class;
|
||||
/**
|
||||
* Bidirectional class.
|
||||
* @see utf8proc_bidi_class_t.
|
||||
*/
|
||||
utf8proc_propval_t bidi_class;
|
||||
/**
|
||||
* @anchor Decomposition type.
|
||||
* @see utf8proc_decomp_type_t.
|
||||
*/
|
||||
utf8proc_propval_t decomp_type;
|
||||
/**
 * Index of the decomposition sequence, or UINT16_MAX if there is none.
 * utf8proc_decompose_char applies it when `decomp_type` is 0, or for any
 * `decomp_type` when UTF8PROC_COMPAT is set.
 */
utf8proc_uint16_t decomp_seqindex;
|
||||
/**
 * Index of the case-folding sequence used with UTF8PROC_CASEFOLD, or
 * UINT16_MAX if there is none.
 */
utf8proc_uint16_t casefold_seqindex;
|
||||
/* NOTE(review): the three case-mapping indices below presumably follow the
   same UINT16_MAX "none" convention; their use is not visible here. */
utf8proc_uint16_t uppercase_seqindex;
|
||||
utf8proc_uint16_t lowercase_seqindex;
|
||||
utf8proc_uint16_t titlecase_seqindex;
|
||||
/**
 * Composition lookup key used by utf8proc_normalize_utf32. A value below
 * 0x8000 marks a codepoint usable as the first (starter) half of a pair and
 * indexes utf8proc_combinations. A value of 0x8000 or above, other than
 * UINT16_MAX, marks the second half; its low 14 bits select the entry and
 * bit 0x4000 means the result occupies two 16-bit table entries.
 */
utf8proc_uint16_t comb_index;
|
||||
unsigned bidi_mirrored:1;
|
||||
/**
 * When UTF8PROC_STABLE is set, utf8proc_normalize_utf32 does not compose
 * into a codepoint that has this flag set.
 */
unsigned comp_exclusion:1;
|
||||
/**
|
||||
* Can this codepoint be ignored?
|
||||
*
|
||||
* Used by @ref utf8proc_decompose_char when @ref UTF8PROC_IGNORE is
|
||||
* passed as an option.
|
||||
*/
|
||||
unsigned ignorable:1;
|
||||
unsigned control_boundary:1;
|
||||
/** The width of the codepoint. */
|
||||
unsigned charwidth:2;
|
||||
unsigned pad:2;
|
||||
/**
|
||||
* Boundclass.
|
||||
* @see utf8proc_boundclass_t.
|
||||
*/
|
||||
unsigned boundclass:8;
|
||||
} utf8proc_property_t;
|
||||
|
||||
/** Unicode categories. */
|
||||
typedef enum {
|
||||
UTF8PROC_CATEGORY_CN = 0, /**< Other, not assigned */
|
||||
UTF8PROC_CATEGORY_LU = 1, /**< Letter, uppercase */
|
||||
UTF8PROC_CATEGORY_LL = 2, /**< Letter, lowercase */
|
||||
UTF8PROC_CATEGORY_LT = 3, /**< Letter, titlecase */
|
||||
UTF8PROC_CATEGORY_LM = 4, /**< Letter, modifier */
|
||||
UTF8PROC_CATEGORY_LO = 5, /**< Letter, other */
|
||||
UTF8PROC_CATEGORY_MN = 6, /**< Mark, nonspacing */
|
||||
UTF8PROC_CATEGORY_MC = 7, /**< Mark, spacing combining */
|
||||
UTF8PROC_CATEGORY_ME = 8, /**< Mark, enclosing */
|
||||
UTF8PROC_CATEGORY_ND = 9, /**< Number, decimal digit */
|
||||
UTF8PROC_CATEGORY_NL = 10, /**< Number, letter */
|
||||
UTF8PROC_CATEGORY_NO = 11, /**< Number, other */
|
||||
UTF8PROC_CATEGORY_PC = 12, /**< Punctuation, connector */
|
||||
UTF8PROC_CATEGORY_PD = 13, /**< Punctuation, dash */
|
||||
UTF8PROC_CATEGORY_PS = 14, /**< Punctuation, open */
|
||||
UTF8PROC_CATEGORY_PE = 15, /**< Punctuation, close */
|
||||
UTF8PROC_CATEGORY_PI = 16, /**< Punctuation, initial quote */
|
||||
UTF8PROC_CATEGORY_PF = 17, /**< Punctuation, final quote */
|
||||
UTF8PROC_CATEGORY_PO = 18, /**< Punctuation, other */
|
||||
UTF8PROC_CATEGORY_SM = 19, /**< Symbol, math */
|
||||
UTF8PROC_CATEGORY_SC = 20, /**< Symbol, currency */
|
||||
UTF8PROC_CATEGORY_SK = 21, /**< Symbol, modifier */
|
||||
UTF8PROC_CATEGORY_SO = 22, /**< Symbol, other */
|
||||
UTF8PROC_CATEGORY_ZS = 23, /**< Separator, space */
|
||||
UTF8PROC_CATEGORY_ZL = 24, /**< Separator, line */
|
||||
UTF8PROC_CATEGORY_ZP = 25, /**< Separator, paragraph */
|
||||
UTF8PROC_CATEGORY_CC = 26, /**< Other, control */
|
||||
UTF8PROC_CATEGORY_CF = 27, /**< Other, format */
|
||||
UTF8PROC_CATEGORY_CS = 28, /**< Other, surrogate */
|
||||
UTF8PROC_CATEGORY_CO = 29, /**< Other, private use */
|
||||
} utf8proc_category_t;
|
||||
|
||||
/** Bidirectional character classes. */
|
||||
typedef enum {
|
||||
UTF8PROC_BIDI_CLASS_L = 1, /**< Left-to-Right */
|
||||
UTF8PROC_BIDI_CLASS_LRE = 2, /**< Left-to-Right Embedding */
|
||||
UTF8PROC_BIDI_CLASS_LRO = 3, /**< Left-to-Right Override */
|
||||
UTF8PROC_BIDI_CLASS_R = 4, /**< Right-to-Left */
|
||||
UTF8PROC_BIDI_CLASS_AL = 5, /**< Right-to-Left Arabic */
|
||||
UTF8PROC_BIDI_CLASS_RLE = 6, /**< Right-to-Left Embedding */
|
||||
UTF8PROC_BIDI_CLASS_RLO = 7, /**< Right-to-Left Override */
|
||||
UTF8PROC_BIDI_CLASS_PDF = 8, /**< Pop Directional Format */
|
||||
UTF8PROC_BIDI_CLASS_EN = 9, /**< European Number */
|
||||
UTF8PROC_BIDI_CLASS_ES = 10, /**< European Separator */
|
||||
UTF8PROC_BIDI_CLASS_ET = 11, /**< European Number Terminator */
|
||||
UTF8PROC_BIDI_CLASS_AN = 12, /**< Arabic Number */
|
||||
UTF8PROC_BIDI_CLASS_CS = 13, /**< Common Number Separator */
|
||||
UTF8PROC_BIDI_CLASS_NSM = 14, /**< Nonspacing Mark */
|
||||
UTF8PROC_BIDI_CLASS_BN = 15, /**< Boundary Neutral */
|
||||
UTF8PROC_BIDI_CLASS_B = 16, /**< Paragraph Separator */
|
||||
UTF8PROC_BIDI_CLASS_S = 17, /**< Segment Separator */
|
||||
UTF8PROC_BIDI_CLASS_WS = 18, /**< Whitespace */
|
||||
UTF8PROC_BIDI_CLASS_ON = 19, /**< Other Neutrals */
|
||||
UTF8PROC_BIDI_CLASS_LRI = 20, /**< Left-to-Right Isolate */
|
||||
UTF8PROC_BIDI_CLASS_RLI = 21, /**< Right-to-Left Isolate */
|
||||
UTF8PROC_BIDI_CLASS_FSI = 22, /**< First Strong Isolate */
|
||||
UTF8PROC_BIDI_CLASS_PDI = 23, /**< Pop Directional Isolate */
|
||||
} utf8proc_bidi_class_t;
|
||||
|
||||
/** Decomposition type. */
|
||||
typedef enum {
|
||||
UTF8PROC_DECOMP_TYPE_FONT = 1, /**< Font */
|
||||
UTF8PROC_DECOMP_TYPE_NOBREAK = 2, /**< Nobreak */
|
||||
UTF8PROC_DECOMP_TYPE_INITIAL = 3, /**< Initial */
|
||||
UTF8PROC_DECOMP_TYPE_MEDIAL = 4, /**< Medial */
|
||||
UTF8PROC_DECOMP_TYPE_FINAL = 5, /**< Final */
|
||||
UTF8PROC_DECOMP_TYPE_ISOLATED = 6, /**< Isolated */
|
||||
UTF8PROC_DECOMP_TYPE_CIRCLE = 7, /**< Circle */
|
||||
UTF8PROC_DECOMP_TYPE_SUPER = 8, /**< Super */
|
||||
UTF8PROC_DECOMP_TYPE_SUB = 9, /**< Sub */
|
||||
UTF8PROC_DECOMP_TYPE_VERTICAL = 10, /**< Vertical */
|
||||
UTF8PROC_DECOMP_TYPE_WIDE = 11, /**< Wide */
|
||||
UTF8PROC_DECOMP_TYPE_NARROW = 12, /**< Narrow */
|
||||
UTF8PROC_DECOMP_TYPE_SMALL = 13, /**< Small */
|
||||
UTF8PROC_DECOMP_TYPE_SQUARE = 14, /**< Square */
|
||||
UTF8PROC_DECOMP_TYPE_FRACTION = 15, /**< Fraction */
|
||||
UTF8PROC_DECOMP_TYPE_COMPAT = 16, /**< Compat */
|
||||
} utf8proc_decomp_type_t;
|
||||
|
||||
/** Boundclass property. (TR29) */
|
||||
typedef enum {
|
||||
UTF8PROC_BOUNDCLASS_START = 0, /**< Start */
|
||||
UTF8PROC_BOUNDCLASS_OTHER = 1, /**< Other */
|
||||
UTF8PROC_BOUNDCLASS_CR = 2, /**< Cr */
|
||||
UTF8PROC_BOUNDCLASS_LF = 3, /**< Lf */
|
||||
UTF8PROC_BOUNDCLASS_CONTROL = 4, /**< Control */
|
||||
UTF8PROC_BOUNDCLASS_EXTEND = 5, /**< Extend */
|
||||
UTF8PROC_BOUNDCLASS_L = 6, /**< L */
|
||||
UTF8PROC_BOUNDCLASS_V = 7, /**< V */
|
||||
UTF8PROC_BOUNDCLASS_T = 8, /**< T */
|
||||
UTF8PROC_BOUNDCLASS_LV = 9, /**< Lv */
|
||||
UTF8PROC_BOUNDCLASS_LVT = 10, /**< Lvt */
|
||||
UTF8PROC_BOUNDCLASS_REGIONAL_INDICATOR = 11, /**< Regional indicator */
|
||||
UTF8PROC_BOUNDCLASS_SPACINGMARK = 12, /**< Spacingmark */
|
||||
UTF8PROC_BOUNDCLASS_PREPEND = 13, /**< Prepend */
|
||||
UTF8PROC_BOUNDCLASS_ZWJ = 14, /**< Zero Width Joiner */
|
||||
UTF8PROC_BOUNDCLASS_E_BASE = 15, /**< Emoji Base */
|
||||
UTF8PROC_BOUNDCLASS_E_MODIFIER = 16, /**< Emoji Modifier */
|
||||
UTF8PROC_BOUNDCLASS_GLUE_AFTER_ZWJ = 17, /**< Glue_After_ZWJ */
|
||||
UTF8PROC_BOUNDCLASS_E_BASE_GAZ = 18, /**< E_BASE + GLUE_AFTER_ZWJ */
|
||||
} utf8proc_boundclass_t;
|
||||
|
||||
/**
|
||||
* Function pointer type passed to @ref utf8proc_map_custom and
|
||||
* @ref utf8proc_decompose_custom, which is used to specify a user-defined
|
||||
* mapping of codepoints to be applied in conjunction with other mappings.
|
||||
*/
|
||||
typedef utf8proc_int32_t (*utf8proc_custom_func)(utf8proc_int32_t codepoint, void *data);
|
||||
|
||||
/**
|
||||
* Array containing the byte lengths of a UTF-8 encoded codepoint based
|
||||
* on the first byte.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT extern const utf8proc_int8_t utf8proc_utf8class[256];
|
||||
|
||||
/**
|
||||
* Returns the utf8proc API version as a string MAJOR.MINOR.PATCH
|
||||
* (http://semver.org format), possibly with a "-dev" suffix for
|
||||
* development versions.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_version(void);
|
||||
|
||||
/**
|
||||
* Returns an informative error string for the given utf8proc error code
|
||||
* (e.g. the error codes returned by @ref utf8proc_map).
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
|
||||
|
||||
/**
|
||||
* Reads a single codepoint from the UTF-8 sequence being pointed to by `str`.
|
||||
* The maximum number of bytes read is `strlen`, unless `strlen` is
|
||||
* negative (in which case up to 4 bytes are read).
|
||||
*
|
||||
* If a valid codepoint could be read, it is stored in the variable
|
||||
* pointed to by `codepoint_ref`, otherwise that variable will be set to -1.
|
||||
* In case of success, the number of bytes read is returned; otherwise, a
|
||||
* negative error code is returned.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
|
||||
|
||||
/**
|
||||
* Check if a codepoint is valid (regardless of whether it has been
|
||||
* assigned a value by the current Unicode standard).
|
||||
*
|
||||
* @return 1 if the given `codepoint` is valid and otherwise return 0.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Encodes the codepoint as an UTF-8 string in the byte array pointed
|
||||
* to by `dst`. This array must be at least 4 bytes long.
|
||||
*
|
||||
* In case of success the number of bytes written is returned, and
|
||||
* otherwise 0 is returned.
|
||||
*
|
||||
* This function does not check whether `codepoint` is valid Unicode.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_encode_char(utf8proc_int32_t codepoint, utf8proc_uint8_t *dst);
|
||||
|
||||
/**
|
||||
* Look up the properties for a given codepoint.
|
||||
*
|
||||
* @param codepoint The Unicode codepoint.
|
||||
*
|
||||
* @returns
|
||||
* A pointer to a (constant) struct containing information about
|
||||
* the codepoint.
|
||||
* @par
|
||||
* If the codepoint is unassigned or invalid, a pointer to a special struct is
|
||||
* returned in which `category` is 0 (@ref UTF8PROC_CATEGORY_CN).
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const utf8proc_property_t *utf8proc_get_property(utf8proc_int32_t codepoint);
|
||||
|
||||
/** Decompose a codepoint into an array of codepoints.
|
||||
*
|
||||
* @param codepoint the codepoint.
|
||||
* @param dst the destination buffer.
|
||||
* @param bufsize the size of the destination buffer.
|
||||
* @param options one or more of the following flags:
|
||||
* - @ref UTF8PROC_REJECTNA - return an error if `codepoint` is unassigned
|
||||
* - @ref UTF8PROC_IGNORE - strip "default ignorable" codepoints
|
||||
* - @ref UTF8PROC_CASEFOLD - apply Unicode casefolding
|
||||
* - @ref UTF8PROC_COMPAT - replace certain codepoints with their
|
||||
* compatibility decomposition
|
||||
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
|
||||
* - @ref UTF8PROC_LUMP - lump certain different codepoints together
|
||||
* - @ref UTF8PROC_STRIPMARK - remove all character marks
|
||||
* @param last_boundclass
|
||||
* Pointer to an integer variable containing
|
||||
* the previous codepoint's boundary class if the @ref UTF8PROC_CHARBOUND
|
||||
* option is used. Otherwise, this parameter is ignored.
|
||||
*
|
||||
* @return
|
||||
* In case of success, the number of codepoints written is returned; in case
|
||||
* of an error, a negative error code is returned (@ref utf8proc_errmsg).
|
||||
* @par
|
||||
* If the number of written codepoints would be bigger than `bufsize`, the
|
||||
* required buffer size is returned, while the buffer will be overwritten with
|
||||
* undefined data.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_char(
|
||||
utf8proc_int32_t codepoint, utf8proc_int32_t *dst, utf8proc_ssize_t bufsize,
|
||||
utf8proc_option_t options, int *last_boundclass
|
||||
);
|
||||
|
||||
/**
|
||||
* The same as @ref utf8proc_decompose_char, but acts on a whole UTF-8
|
||||
* string and orders the decomposed sequences correctly.
|
||||
*
|
||||
* If the @ref UTF8PROC_NULLTERM flag in `options` is set, processing
|
||||
* will be stopped, when a NULL byte is encountered, otherwise `strlen`
|
||||
* bytes are processed. The result (in the form of 32-bit unicode
|
||||
* codepoints) is written into the buffer being pointed to by
|
||||
* `buffer` (which must contain at least `bufsize` entries). In case of
|
||||
* success, the number of codepoints written is returned; in case of an
|
||||
* error, a negative error code is returned (@ref utf8proc_errmsg).
|
||||
* See @ref utf8proc_decompose_custom to supply additional transformations.
|
||||
*
|
||||
* If the number of written codepoints would be bigger than `bufsize`, the
|
||||
* required buffer size is returned, while the buffer will be overwritten with
|
||||
* undefined data.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options
|
||||
);
|
||||
|
||||
/**
|
||||
* The same as @ref utf8proc_decompose, but also takes a `custom_func` mapping function
|
||||
* that is called on each codepoint in `str` before any other transformations
|
||||
* (along with a `custom_data` pointer that is passed through to `custom_func`).
|
||||
* The `custom_func` argument is ignored if it is `NULL`. See also @ref utf8proc_map_custom.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_decompose_custom(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen,
|
||||
utf8proc_int32_t *buffer, utf8proc_ssize_t bufsize, utf8proc_option_t options,
|
||||
utf8proc_custom_func custom_func, void *custom_data
|
||||
);
|
||||
|
||||
/**
|
||||
* Normalizes the sequence of `length` codepoints pointed to by `buffer`
|
||||
* in-place (i.e., the result is also stored in `buffer`).
|
||||
*
|
||||
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
|
||||
* @param length the length (in codepoints) of the buffer.
|
||||
* @param options a bitwise or (`|`) of one or more of the following flags:
|
||||
* - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
|
||||
* - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
|
||||
* - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
|
||||
* - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
|
||||
* - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
|
||||
* codepoints
|
||||
* - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
|
||||
* the unicode versioning stability
|
||||
*
|
||||
* @return
|
||||
* In case of success, the length (in codepoints) of the normalized UTF-32 string is
|
||||
* returned; otherwise, a negative error code is returned (@ref utf8proc_errmsg).
|
||||
*
|
||||
* @warning The entries of the array pointed to by `buffer` have to be in the
|
||||
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_normalize_utf32(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
||||
|
||||
/**
|
||||
* Reencodes the sequence of `length` codepoints pointed to by `buffer`
|
||||
* as UTF-8 data in-place (i.e., the result is also stored in `buffer`).
|
||||
* Can optionally normalize the UTF-32 sequence prior to UTF-8 conversion.
|
||||
*
|
||||
* @param buffer the (native-endian UTF-32) unicode codepoints to re-encode.
|
||||
* @param length the length (in codepoints) of the buffer.
|
||||
* @param options a bitwise or (`|`) of one or more of the following flags:
|
||||
* - @ref UTF8PROC_NLF2LS - convert LF, CRLF, CR and NEL into LS
|
||||
* - @ref UTF8PROC_NLF2PS - convert LF, CRLF, CR and NEL into PS
|
||||
* - @ref UTF8PROC_NLF2LF - convert LF, CRLF, CR and NEL into LF
|
||||
* - @ref UTF8PROC_STRIPCC - strip or convert all non-affected control characters
|
||||
* - @ref UTF8PROC_COMPOSE - try to combine decomposed codepoints into composite
|
||||
* codepoints
|
||||
* - @ref UTF8PROC_STABLE - prohibit combining characters that would violate
|
||||
* the unicode versioning stability
|
||||
* - @ref UTF8PROC_CHARBOUND - insert 0xFF bytes before each grapheme cluster
|
||||
*
|
||||
* @return
|
||||
* In case of success, the length (in bytes) of the resulting nul-terminated
|
||||
* UTF-8 string is returned; otherwise, a negative error code is returned
|
||||
* (@ref utf8proc_errmsg).
|
||||
*
|
||||
* @warning The amount of free space pointed to by `buffer` must
|
||||
* exceed the amount of the input data by one byte, and the
|
||||
* entries of the array pointed to by `buffer` have to be in the
|
||||
* range `0x0000` to `0x10FFFF`. Otherwise, the program might crash!
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, utf8proc_ssize_t length, utf8proc_option_t options);
|
||||
|
||||
/**
|
||||
* Given a pair of consecutive codepoints, return whether a grapheme break is
|
||||
* permitted between them (as defined by the extended grapheme clusters in UAX#29).
|
||||
*
|
||||
* @param state Beginning with Version 29 (Unicode 9.0.0), this algorithm requires
|
||||
* state to break graphemes. This state can be passed in as a pointer
|
||||
* in the `state` argument and should initially be set to 0. If the
|
||||
* state is not passed in (i.e. a null pointer is passed), UAX#29 rules
|
||||
* GB10/12/13 which require this state will not be applied, essentially
|
||||
* matching the rules in Unicode 8.0.0.
|
||||
*
|
||||
* @warning If the state parameter is used, `utf8proc_grapheme_break_stateful` must
|
||||
* be called IN ORDER on ALL potential breaks in a string.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break_stateful(
|
||||
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2, utf8proc_int32_t *state);
|
||||
|
||||
/**
|
||||
* Same as @ref utf8proc_grapheme_break_stateful, except without support for the
|
||||
* Unicode 9 additions to the algorithm. Supported for legacy reasons.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(
|
||||
utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
|
||||
|
||||
|
||||
/**
|
||||
* Given a codepoint `c`, return the codepoint of the corresponding
|
||||
* lower-case character, if any; otherwise (if there is no lower-case
|
||||
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
|
||||
|
||||
/**
|
||||
* Given a codepoint `c`, return the codepoint of the corresponding
|
||||
* upper-case character, if any; otherwise (if there is no upper-case
|
||||
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
|
||||
|
||||
/**
|
||||
* Given a codepoint `c`, return the codepoint of the corresponding
|
||||
* title-case character, if any; otherwise (if there is no title-case
|
||||
* variant, or if `c` is not a valid codepoint) return `c`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_totitle(utf8proc_int32_t c);
|
||||
|
||||
/**
|
||||
* Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
|
||||
* except that a width of 0 is returned for non-printable codepoints
|
||||
* instead of -1 as in `wcwidth`.
|
||||
*
|
||||
* @note
|
||||
* If you want to check for particular types of non-printable characters,
|
||||
* (analogous to `isprint` or `iscntrl`), use @ref utf8proc_category. */
|
||||
UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Return the Unicode category for the codepoint (one of the
|
||||
* @ref utf8proc_category_t constants.)
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_category_t utf8proc_category(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Return the two-letter (nul-terminated) Unicode category string for
|
||||
* the codepoint (e.g. `"Lu"` or `"Co"`).
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoint);
|
||||
|
||||
/**
|
||||
* Maps the given UTF-8 string pointed to by `str` to a new UTF-8
|
||||
* string, allocated dynamically by `malloc` and returned via `dstptr`.
|
||||
*
|
||||
* If the @ref UTF8PROC_NULLTERM flag in the `options` field is set,
|
||||
* the length is determined by a NULL terminator, otherwise the
|
||||
* parameter `strlen` is evaluated to determine the string length, but
|
||||
* in any case the result will be NULL terminated (though it might
|
||||
* contain NULL characters within the string if `str` contained NULL
|
||||
* characters). Other flags in the `options` field are passed to the
|
||||
* functions defined above, and regarded as described. See also
|
||||
* @ref utf8proc_map_custom to supply a custom codepoint transformation.
|
||||
*
|
||||
* In case of success the length of the new string is returned,
|
||||
* otherwise a negative error code is returned.
|
||||
*
|
||||
* @note The memory of the new UTF-8 string will have been allocated
|
||||
* with `malloc`, and should therefore be deallocated with `free`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options
|
||||
);
|
||||
|
||||
/**
|
||||
* Like @ref utf8proc_map, but also takes a `custom_func` mapping function
|
||||
* that is called on each codepoint in `str` before any other transformations
|
||||
* (along with a `custom_data` pointer that is passed through to `custom_func`).
|
||||
* The `custom_func` argument is ignored if it is `NULL`.
|
||||
*/
|
||||
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom(
|
||||
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_uint8_t **dstptr, utf8proc_option_t options,
|
||||
utf8proc_custom_func custom_func, void *custom_data
|
||||
);
|
||||
|
||||
/** @name Unicode normalization
|
||||
*
|
||||
* Returns a pointer to newly allocated memory of a NFD, NFC, NFKD or NFKC
|
||||
* normalized version of the null-terminated string `str`. These
|
||||
* are shortcuts to calling @ref utf8proc_map with @ref UTF8PROC_NULLTERM
|
||||
* combined with @ref UTF8PROC_STABLE and flags indicating the normalization.
|
||||
*/
|
||||
/** @{ */
|
||||
/** NFD normalization (@ref UTF8PROC_DECOMPOSE). */
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFD(const utf8proc_uint8_t *str);
|
||||
/** NFC normalization (@ref UTF8PROC_COMPOSE). */
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFC(const utf8proc_uint8_t *str);
|
||||
/** NFKD normalization (@ref UTF8PROC_DECOMPOSE and @ref UTF8PROC_COMPAT). */
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKD(const utf8proc_uint8_t *str);
|
||||
/** NFKC normalization (@ref UTF8PROC_COMPOSE and @ref UTF8PROC_COMPAT). */
|
||||
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
|
||||
/** @} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
14386
libdispatch/utf8proc_data.c
Normal file
14386
libdispatch/utf8proc_data.c
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -18,8 +18,7 @@ dnl
|
||||
#include "ncx.h"
|
||||
#include "fbits.h"
|
||||
#include "rnd.h"
|
||||
#include "utf8proc.h"
|
||||
|
||||
#include "ncutf8.h"
|
||||
|
||||
/*
|
||||
* Free attr
|
||||
@ -117,9 +116,11 @@ new_NC_attr(
|
||||
{
|
||||
NC_string *strp;
|
||||
NC_attr *attrp;
|
||||
char *name;
|
||||
int stat;
|
||||
|
||||
char *name = (char *)utf8proc_NFC((const unsigned char *)uname);
|
||||
if(name == NULL)
|
||||
stat = nc_utf8_normalize((const unsigned char *)uname,(unsigned char**)&name);
|
||||
if(stat != NC_NOERR)
|
||||
return NULL;
|
||||
assert(name != NULL && *name != 0);
|
||||
|
||||
@ -344,6 +345,7 @@ NC_findattr(const NC_attrarray *ncap, const char *uname)
|
||||
size_t attrid;
|
||||
size_t slen;
|
||||
char *name;
|
||||
int stat;
|
||||
|
||||
assert(ncap != NULL);
|
||||
|
||||
@ -353,8 +355,8 @@ NC_findattr(const NC_attrarray *ncap, const char *uname)
|
||||
attrpp = (NC_attr **) ncap->value;
|
||||
|
||||
/* normalized version of uname */
|
||||
name = (char *)utf8proc_NFC((const unsigned char *)uname);
|
||||
if(name == NULL)
|
||||
stat = nc_utf8_normalize((const unsigned char *)uname,(unsigned char**)&name);
|
||||
if(stat != NC_NOERR)
|
||||
return NULL; /* TODO: need better way to indicate no memory */
|
||||
slen = strlen(name);
|
||||
|
||||
@ -531,9 +533,9 @@ NC3_rename_att( int ncid, int varid, const char *name, const char *unewname)
|
||||
}
|
||||
|
||||
old = attrp->name;
|
||||
newname = (char *)utf8proc_NFC((const unsigned char *)unewname);
|
||||
if(newname == NULL)
|
||||
return NC_EBADNAME;
|
||||
status = nc_utf8_normalize((const unsigned char *)unewname,(unsigned char**)&newname);
|
||||
if(status != NC_NOERR)
|
||||
return status;
|
||||
if(NC_indef(ncp))
|
||||
{
|
||||
newStr = new_NC_string(strlen(newname), newname);
|
||||
@ -587,11 +589,12 @@ NC3_del_att(int ncid, int varid, const char *uname)
|
||||
return NC_ENOTVAR;
|
||||
|
||||
{
|
||||
char *name = (char *)utf8proc_NFC((const unsigned char *)uname);
|
||||
if(name == NULL)
|
||||
return NC_ENOMEM;
|
||||
char* name;
|
||||
int stat = nc_utf8_normalize((const unsigned char *)uname,(unsigned char**)&name);
|
||||
if(stat != NC_NOERR)
|
||||
return stat;
|
||||
|
||||
/* sortof inline NC_findattr() */
|
||||
/* sortof inline NC_findattr() */
|
||||
slen = strlen(name);
|
||||
|
||||
attrpp = (NC_attr **) ncap->value;
|
||||
|
21
libsrc/dim.c
21
libsrc/dim.c
@ -10,7 +10,7 @@
|
||||
#include <assert.h>
|
||||
#include "ncx.h"
|
||||
#include "fbits.h"
|
||||
#include "utf8proc.h"
|
||||
#include "ncutf8.h"
|
||||
|
||||
/*
|
||||
* Free dim
|
||||
@ -52,9 +52,11 @@ new_NC_dim(const char *uname, size_t size)
|
||||
{
|
||||
NC_string *strp;
|
||||
NC_dim *dimp;
|
||||
int stat;
|
||||
char* name;
|
||||
|
||||
char *name = (char *)utf8proc_NFC((const unsigned char *)uname);
|
||||
if(name == NULL)
|
||||
stat = nc_utf8_normalize((const unsigned char *)uname,(unsigned char **)&name);
|
||||
if(stat != NC_NOERR)
|
||||
return NULL;
|
||||
strp = new_NC_string(strlen(name), name);
|
||||
free(name);
|
||||
@ -135,13 +137,14 @@ NC_finddim(const NC_dimarray *ncap, const char *uname, NC_dim **dimpp)
|
||||
return -1;
|
||||
|
||||
{
|
||||
int stat;
|
||||
dimid = 0;
|
||||
loc = (NC_dim **) ncap->value;
|
||||
|
||||
/* normalized version of uname */
|
||||
name = (char *)utf8proc_NFC((const unsigned char *)uname);
|
||||
if(name == NULL)
|
||||
return NC_ENOMEM;
|
||||
stat = nc_utf8_normalize((const unsigned char *)uname,(unsigned char **)&name);
|
||||
if(stat != NC_NOERR)
|
||||
return stat;
|
||||
dimid = (int)NC_hashmapGetDim(ncap, name);
|
||||
free(name);
|
||||
if (dimid >= 0) {
|
||||
@ -468,9 +471,9 @@ NC3_rename_dim( int ncid, int dimid, const char *unewname)
|
||||
return NC_EBADDIM;
|
||||
|
||||
old = dimp->name;
|
||||
newname = (char *)utf8proc_NFC((const unsigned char *)unewname);
|
||||
if(newname == NULL)
|
||||
return NC_ENOMEM;
|
||||
status = nc_utf8_normalize((const unsigned char *)unewname,(unsigned char **)&newname);
|
||||
if(status != NC_NOERR)
|
||||
return status;
|
||||
if(NC_indef(ncp))
|
||||
{
|
||||
NC_string *newStr = new_NC_string(strlen(newname), newname);
|
||||
|
21
libsrc/var.c
21
libsrc/var.c
@ -12,7 +12,7 @@
|
||||
#include <limits.h>
|
||||
#include "ncx.h"
|
||||
#include "rnd.h"
|
||||
#include "utf8proc.h"
|
||||
#include "ncutf8.h"
|
||||
|
||||
#ifndef OFF_T_MAX
|
||||
//#define OFF_T_MAX (~ (off_t) 0 - (~ (off_t) 0 << (CHAR_BIT * sizeof (off_t) - 1)))
|
||||
@ -124,9 +124,11 @@ new_NC_var(const char *uname, nc_type type,
|
||||
{
|
||||
NC_string *strp = NULL;
|
||||
NC_var *varp = NULL;
|
||||
int stat;
|
||||
char* name;
|
||||
|
||||
char *name = (char *)utf8proc_NFC((const unsigned char *)uname);
|
||||
if(name == NULL)
|
||||
stat = nc_utf8_normalize((const unsigned char *)uname,(unsigned char **)&name);
|
||||
if(stat != NC_NOERR)
|
||||
return NULL;
|
||||
strp = new_NC_string(strlen(name), name);
|
||||
free(name);
|
||||
@ -353,6 +355,7 @@ NC_findvar(const NC_vararray *ncap, const char *uname, NC_var **varpp)
|
||||
{
|
||||
int hash_var_id;
|
||||
char *name;
|
||||
int stat;
|
||||
|
||||
assert(ncap != NULL);
|
||||
|
||||
@ -361,9 +364,9 @@ NC_findvar(const NC_vararray *ncap, const char *uname, NC_var **varpp)
|
||||
|
||||
|
||||
/* normalized version of uname */
|
||||
name = (char *)utf8proc_NFC((const unsigned char *)uname);
|
||||
if(name == NULL)
|
||||
return NC_ENOMEM;
|
||||
stat = nc_utf8_normalize((const unsigned char *)uname,(unsigned char **)&name);
|
||||
if(stat != NC_NOERR)
|
||||
return stat;
|
||||
|
||||
hash_var_id = (int)NC_hashmapGetVar(ncap, name);
|
||||
free(name);
|
||||
@ -752,9 +755,9 @@ NC3_rename_var(int ncid, int varid, const char *unewname)
|
||||
|
||||
|
||||
old = varp->name;
|
||||
newname = (char *)utf8proc_NFC((const unsigned char *)unewname);
|
||||
if(newname == NULL)
|
||||
return NC_ENOMEM;
|
||||
status = nc_utf8_normalize((const unsigned char *)unewname,(unsigned char **)&newname);
|
||||
if(status != NC_NOERR)
|
||||
return status;
|
||||
if(NC_indef(ncp))
|
||||
{
|
||||
/* Remove old name from hashmap; add new... */
|
||||
|
@ -16,8 +16,8 @@ conditions.
|
||||
#include "nc4internal.h"
|
||||
#include "nc.h" /* from libsrc */
|
||||
#include "ncdispatch.h" /* from libdispatch */
|
||||
#include "ncutf8.h"
|
||||
#include "H5DSpublic.h"
|
||||
#include <utf8proc.h>
|
||||
|
||||
#define MEGABYTE 1048576
|
||||
|
||||
@ -99,8 +99,9 @@ nc4_check_name(const char *name, char *norm_name)
|
||||
return retval;
|
||||
|
||||
/* Normalize the name. */
|
||||
if (!(temp = (char *)utf8proc_NFC((const unsigned char *)name)))
|
||||
return NC_EINVAL;
|
||||
retval = nc_utf8_normalize((const unsigned char *)name,(unsigned char**)&temp);
|
||||
if(retval != NC_NOERR)
|
||||
return retval;
|
||||
strcpy(norm_name, temp);
|
||||
free(temp);
|
||||
|
||||
@ -1407,8 +1408,9 @@ int
|
||||
nc4_normalize_name(const char *name, char *norm_name)
|
||||
{
|
||||
char *temp_name;
|
||||
if (!(temp_name = (char *)utf8proc_NFC((const unsigned char *)name)))
|
||||
return NC_EINVAL;
|
||||
int stat = nc_utf8_normalize((const unsigned char *)name,(unsigned char **)&temp_name);
|
||||
if(stat != NC_NOERR)
|
||||
return stat;
|
||||
if (strlen(temp_name) > NC_MAX_NAME)
|
||||
{
|
||||
free(temp_name);
|
||||
|
@ -1,5 +1,5 @@
|
||||
# Test c output
|
||||
T=tst_small
|
||||
T=tst_utf8_normalize2
|
||||
#CMD=valgrind --leak-check=full
|
||||
CMD=gdb --args
|
||||
|
||||
|
@ -19,7 +19,7 @@ unlim.nc tst_inq_type.nc
|
||||
# These are the tests which are always run.
|
||||
TESTPROGRAMS = t_nc tst_small nc_test tst_misc tst_norm \
|
||||
tst_names tst_nofill tst_nofill2 tst_nofill3 tst_atts3 \
|
||||
tst_meta tst_inq_type
|
||||
tst_meta tst_inq_type tst_utf8_validate tst_utf8_phrases
|
||||
|
||||
if USE_NETCDF4
|
||||
TESTPROGRAMS += tst_atts tst_put_vars
|
||||
@ -63,15 +63,15 @@ endif
|
||||
TESTS = $(TESTPROGRAMS)
|
||||
|
||||
if BUILD_UTILITIES
|
||||
if BUILD_DISKLESS
|
||||
if BUILD_DISKLESS
|
||||
TESTS += run_diskless.sh
|
||||
if BUILD_MMAP
|
||||
if BUILD_MMAP
|
||||
TESTS += run_mmap.sh
|
||||
endif
|
||||
if LARGE_FILE_TESTS
|
||||
endif
|
||||
if LARGE_FILE_TESTS
|
||||
TESTS += run_diskless2.sh
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
if USE_PNETCDF
|
||||
|
342
nc_test/tst_utf8_phrases.c
Normal file
342
nc_test/tst_utf8_phrases.c
Normal file
@ -0,0 +1,342 @@
|
||||
|
||||
/*
|
||||
* Copyright 1998-2015 University Corporation for Atmospheric Research/Unidata
|
||||
* See the LICENSE file for more information.
|
||||
*/
|
||||
|
||||
#include <config.h>
|
||||
#include <stdlib.h>
|
||||
#include <nc_tests.h>
|
||||
#include "err_macros.h"
|
||||
#include <netcdf.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "ncutf8.h"
|
||||
|
||||
/*
|
||||
The tests here are taken from the UTF-8 SAMPLER
|
||||
|
||||
Frank da Cruz
|
||||
The Kermit Project <http://kermitproject.org/index.html>
|
||||
New York City
|
||||
fdc@kermitproject.org <mailto:fdc@kermitproject.org>
|
||||
|
||||
/Last update:/ Tue Jan 31 16:56:13 2017
|
||||
*/
|
||||
|
||||
|
||||
/* One entry of a UTF-8 test table. Tables are arrays of these records
 * terminated by an entry whose id is NULL (see NULLTEST); the test driver
 * walks a table until it reaches that terminator. */
struct Test {
|
||||
/* NOTE(review): presumably non-zero marks a case that is expected to fail;
 * every entry in the tables here sets it to 0 -- confirm against the driver. */
int xfail;
|
||||
/* Short identifier for the case (e.g. "3.17"); NULL marks the end of a table. */
const char* id;
|
||||
/* Human-readable label for the case (language / script name). */
const char* description;
|
||||
/* The UTF-8 text handed to nc_utf8_validate() and nc_utf8_normalize(). */
const char* data;
|
||||
};
|
||||
#define NULLTEST {0,NULL,NULL,NULL}
|
||||
|
||||
static const struct Test utf8currency[] = {
|
||||
{0,"1.1","Currencies", "¥£€$¢₡₢₣₤₥₦₧₨₩₪₫₭₮₯₹"},
|
||||
NULLTEST
|
||||
};
|
||||
|
||||
static const struct Test utf8poems[] = {
|
||||
{0,"2.1","Runes",
|
||||
"ᚠᛇᚻ᛫ᛒᛦᚦ᛫ᚠᚱᚩᚠᚢᚱ᛫ᚠᛁᚱᚪ᛫ᚷᛖᚻᚹᛦᛚᚳᚢᛗ\nᛋᚳᛖᚪᛚ᛫ᚦᛖᚪᚻ᛫ᛗᚪᚾᚾᚪ᛫ᚷᛖᚻᚹᛦᛚᚳ᛫ᛗᛁᚳᛚᚢᚾ᛫ᚻᛦᛏ᛫ᛞᚫᛚᚪᚾ\nᚷᛁᚠ᛫ᚻᛖ᛫ᚹᛁᛚᛖ᛫ᚠᚩᚱ᛫ᛞᚱᛁᚻᛏᚾᛖ᛫ᛞᚩᛗᛖᛋ᛫ᚻᛚᛇᛏᚪᚾ᛬\n"
|
||||
},
|
||||
{0,"2.2","Middle English",
|
||||
"An preost wes on leoden, Laȝamon was ihoten"
|
||||
"He wes Leovenaðes sone -- liðe him be Drihten."
|
||||
"He wonede at Ernleȝe at æðelen are chirechen,"
|
||||
"Uppen Sevarne staþe, sel þar him þuhte,"
|
||||
"Onfest Radestone, þer he bock radde."
|
||||
},
|
||||
{0,"2.3","Middle High German",
|
||||
"Sîne klâwen durh die wolken sint geslagen,"
|
||||
"er stîget ûf mit grôzer kraft,"
|
||||
"ich sih in grâwen tägelîch als er wil tagen,"
|
||||
"den tac, der im geselleschaft"
|
||||
"erwenden wil, dem werden man,"
|
||||
"den ich mit sorgen în verliez."
|
||||
"ich bringe in hinnen, ob ich kan."
|
||||
"sîn vil manegiu tugent michz leisten hiez."
|
||||
},
|
||||
|
||||
{0,"2.4",
|
||||
"Greek.1",
|
||||
"Τη γλώσσα μου έδωσαν ελληνικ το σπίτι φτωχικό στις αμμουδιές του Ομήρου. Μονάχη έγνοια η γλώσσα μου στις αμμουδιές του Ομήρου. από το Άξιον Εστί του Οδυσσέα Ελύτη"
|
||||
},
|
||||
{0,"2.5",
|
||||
"Greek.2",
|
||||
"Τὴ γλῶσσα μοῦ ἔδωσαν ἑλληνικὴτὸ σπίτι φτωχικὸ στὶς ἀμμουδιὲς τοῦ Ὁμήρου. Μονάχη ἔγνοια ἡ γλῶσσα μου στὶς ἀμμουδιὲς τοῦ Ὁμήρου.ἀπὸ τὸ Ἄξιον ἐστί τοῦ Ὀδυσσέα Ἐλύτη"
|
||||
},
|
||||
{0,"2.6",
|
||||
"Russion",
|
||||
"На берегу пустынных волнСтоял он, дум великих полн,И вдаль глядел. Пред ним широкоРека неслася; бедный чёлнПо ней стремился одиноко.По мшистым, топким берегамЧернели избы здесь и там,Приют убогого чухонца;И лес, неведомый лучамВ тумане спрятанного солнца, Кругом шумел."
|
||||
},
|
||||
{0,"2.7",
|
||||
"Georgian",
|
||||
"ვეპხის ტყაოსანი შოთა რუსთაველიღმერთსი შემვედრე, ნუთუ კვლა დამხსნას სოფლისა შრომასა, ცეცხლს, წყალსადა მიწასა, ჰაერთა თანა მრომასა; მომცნეს ფრთენი და აღვფრინდე,მივჰხვდე მას ჩემსა ნდომასა, დღისით და ღამით ვჰხედვიდე მზისა ელვათა კრთომაასა."
|
||||
},
|
||||
{0,"2.8",
|
||||
"Tamil.1",
|
||||
"யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்,பாமரராய் விலங்குகளாய், உலகனைத்தும் இகழ்ச்சிசொலப் பான்மை கெட்டு,நாமமது தமிழரெனக் கொண்டு இங்கு வாழ்ந்திடுதல் நன்றோ? சொல்லீர்! தேமதுரத் தமிழோசை உலகமெலாம் பரவும்வகை செய்தல் வேண்டும்."
|
||||
},
|
||||
{0,"2.9",
|
||||
"Tamil.2",
|
||||
"ಬಾ ಇಲ್ಲಿ ಸಂಭವಿಸು ಇಂದೆನ್ನ ಹೃದಯದಲಿನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗೀ...ಮಣ್ಣಾಗಿ ಮರವಾಗಿ ಮಿಗವಾಗಿ ಕಗವಾಗಿಭವ ಭವದಿ ಭತಿಸಿಹೇ ಭವತಿ ದೂರ ನಿತ್ಯವೂ ಅವತರಿಪ ಸತ್ಯಾವತಾರ || ಬಾ ಇಲ್ಲಿ ||"
|
||||
},
|
||||
NULLTEST
|
||||
};
|
||||
|
||||
static const struct Test utf8phrases1[] = {
|
||||
{0,"3.1","Sanskrit", "काचं शक्नोम्यत्तुम् । नोपहिनस्ति माम् ॥"},
|
||||
{0,"3.2","Sanskrit/(standard transcription)", "kācaṃ śaknomyattum nopahinasti mām."},
|
||||
{0,"3.3","Classical Greek", "ὕαλον ϕαγεῖν δύναμαι· τοῦτο οὔ με βλάπτει."},
|
||||
{0,"3.4","Greek (monotonic)", "Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."},
|
||||
{0,"3.5","Greek (polytonic)", "Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα."},
|
||||
{0,"3.6","Latin", "Vitrum edere possum; mihi non nocet."},
|
||||
{0,"3.7","Old French", "Je puis mangier del voirre. Ne me nuit."},
|
||||
{0,"3.8","French", "Je peux manger du verre, ça ne me fait pas mal."},
|
||||
{0,"3.9","Provençal / Occitan", "Pòdi manjar de veire, me nafrariá pas."},
|
||||
{0,"3.10","Québécois", "J'peux manger d'la vitre, ça m'fa pas mal."},
|
||||
{0,"3.11","Walloon", "Dji pou magnî do vêre, çoula m' freut nén må."},
|
||||
{0,"3.12","Picard", "Ch'peux mingi du verre, cha m'foé mie n'ma."},
|
||||
{0,"3.13","Kreyòl Ayisyen (Haitï)", "Mwen kap manje vè, li pa blese'm."},
|
||||
{0,"3.14","Basque", "Kristala jan dezaket, ez dit minik ematen."},
|
||||
{0,"3.15","Catalan / Català", "Puc menjar vidre, que no em fa mal."},
|
||||
{0,"3.16","Spanish", "Puedo comer vidrio, no me hace daño."},
|
||||
{0,"3.17","Aragonés", "Puedo minchar beire, no me'n fa mal ."},
|
||||
{0,"3.18","Galician", "Eu podo xantar cristais e non cortarme."},
|
||||
{0,"3.19","European Portuguese", "Posso comer vidro, não me faz mal."},
|
||||
{0,"3.20","Brazilian Portuguese (8 <#notes>)", "Posso comer vidro, não me machuca."},
|
||||
{0,"3.21","Caboverdiano/Kabuverdianu (Cape Verde)", "M' podê cumê vidru, ca ta maguâ-m'."},
|
||||
{0,"3.22","Papiamentu", "Ami por kome glas anto e no ta hasimi daño."},
|
||||
{0,"3.23","Italian", "Posso mangiare il vetro e non mi fa male."},
|
||||
{0,"3.24","Milanese", "Sôn bôn de magnà el véder, el me fa minga mal."},
|
||||
{0,"3.25","Roman", "Me posso magna' er vetro, e nun me fa male."},
|
||||
{0,"3.26","Napoletano", "M' pozz magna' o'vetr, e nun m' fa mal."},
|
||||
{0,"3.27","Venetian", "Mi posso magnare el vetro, no'l me fa mae."},
|
||||
{0,"3.28","Zeneise /(Genovese)", "/ Pòsso mangiâ o veddro e o no me fà mâ."},
|
||||
{0,"3.29","Sicilian", "Puotsu mangiari u vitru, nun mi fa mali."},
|
||||
{0,"3.30","Romansch (Grischun)", "Jau sai mangiar vaider, senza che quai fa donn a mai."},
|
||||
{0,"3.31","Romanian", "Pot să mănânc sticlă și ea nu mă rănește."},
|
||||
{0,"3.32","Esperanto", "Mi povas manĝi vitron, ĝi ne damaĝas min."},
|
||||
{0,"3.33","Cornish", "Mý a yl dybry gwéder hag éf ny wra ow ankenya."},
|
||||
{0,"3.34","Welsh", "Dw i'n gallu bwyta gwydr, 'dyw e ddim yn gwneud dolur i mi."},
|
||||
{0,"3.35","Manx Gaelic", "Foddym gee glonney agh cha jean eh gortaghey mee."},
|
||||
{0,"3.36","Old Irish /(Ogham)", "/ ᚛᚛ᚉᚑᚅᚔᚉᚉᚔᚋ ᚔᚈᚔ ᚍᚂᚐᚅᚑ ᚅᚔᚋᚌᚓᚅᚐ᚜"},
|
||||
{0,"3.37","Old Irish /(Latin)", "/ Con·iccim ithi nglano. Ním·géna."},
|
||||
{0,"3.38","Irish", "Is féidir liom gloinne a ithe. Ní dhéanann sí dochar ar bith dom."},
|
||||
{0,"3.39","Ulster Gaelic", "Ithim-sa gloine agus ní miste damh é."},
|
||||
{0,"3.40","Scottish Gaelic", "S urrainn dhomh gloinne ithe; cha ghoirtich i mi."},
|
||||
{0,"3.41","Anglo-Saxon /(Runes)", "/ ᛁᚳ᛫ᛗᚨᚷ᛫ᚷᛚᚨᛋ᛫ᛖᚩᛏᚪᚾ᛫ᚩᚾᛞ᛫ᚻᛁᛏ᛫ᚾᛖ᛫ᚻᛖᚪᚱᛗᛁᚪᚧ᛫ᛗᛖ᛬"},
|
||||
{0,"3.42","Anglo-Saxon /(Latin)", "/ Ic mæg glæs eotan ond hit ne hearmiað me."},
|
||||
{0,"3.43","Middle English", "Ich canne glas eten and hit hirtiþ me nouȝt."},
|
||||
{0,"3.44","English", "I can eat glass and it doesn't hurt me."},
|
||||
{0,"3.45","English /(IPA)", "/ [aɪ kæn iːt glɑːs ænd ɪt dɐz nɒt hɜːt miː]"},
|
||||
{0,"3.46","English /(Braille)", "/ ⠊⠀⠉⠁⠝⠀⠑⠁⠞⠀⠛⠇⠁⠎⠎⠀⠁⠝⠙⠀⠊⠞⠀⠙⠕⠑⠎⠝⠞⠀⠓⠥⠗⠞⠀⠍⠑"},
|
||||
{0,"3.47","Jamaican", "Mi kian niam glas han i neba hot mi."},
|
||||
{0,"3.48","Lalland Scots / Doric", "Ah can eat gless, it disnae hurt us."},
|
||||
{0,"3.49","Gothic (4)", "𐌼𐌰𐌲 𐌲𐌻𐌴𐍃 𐌹̈𐍄𐌰𐌽, 𐌽𐌹 𐌼𐌹𐍃 𐍅𐌿 𐌽𐌳𐌰𐌽 𐌱𐍂𐌹𐌲𐌲𐌹𐌸."},
|
||||
{0,"3.50","Old Norse /(Runes)", "/ ᛖᚴ ᚷᛖᛏ ᛖᛏᛁ ᚧ ᚷᛚᛖᚱ ᛘᚾ ᚦᛖᛋᛋ ᚨᚧ ᚡᛖ ᚱᚧᚨ ᛋᚨᚱ"},
|
||||
{0,"3.51","Old Norse /(Latin)", "/ Ek get etið gler án þess að verða sár."},
|
||||
{0,"3.52","Norsk / Norwegian (Nynorsk)", " Eg kan eta glas utan å skada meg."},
|
||||
{0,"3.53","Norsk / Norwegian (Bokmål)", " Jeg kan spise glass uten å skade meg."},
|
||||
{0,"3.54","Føroyskt / Faroese", "Eg kann eta glas, skaðaleysur."},
|
||||
{0,"3.55","Íslenska / Icelandic", "Ég get etið gler án þess að meiða mig."},
|
||||
{0,"3.56","Svenska / Swedish", "Jag kan äta glas utan att skada mig."},
|
||||
{0,"3.57","Dansk / Danish", "Jeg kan spise glas, det gør ikke ondt på mig."},
|
||||
{0,"3.58","Sønderjysk", "Æ ka æe glass uhen at det go mæ naue."},
|
||||
{0,"3.59","Frysk / Frisian", "Ik kin glês ite, it docht me net sear."},
|
||||
{0,"3.60","Nederlands / Dutch", "Ik kan glas eten, het doet mij geen kwaad."},
|
||||
{0,"3.61","Kirchröadsj/Bôchesserplat", "Iech ken glaas èèse, mer 't deet miech jing pieng."},
|
||||
{0,"3.62","Afrikaans", "Ek kan glas eet, maar dit doen my nie skade nie."},
|
||||
{0,"3.63","Lëtzebuergescht / Luxemburgish", "Ech kan Glas iessen, daat deet mir nët wei."},
|
||||
{0,"3.64","Deutsch / German", "Ich kann Glas essen, ohne mir zu schaden."},
|
||||
{0,"3.65","Ruhrdeutsch", "Ich kann Glas verkasematuckeln, ohne dattet mich wat jucken tut."},
|
||||
{0,"3.66","Langenfelder Platt", "Isch kann Jlaas kimmeln, uuhne datt mich datt weh dääd."},
|
||||
{0,"3.67","Lausitzer Mundart (Lusatian)", "Ich koann Gloos assn und doas dudd merr ni wii."},
|
||||
{0,"3.68","Odenwälderisch", "Iech konn glaasch voschbachteln ohne dass es mir ebbs daun doun dud."},
|
||||
{0,"3.69","Sächsisch / Saxon", "'sch kann Glos essn, ohne dass'sch mer wehtue."},
|
||||
{0,"3.70","Pfälzisch", "Isch konn Glass fresse ohne dasses mer ebbes ausmache dud."},
|
||||
{0,"3.71","Schwäbisch / Swabian", "I kå Glas frässa, ond des macht mr nix!"},
|
||||
{0,"3.72","Deutsch (Voralberg)", "I ka glas eassa, ohne dass mar weh tuat."},
|
||||
{0,"3.73","Bayrisch / Bavarian", "I koh Glos esa, und es duard ma ned wei."},
|
||||
{0,"3.74","Allemannisch", "I kaun Gloos essen, es tuat ma ned weh."},
|
||||
{0,"3.75","Schwyzerdütsch (Zürich)", "Ich chan Glaas ässe, das schadt mir nöd."},
|
||||
{0,"3.76","Schwyzerdütsch (Luzern)", "Ech cha Glâs ässe, das schadt mer ned."},
|
||||
{0,"3.77","Hungarian", "Meg tudom enni az üveget, nem lesz tőle bajom."},
|
||||
{0,"3.78","Suomi / Finnish", "Voin syödä lasia, se ei vahingoita minua."},
|
||||
{0,"3.79","Sami (Northern)", "Sáhtán borrat lása, dat ii leat bávččas."},
|
||||
{0,"3.80","Erzian", "Мон ярсан суликадо, ды зыян эйстэнзэ а ули."},
|
||||
{0,"3.81","Northern Karelian", "Mie voin syvvä lasie ta minla ei ole kipie."},
|
||||
{0,"3.82","Southern Karelian", "Minä voin syvvä st'oklua dai minule ei ole kibie."},
|
||||
{0,"3.83","Estonian", "Ma võin klaasi süüa, see ei tee mulle midagi."},
|
||||
{0,"3.84","Latvian", "Es varu ēst stiklu, tas man nekaitē."},
|
||||
{0,"3.85","Lithuanian", "Aš galiu valgyti stiklą ir jis manęs nežeidžia"},
|
||||
{0,"3.86","Czech", "Mohu jíst sklo, neublíží mi."},
|
||||
{0,"3.87","Slovak", "Môžem jesť sklo. Nezraní ma."},
|
||||
{0,"3.88","Polska / Polish", "Mogę jeść szkło i mi nie szkodzi."},
|
||||
{0,"3.89","Slovenian", "Lahko jem steklo, ne da bi mi škodovalo."},
|
||||
{0,"3.90","Bosnian, Croatian, Montenegrin and Serbian /(Latin)/", "Ja mogu jesti staklo, i to mi ne šteti."},
|
||||
{0,"3.91","Bosnian, Montenegrin and Serbian /(Cyrillic)/", "Ја могу јести стакло, и то ми не штети."},
|
||||
{0,"3.92","Macedonian", "Можам да јадам стакло, а не ме штета."},
|
||||
{0,"3.93","Russian", "Я могу есть стекло, оно мне не вредит."},
|
||||
{0,"3.94","Belarusian /(Cyrillic)", "Я магу есці шкло, яно мне не шкодзіць."},
|
||||
{0,"3.95","Belarusian /(Lacinka)", "Ja mahu jeści škło, jano mne ne škodzić."},
|
||||
{0,"3.96","Ukrainian", "Я можу їсти скло, і воно мені не зашкодить."},
|
||||
{0,"3.97","Bulgarian", "Мога да ям стъкло, то не ми вреди."},
|
||||
{0,"3.98","Georgian", "მინას ვჭამ და არა მტკივა."},
|
||||
{0,"3.99","Armenian", "Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"},
|
||||
{0,"3.100","Albanian", "Unë mund të ha qelq dhe nuk më gjen gjë."},
|
||||
{0,"3.101","Turkish", "Cam yiyebilirim, bana zararı dokunmaz."},
|
||||
{0,"3.102","Turkish /(Ottoman)", "جام ييه بلورم بڭا ضررى طوقونمز"},
|
||||
{0,"3.103","Uzbek / O’zbekcha /(Roman)", "Men shisha yeyishim mumkin, ammo u menga zarar keltirmaydi."},
|
||||
{0,"3.104","Uzbek / Ўзбекча /(Cyrillic)/", "Мен шиша ейишим мумкин, аммо у менга зарар келтирмайди"},
|
||||
{0,"3.105","Bangla / Bengali", "আমি কাঁচ খেতে পারি, তাতে আমার কোনো ক্ষতি হয় না।"},
|
||||
{0,"3.106","Marathi", "मी काच खाऊ शकतो, मला ते दुखत नाही."},
|
||||
{0,"3.107","Kannada", "ನನಗೆ ಹಾನಿ ಆಗದೆ, ನಾನು ಗಜನ್ನು ತಿನಬಹುದು"},
|
||||
{0,"3.108","Hindi", "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती."},
|
||||
{0,"3.109","Malayalam", "എനിക്ക് ഗ്ലാസ് തിന്നാം. അതെന്നെ വേദനിപ്പിക്കില്ല."},
|
||||
{0,"3.110","Tamil", "நான் கண்ணாடி சாப்பிடுவேன், அதனால் எனக்கு ஒரு கேடும் வராது."},
|
||||
{0,"3.111","Telugu", "నేను గాజు తినగలను మరియు అలా చేసినా నాకు ఏమి ఇబ్బంది లేదు"},
|
||||
{0,"3.112","Sinhalese", "මට වීදුරු කෑමට හැකියි. එයින් මට කිසි හානියක් සිදු නොවේ."},
|
||||
{0,"3.113","Urdu(3)", "میں کانچ کھا سکتا ہوں اور مجھے تکلیف نہیں ہوتی ۔"},
|
||||
{0,"3.114","Pashto(3)", "زه شيشه خوړلې شم، هغه ما نه خوږوي"},
|
||||
{0,"3.115","Farsi / Persian(3)", ".من می توانم بدونِ احساس درد شيشه بخورم"},
|
||||
{0,"3.116","Arabic(3)", "أنا قادر على أكل الزجاج و هذا لا يؤلمني."},
|
||||
{0,"3.117","Maltese", "Nista' niekol il-ħġieġ u ma jagħmilli xejn."},
|
||||
{0,"3.118","Hebrew(3)", "אני יכול לאכול זכוכית וזה לא מזיק לי."},
|
||||
{0,"3.119","Yiddish(3)", "איך קען עסן גלאָז און עס טוט מיר נישט װײ."},
|
||||
{0,"3.120","Twi", "Metumi awe tumpan, ɜnyɜ me hwee."},
|
||||
{0,"3.121","Hausa (/Latin/)", "Inā iya taunar gilāshi kuma in gamā lāfiyā."},
|
||||
{0,"3.122","Hausa (/Ajami/) (2)", "إِنا إِىَ تَونَر غِلَاشِ كُمَ إِن غَمَا لَافِىَا"},
|
||||
{0,"3.123","Yoruba(4)", "Mo lè je̩ dígí, kò ní pa mí lára."},
|
||||
{0,"3.124","Lingala", "Nakokí kolíya biténi bya milungi, ekosála ngáí mabé tɛ́."},
|
||||
{0,"3.125","(Ki)Swahili", "Naweza kula bilauri na sikunyui."},
|
||||
{0,"3.126","Malay", "Saya boleh makan kaca dan ia tidak mencederakan saya."},
|
||||
{0,"3.127","Tagalog", "Kaya kong kumain nang bubog at hindi ako masaktan."},
|
||||
{0,"3.128","Chamorro", "Siña yo' chumocho krestat, ti ha na'lalamen yo'."},
|
||||
{0,"3.129","Fijian", "Au rawa ni kana iloilo, ia au sega ni vakacacani kina."},
|
||||
{0,"3.130","Javanese", "Aku isa mangan beling tanpa lara."},
|
||||
{0,"3.131","Burmese (Unicode 4.0)", "က္ယ္ဝန္တော္၊က္ယ္ဝန္မ မ္ယက္စားနုိင္သည္။ ၎က္ရောင့္ ထိခုိက္မ္ဟု မရ္ဟိပာ။"},
|
||||
{0,"3.132","Burmese (Unicode 5.0)", "ကျွန်တော် ကျွန်မ မှန်စားနိုင်တယ်။ ၎င်းကြောင့် ထိခိုက်မှုမရှိပါ။"},
|
||||
{0,"3.133","Vietnamese (quốc ngữ)", "Tôi có thể ăn thủy tinh mà không hại gì."},
|
||||
{0,"3.134","Vietnamese (nôm) (4)", "些 𣎏 世 咹 水 晶 𦓡 空 𣎏 害 咦"},
|
||||
{0,"3.135","Khmer", "ខ្ញុំអាចញុំកញ្ចក់បាន ដោយគ្មានបញ្ហារ"},
|
||||
{0,"3.136","Lao", "ຂອ້ຍກິນແກ້ວໄດ້ໂດຍທີ່ມັນບໍ່ໄດ້ເຮັດໃຫ້ຂອ້ຍເຈັບ."},
|
||||
{0,"3.137","Thai", "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ"},
|
||||
{0,"3.138","Mongolian /(Cyrillic)", "Би шил идэй чадна, надад хортой биш"},
|
||||
{0,"3.139","Mongolian /(Classic)/ (5)", "ᠪᠢ ᠰᠢᠯᠢ ᠢᠳᠡᠶᠦ ᠴᠢᠳᠠᠨᠠ ᠂ ᠨᠠᠳᠤᠷ ᠬᠣᠤᠷᠠᠳᠠᠢ ᠪᠢᠰᠢ"},
|
||||
{0,"3.140","Nepali", "म काँच खान सक्छू र मलाई केहि नी हुन्न् ।"},
|
||||
{0,"3.141","Tibetan", "ཤེལ་སྒོ་ཟ་ནས་ང་ན་གི་མ་རེད།"},
|
||||
{0,"3.142","Chinese", "我能吞下玻璃而不伤身体。"},
|
||||
{0,"3.143","Chinese (Traditional)", "我能吞下玻璃而不傷身體。"},
|
||||
{0,"3.144","Taiwanese(6)", "Góa ē-tàng chia̍h po-lê, mā bē tio̍h-siong."},
|
||||
{0,"3.145","Japanese", "私はガラスを食べられます。それは私を傷つけません。"},
|
||||
{0,"3.146","Korean", "나는 유리를 먹을 수 있어요. 그래도 아프지 않아요"},
|
||||
{0,"3.147","Bislama", "Mi save kakae glas, hemi no save katem mi."},
|
||||
{0,"3.148","Hawaiian", "Hiki iaʻu ke ʻai i ke aniani; ʻaʻole nō lā au e ʻeha."},
|
||||
{0,"3.149","Marquesan", "E koʻana e kai i te karahi, mea ʻā, ʻaʻe hauhau."},
|
||||
{0,"3.150","Inuktitut (10)", "ᐊᓕᒍᖅ ᓂᕆᔭᕌᖓᒃᑯ ᓱᕋᙱᑦᑐᓐᓇᖅᑐᖓ"},
|
||||
{0,"3.151","Chinook Jargon", "Naika məkmək kakshət labutay, pi weyk ukuk munk-sik nay."},
|
||||
{0,"3.152","Navajo", "Tsésǫʼ yishą́ągo bííníshghah dóó doo shił neezgai da."},
|
||||
{0,"3.153","Lojban", "mi kakne le nu citka le blaci .iku'i le se go'i na xrani mi"},
|
||||
{0,"3.154","Nórdicg", "Ljœr ye caudran créneþ ý jor cẃran."},
|
||||
NULLTEST
|
||||
};
|
||||
|
||||
/*
 * Second table of UTF-8 sample phrases; main() passes it to test() under
 * the title "Phrases Set 2".
 * Each entry appears to be: a leading 0, an id string ("4.x"), a
 * description (language or symbol name) and the UTF-8 text to check.
 * NOTE(review): struct Test and NULLTEST are defined outside this view;
 * NULLTEST presumably supplies the null-id entry on which the loop in
 * test() stops, and the meaning of the leading 0 is unknown -- confirm.
 */
static const struct Test utf8phrases2[] = {
|
||||
{0,"4.1","Euro Symbol", "€."},
|
||||
{0,"4.2","Greek", "Μπορώ να φάω σπασμένα γυαλιά χωρίς να πάθω τίποτα."},
|
||||
{0,"4.3","Íslenska / Icelandic", "Ég get etið gler án þess að meiða mig."},
|
||||
{0,"4.4","Polish", "Mogę jeść szkło, i mi nie szkodzi."},
|
||||
{0,"4.5","Romanian", "Pot să mănânc sticlă și ea nu mă rănește."},
|
||||
{0,"4.6","Ukrainian", "Я можу їсти шкло, й воно мені не пошкодить."},
|
||||
{0,"4.7","Armenian", "Կրնամ ապակի ուտել և ինծի անհանգիստ չըներ։"},
|
||||
{0,"4.8","Georgian", "მინას ვჭამ და არა მტკივა."},
|
||||
{0,"4.9","Hindi", "मैं काँच खा सकता हूँ, मुझे उस से कोई पीडा नहीं होती."},
|
||||
{0,"4.10", "Hebrew", "אני יכול לאכול זכוכית וזה לא מזיק לי."},
|
||||
{0,"4.11","Yiddish", "איך קען עסן גלאָז און עס טוט מיר נישט װײ."},
|
||||
{0,"4.12","Arabic", "أنا قادر على أكل الزجاج و هذا لا يؤلمني."},
|
||||
{0,"4.13","Japanese", "私はガラスを食べられます。それは私を傷つけません。"},
|
||||
{0,"4.14","Thai", "ฉันกินกระจกได้ แต่มันไม่ทำให้ฉันเจ็บ "},
|
||||
NULLTEST
|
||||
};
|
||||
|
||||
/*
 * Return a newly allocated copy of s with trailing space characters
 * (' ' only) removed.
 *
 * The result comes from strdup(); the caller owns it and must free() it.
 * Returns NULL if the copy cannot be allocated.
 * A string that is empty or consists only of spaces yields "".
 */
static char*
trim(const char* s)
{
    size_t len = strlen(s);
    char* t = strdup(s);

    if(t == NULL)
        return NULL;
    /* Walk back from the end with an unsigned length so that an empty or
       all-space string stops at 0 instead of indexing before the buffer. */
    while(len > 0 && t[len-1] == ' ')
        len--;
    t[len] = '\0';
    return t;
}
|
||||
|
||||
static int
|
||||
test(const struct Test* tests, const char* title)
|
||||
{
|
||||
int status = NC_NOERR;
|
||||
int i,failures = 0;
|
||||
const struct Test* p;
|
||||
|
||||
fprintf(stderr,"Testing %s...\n",title);
|
||||
for(p=tests;p->id;p++) {
|
||||
unsigned char* normal;
|
||||
char* id;
|
||||
char* description;
|
||||
const char* pf;
|
||||
id = trim(p->id);
|
||||
description = trim(p->description);
|
||||
/* 1. validate the string */
|
||||
status = nc_utf8_validate((const unsigned char*)p->data);
|
||||
if(status != NC_NOERR) {pf = "Fail"; failures++; goto fail;}
|
||||
/* 2. normalize the string */
|
||||
status = nc_utf8_normalize((const unsigned char*)p->data,&normal);
|
||||
if(status != NC_NOERR) {pf = "Fail"; failures++; goto fail;}
|
||||
/* 3. re-validate the normalized string */
|
||||
status = nc_utf8_validate((const unsigned char*)normal);
|
||||
if(status != NC_NOERR) {pf = "Fail"; failures++; goto fail;}
|
||||
/* 3. compare input with output */
|
||||
{
|
||||
int dlen = strlen((const char*)p->data);
|
||||
int nlen = strlen((const char*)normal);
|
||||
int mlen,i;
|
||||
if(dlen != nlen)
|
||||
fprintf(stderr,"\t%s: length mismatch: in=%d norm=%d\n",p->id,dlen,nlen);
|
||||
mlen = (dlen < nlen ? dlen : nlen);
|
||||
for(i=0;i<mlen;i++) {
|
||||
unsigned char cd = p->data[i];
|
||||
unsigned char cn = normal[i];
|
||||
if(cd != cn) {
|
||||
fprintf(stderr,"\t%s: [%d] data=|%02x| normal=|%02x|\n",p->id,i,cd,cn);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
pf = "Pass";
|
||||
fail:
|
||||
fprintf(stderr,"%s: %s %s\n",pf,id,description);
|
||||
fflush(stderr);
|
||||
}
|
||||
return failures;
|
||||
}
|
||||
|
||||
int
|
||||
main(int argc, char** argv)
|
||||
{
|
||||
int i, status;
|
||||
int failures = 0;
|
||||
int tstcnt = 0;
|
||||
|
||||
printf("\n Testing UTF-8 sequences.\n");
|
||||
failures += test(utf8currency,"Currencies");
|
||||
failures += test(utf8poems,"Poetry");
|
||||
failures += test(utf8phrases1,"Phrases Set 1");
|
||||
failures += test(utf8phrases2,"Phrases Set 2");
|
||||
fprintf(stderr,"No. of failures = %d\n",failures);
|
||||
exit(failures == 0 ? 0 : 1);
|
||||
}
|
BIN
nc_test/tst_utf8_validate.c
Normal file
BIN
nc_test/tst_utf8_validate.c
Normal file
Binary file not shown.
@ -1,5 +1,5 @@
|
||||
|
||||
#line 3 "lex.ncg.c"
|
||||
#line 3 "ncgenl.c"
|
||||
|
||||
#define YY_INT_ALIGNED short int
|
||||
|
||||
@ -72,6 +72,7 @@ typedef int flex_int32_t;
|
||||
typedef unsigned char flex_uint8_t;
|
||||
typedef unsigned short int flex_uint16_t;
|
||||
typedef unsigned int flex_uint32_t;
|
||||
#endif /* ! C99 */
|
||||
|
||||
/* Limits of integral types. */
|
||||
#ifndef INT8_MIN
|
||||
@ -102,8 +103,6 @@ typedef unsigned int flex_uint32_t;
|
||||
#define UINT32_MAX (4294967295U)
|
||||
#endif
|
||||
|
||||
#endif /* ! C99 */
|
||||
|
||||
#endif /* ! FLEXINT_H */
|
||||
|
||||
#ifdef __cplusplus
|
||||
@ -160,15 +159,7 @@ typedef unsigned int flex_uint32_t;
|
||||
|
||||
/* Size of default input buffer. */
|
||||
#ifndef YY_BUF_SIZE
|
||||
#ifdef __ia64__
|
||||
/* On IA-64, the buffer size is 16k, not 8k.
|
||||
* Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case.
|
||||
* Ditto for the __ia64__ case accordingly.
|
||||
*/
|
||||
#define YY_BUF_SIZE 32768
|
||||
#else
|
||||
#define YY_BUF_SIZE 16384
|
||||
#endif /* __ia64__ */
|
||||
#endif
|
||||
|
||||
/* The state buf must be large enough to hold one state per character in the main buffer.
|
||||
@ -1328,7 +1319,7 @@ ID ([A-Za-z_]|{UTF8})([A-Z.@#\[\]a-z_0-9+-]|{UTF8})*
|
||||
/* Note: this definition of string will work for utf8 as well,
|
||||
although it is a very relaxed definition
|
||||
*/
|
||||
#line 1332 "lex.ncg.c"
|
||||
#line 1323 "ncgenl.c"
|
||||
|
||||
#define INITIAL 0
|
||||
#define ST_C_COMMENT 1
|
||||
@ -1411,12 +1402,7 @@ static int input (void );
|
||||
|
||||
/* Amount of stuff to slurp up with each read. */
|
||||
#ifndef YY_READ_BUF_SIZE
|
||||
#ifdef __ia64__
|
||||
/* On IA-64, the buffer size is 16k, not 8k */
|
||||
#define YY_READ_BUF_SIZE 16384
|
||||
#else
|
||||
#define YY_READ_BUF_SIZE 8192
|
||||
#endif /* __ia64__ */
|
||||
#endif
|
||||
|
||||
/* Copy whatever the last rule matched to the standard output. */
|
||||
@ -1424,7 +1410,7 @@ static int input (void );
|
||||
/* This used to be an fputs(), but since the string might contain NUL's,
|
||||
* we now use fwrite().
|
||||
*/
|
||||
#define ECHO do { if (fwrite( ncgtext, ncgleng, 1, ncgout )) {} } while (0)
|
||||
#define ECHO fwrite( ncgtext, ncgleng, 1, ncgout )
|
||||
#endif
|
||||
|
||||
/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL,
|
||||
@ -1435,7 +1421,7 @@ static int input (void );
|
||||
if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
|
||||
{ \
|
||||
int c = '*'; \
|
||||
size_t n; \
|
||||
int n; \
|
||||
for ( n = 0; n < max_size && \
|
||||
(c = getc( ncgin )) != EOF && c != '\n'; ++n ) \
|
||||
buf[n] = (char) c; \
|
||||
@ -1519,7 +1505,7 @@ YY_DECL
|
||||
|
||||
#line 217 "ncgen.l"
|
||||
|
||||
#line 1523 "lex.ncg.c"
|
||||
#line 1509 "ncgenl.c"
|
||||
|
||||
if ( !(yy_init) )
|
||||
{
|
||||
@ -2132,7 +2118,7 @@ YY_RULE_SETUP
|
||||
#line 570 "ncgen.l"
|
||||
ECHO;
|
||||
YY_BREAK
|
||||
#line 2136 "lex.ncg.c"
|
||||
#line 2122 "ncgenl.c"
|
||||
case YY_STATE_EOF(INITIAL):
|
||||
case YY_STATE_EOF(TEXT):
|
||||
yyterminate();
|
||||
@ -2891,8 +2877,8 @@ YY_BUFFER_STATE ncg_scan_string (yyconst char * yystr )
|
||||
|
||||
/** Setup the input buffer state to scan the given bytes. The next call to ncglex() will
|
||||
* scan from a @e copy of @a bytes.
|
||||
* @param yybytes the byte buffer to scan
|
||||
* @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes.
|
||||
* @param bytes the byte buffer to scan
|
||||
* @param len the number of bytes in the buffer pointed to by @a bytes.
|
||||
*
|
||||
* @return the newly allocated buffer state object.
|
||||
*/
|
||||
|
2548
ncgen/ncgeny.c
2548
ncgen/ncgeny.c
File diff suppressed because it is too large
Load Diff
162
ncgen/ncgeny.h
162
ncgen/ncgeny.h
@ -1,20 +1,19 @@
|
||||
/* A Bison parser, made by GNU Bison 2.4.2. */
|
||||
/* A Bison parser, made by GNU Bison 3.0.4. */
|
||||
|
||||
/* Bison interface for Yacc-like parsers in C
|
||||
|
||||
Copyright (C) 1984, 1989-1990, 2000-2015 Free Software Foundation, Inc.
|
||||
|
||||
/* Skeleton interface for Bison's Yacc-like parsers in C
|
||||
|
||||
Copyright (C) 1984, 1989-1990, 2000-2006, 2009-2010 Free Software
|
||||
Foundation, Inc.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>. */
|
||||
|
||||
@ -27,79 +26,85 @@
|
||||
special exception, which will cause the skeleton and the resulting
|
||||
Bison output files to be licensed under the GNU General Public
|
||||
License without this special exception.
|
||||
|
||||
|
||||
This special exception was added by the Free Software Foundation in
|
||||
version 2.2 of Bison. */
|
||||
|
||||
|
||||
/* Tokens. */
|
||||
#ifndef YYTOKENTYPE
|
||||
# define YYTOKENTYPE
|
||||
/* Put the tokens into the symbol table, so that GDB and other debuggers
|
||||
know about them. */
|
||||
enum yytokentype {
|
||||
NC_UNLIMITED_K = 258,
|
||||
CHAR_K = 259,
|
||||
BYTE_K = 260,
|
||||
SHORT_K = 261,
|
||||
INT_K = 262,
|
||||
FLOAT_K = 263,
|
||||
DOUBLE_K = 264,
|
||||
UBYTE_K = 265,
|
||||
USHORT_K = 266,
|
||||
UINT_K = 267,
|
||||
INT64_K = 268,
|
||||
UINT64_K = 269,
|
||||
IDENT = 270,
|
||||
TERMSTRING = 271,
|
||||
CHAR_CONST = 272,
|
||||
BYTE_CONST = 273,
|
||||
SHORT_CONST = 274,
|
||||
INT_CONST = 275,
|
||||
INT64_CONST = 276,
|
||||
UBYTE_CONST = 277,
|
||||
USHORT_CONST = 278,
|
||||
UINT_CONST = 279,
|
||||
UINT64_CONST = 280,
|
||||
FLOAT_CONST = 281,
|
||||
DOUBLE_CONST = 282,
|
||||
DIMENSIONS = 283,
|
||||
VARIABLES = 284,
|
||||
NETCDF = 285,
|
||||
DATA = 286,
|
||||
TYPES = 287,
|
||||
COMPOUND = 288,
|
||||
ENUM = 289,
|
||||
OPAQUE = 290,
|
||||
OPAQUESTRING = 291,
|
||||
GROUP = 292,
|
||||
PATH = 293,
|
||||
FILLMARKER = 294,
|
||||
NIL = 295,
|
||||
_FILLVALUE = 296,
|
||||
_FORMAT = 297,
|
||||
_STORAGE = 298,
|
||||
_CHUNKSIZES = 299,
|
||||
_DEFLATELEVEL = 300,
|
||||
_SHUFFLE = 301,
|
||||
_ENDIANNESS = 302,
|
||||
_NOFILL = 303,
|
||||
_FLETCHER32 = 304,
|
||||
_NCPROPS = 305,
|
||||
_ISNETCDF4 = 306,
|
||||
_SUPERBLOCK = 307,
|
||||
DATASETID = 308
|
||||
};
|
||||
#ifndef YY_NCG_NCGEN_TAB_H_INCLUDED
|
||||
# define YY_NCG_NCGEN_TAB_H_INCLUDED
|
||||
/* Debug traces. */
|
||||
#ifndef YYDEBUG
|
||||
# define YYDEBUG 1
|
||||
#endif
|
||||
#if YYDEBUG
|
||||
extern int ncgdebug;
|
||||
#endif
|
||||
|
||||
/* Token type. */
|
||||
#ifndef YYTOKENTYPE
|
||||
# define YYTOKENTYPE
|
||||
enum yytokentype
|
||||
{
|
||||
NC_UNLIMITED_K = 258,
|
||||
CHAR_K = 259,
|
||||
BYTE_K = 260,
|
||||
SHORT_K = 261,
|
||||
INT_K = 262,
|
||||
FLOAT_K = 263,
|
||||
DOUBLE_K = 264,
|
||||
UBYTE_K = 265,
|
||||
USHORT_K = 266,
|
||||
UINT_K = 267,
|
||||
INT64_K = 268,
|
||||
UINT64_K = 269,
|
||||
IDENT = 270,
|
||||
TERMSTRING = 271,
|
||||
CHAR_CONST = 272,
|
||||
BYTE_CONST = 273,
|
||||
SHORT_CONST = 274,
|
||||
INT_CONST = 275,
|
||||
INT64_CONST = 276,
|
||||
UBYTE_CONST = 277,
|
||||
USHORT_CONST = 278,
|
||||
UINT_CONST = 279,
|
||||
UINT64_CONST = 280,
|
||||
FLOAT_CONST = 281,
|
||||
DOUBLE_CONST = 282,
|
||||
DIMENSIONS = 283,
|
||||
VARIABLES = 284,
|
||||
NETCDF = 285,
|
||||
DATA = 286,
|
||||
TYPES = 287,
|
||||
COMPOUND = 288,
|
||||
ENUM = 289,
|
||||
OPAQUE = 290,
|
||||
OPAQUESTRING = 291,
|
||||
GROUP = 292,
|
||||
PATH = 293,
|
||||
FILLMARKER = 294,
|
||||
NIL = 295,
|
||||
_FILLVALUE = 296,
|
||||
_FORMAT = 297,
|
||||
_STORAGE = 298,
|
||||
_CHUNKSIZES = 299,
|
||||
_DEFLATELEVEL = 300,
|
||||
_SHUFFLE = 301,
|
||||
_ENDIANNESS = 302,
|
||||
_NOFILL = 303,
|
||||
_FLETCHER32 = 304,
|
||||
_NCPROPS = 305,
|
||||
_ISNETCDF4 = 306,
|
||||
_SUPERBLOCK = 307,
|
||||
DATASETID = 308
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
/* Value type. */
|
||||
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
|
||||
typedef union YYSTYPE
|
||||
{
|
||||
|
||||
/* Line 1685 of yacc.c */
|
||||
#line 136 "ncgen.y"
|
||||
union YYSTYPE
|
||||
{
|
||||
#line 136 "ncgen.y" /* yacc.c:1909 */
|
||||
|
||||
Symbol* sym;
|
||||
unsigned long size; /* allow for zero size to indicate e.g. UNLIMITED*/
|
||||
@ -108,16 +113,17 @@ int nctype; /* for tracking attribute list type*/
|
||||
Datalist* datalist;
|
||||
NCConstant constant;
|
||||
|
||||
#line 117 "ncgeny.h" /* yacc.c:1909 */
|
||||
};
|
||||
|
||||
|
||||
/* Line 1685 of yacc.c */
|
||||
#line 115 "ncgen.tab.h"
|
||||
} YYSTYPE;
|
||||
typedef union YYSTYPE YYSTYPE;
|
||||
# define YYSTYPE_IS_TRIVIAL 1
|
||||
# define yystype YYSTYPE /* obsolescent; will be withdrawn */
|
||||
# define YYSTYPE_IS_DECLARED 1
|
||||
#endif
|
||||
|
||||
|
||||
extern YYSTYPE ncglval;
|
||||
|
||||
int ncgparse (void);
|
||||
|
||||
#endif /* !YY_NCG_NCGEN_TAB_H_INCLUDED */
|
||||
|
Loading…
x
Reference in New Issue
Block a user