mirror of
https://git.openldap.org/openldap/openldap.git
synced 2025-01-18 11:05:48 +08:00
Add UTF-8 wc/mb conversion routines contributed by Novell.
/****************************************************************************** * Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved. * * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. ******************************************************************************/
This commit is contained in:
parent
2f8f8b588e
commit
5082731e24
291
doc/devel/utfconv.txt
Normal file
291
doc/devel/utfconv.txt
Normal file
@ -0,0 +1,291 @@
|
|||||||
|
Dec 5, 2000
|
||||||
|
Dave Steck
|
||||||
|
Novell, Inc.
|
||||||
|
|
||||||
|
UTF-8 Conversion Functions
|
||||||
|
|
||||||
|
|
||||||
|
1. Strings in the LDAP C SDK should be encoded in UTF-8 format.
|
||||||
|
However, most platforms do not provide APIs for converting to
|
||||||
|
this format. If they do, they are platform-specific.
|
||||||
|
|
||||||
|
As a result, most applications (knowingly or not) use local strings
|
||||||
|
with LDAP functions. This works fine for 7-bit ASCII characters,
|
||||||
|
but will fail with 8-bit European characters, Asian characters, etc.
|
||||||
|
|
||||||
|
We propose adding the following platform-independent conversion functions
|
||||||
|
to the OpenLDAP SDK. There are 4 functions for converting between UTF-8
|
||||||
|
and wide characters, and 4 functions for converting between UTF-8 and
|
||||||
|
multibyte characters.
|
||||||
|
|
||||||
|
For multibyte to UTF-8 conversions, charset translation is necessary.
|
||||||
|
While a full charset translator is not practical or appropriate for the
|
||||||
|
LDAP SDK, we can pass the translator function in as an argument.
|
||||||
|
A NULL for this argument will use the ANSI C functions mbtowc, mbstowcs,
|
||||||
|
wctomb, and wcstombs.
|
||||||
|
|
||||||
|
2. UTF-8 <--> Wide Character conversions
|
||||||
|
|
||||||
|
The following new conversion routines will be added, following the pattern of
|
||||||
|
the ANSI C conversion routines (mbtowc, mbstowcs, etc). These routines use
|
||||||
|
the wchar_t type. wchar_t is 2 bytes on some systems and 4 bytes on others.
|
||||||
|
However the advantage of using wchar_t is that all the standard wide character
|
||||||
|
string functions may be used on these strings: wcslen, wcscpy, etc.
|
||||||
|
|
||||||
|
int ldap_x_utf8_to_wc - Convert a single UTF-8 encoded character to a wide character.
|
||||||
|
int ldap_x_utf8s_to_wcs - Convert a UTF-8 string to a wide character string.
|
||||||
|
int ldap_x_wc_to_utf8 - Convert a single wide character to a UTF-8 sequence.
|
||||||
|
int ldap_x_wcs_to_utf8s - Convert a wide character string to a UTF-8 string.
|
||||||
|
|
||||||
|
|
||||||
|
2.1 ldap_x_utf8_to_wc - Convert a single UTF-8 encoded character to a wide character.
|
||||||
|
|
||||||
|
int ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
|
||||||
|
|
||||||
|
wchar (OUT) Points to a wide character code to receive the
|
||||||
|
converted character.
|
||||||
|
|
||||||
|
utf8char (IN) Address of the UTF8 sequence of bytes.
|
||||||
|
|
||||||
|
Return Value:
|
||||||
|
If successful, the function returns the length in
|
||||||
|
bytes of the UTF-8 input character.
|
||||||
|
|
||||||
|
If utf8char is NULL or points to an empty string, the
|
||||||
|
function returns 1 and a NULL is written to wchar.
|
||||||
|
|
||||||
|
If utf8char contains an invalid UTF-8 sequence -1 is returned.
|
||||||
|
|
||||||
|
|
||||||
|
2.2 ldap_x_utf8s_to_wcs - Convert a UTF-8 string to a wide character string.
|
||||||
|
|
||||||
|
int ldap_x_utf8s_to_wcs (wchar_t *wcstr, const char *utf8str, size_t count)
|
||||||
|
|
||||||
|
wcstr (OUT) Points to a wide char buffer to receive the
|
||||||
|
converted wide char string. The output string will be
|
||||||
|
null terminated if there is space for it in the
|
||||||
|
buffer.
|
||||||
|
|
||||||
|
utf8str (IN) Address of the null-terminated UTF-8 string to convert.
|
||||||
|
|
||||||
|
count (IN) The number of UTF-8 characters to convert, or
|
||||||
|
equivalently, the size of the output buffer in wide
|
||||||
|
characters.
|
||||||
|
|
||||||
|
Return Value:
|
||||||
|
If successful, the function returns the number of wide
|
||||||
|
characters written to wcstr, excluding the null termination
|
||||||
|
character, if any.
|
||||||
|
|
||||||
|
If wcstr is NULL, the function returns the number of wide
|
||||||
|
characters required to contain the converted string,
|
||||||
|
excluding the null termination character.
|
||||||
|
|
||||||
|
If an invalid UTF-8 sequence is encountered, the
|
||||||
|
function returns -1.
|
||||||
|
|
||||||
|
If the return value equals count, there was not enough space to fit the
|
||||||
|
string and the null terminator in the buffer.
|
||||||
|
|
||||||
|
|
||||||
|
2.3 ldap_x_wc_to_utf8 - Convert a single wide character to a UTF-8 sequence.
|
||||||
|
|
||||||
|
int ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
|
||||||
|
|
||||||
|
utf8char (OUT) Points to a byte array to receive the converted UTF-8
|
||||||
|
string.
|
||||||
|
|
||||||
|
wchar (IN) The wide character to convert.
|
||||||
|
|
||||||
|
count (IN) The maximum number of bytes to write to the output
|
||||||
|
buffer. Normally set this to LDAP_MAX_UTF8_LEN, which
|
||||||
|
is defined as 3 or 6 depending on the size of wchar_t.
|
||||||
|
A partial character will not be written.
|
||||||
|
|
||||||
|
Return Value:
|
||||||
|
If successful, the function returns the length in bytes of
|
||||||
|
the converted UTF-8 output character.
|
||||||
|
|
||||||
|
If wchar is NULL, the function returns 1 and a NULL is
|
||||||
|
written to utf8char.
|
||||||
|
|
||||||
|
If wchar cannot be converted to a UTF-8 character, the
|
||||||
|
function returns -1.
|
||||||
|
|
||||||
|
|
||||||
|
2.4 ldap_x_wcs_to_utf8s - Convert a wide character string to a UTF-8 string.
|
||||||
|
|
||||||
|
int ldap_x_wcs_to_utf8s (char *utf8str, const wchar_t *wcstr, size_t count)
|
||||||
|
|
||||||
|
utf8str (OUT) Points to a byte array to receive the converted
|
||||||
|
UTF-8 string. The output string will be null
|
||||||
|
terminated if there is space for it in the
|
||||||
|
buffer.
|
||||||
|
|
||||||
|
|
||||||
|
wcstr (IN) Address of the null-terminated wide char string to convert.
|
||||||
|
|
||||||
|
count (IN) The size of the output buffer in bytes.
|
||||||
|
|
||||||
|
Return Value:
|
||||||
|
If successful, the function returns the number of bytes
|
||||||
|
written to utf8str, excluding the null termination
|
||||||
|
character, if any.
|
||||||
|
|
||||||
|
If utf8str is NULL, the function returns the number of
|
||||||
|
bytes required to contain the converted string, excluding
|
||||||
|
the null termination character. The 'count' parameter is ignored.
|
||||||
|
|
||||||
|
If the function encounters a wide character that cannot
|
||||||
|
be mapped to a UTF-8 sequence, the function returns -1.
|
||||||
|
|
||||||
|
If the return value equals count, there was not enough space to fit
|
||||||
|
the string and the null terminator in the buffer.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
3. Multi-byte <--> UTF-8 Conversions
|
||||||
|
|
||||||
|
These functions convert the string in a two-step process, from multibyte
|
||||||
|
to Wide, then from Wide to UTF8, or vice versa. This conversion requires a
|
||||||
|
charset translation routine, which is passed in as an argument.
|
||||||
|
|
||||||
|
ldap_x_mb_to_utf8 - Convert a multi-byte character to a UTF-8 character.
|
||||||
|
ldap_x_mbs_to_utf8s - Convert a multi-byte string to a UTF-8 string.
|
||||||
|
ldap_x_utf8_to_mb - Convert a UTF-8 character to a multi-byte character.
|
||||||
|
ldap_x_utf8s_to_mbs - Convert a UTF-8 string to a multi-byte string.
|
||||||
|
|
||||||
|
3.1 ldap_x_mb_to_utf8 - Convert a multi-byte character to a UTF-8 character.
|
||||||
|
|
||||||
|
int ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize, int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
|
||||||
|
|
||||||
|
utf8char (OUT) Points to a byte buffer to receive the converted
|
||||||
|
UTF-8 character. May be NULL. The output is not
|
||||||
|
null-terminated.
|
||||||
|
|
||||||
|
mbchar (IN) Address of a sequence of bytes forming a multibyte character.
|
||||||
|
|
||||||
|
mbsize (IN) The maximum number of bytes of the mbchar argument to
|
||||||
|
check. This should normally be MB_CUR_MAX.
|
||||||
|
|
||||||
|
f_mbtowc (IN) The function to use for converting a multibyte
|
||||||
|
character to a wide character. If NULL, the local
|
||||||
|
ANSI C routine mbtowc is used.
|
||||||
|
|
||||||
|
Return Value:
|
||||||
|
If successful, the function returns the length in bytes of
|
||||||
|
the UTF-8 output character.
|
||||||
|
|
||||||
|
If utf8char is NULL, count is ignored and the function
|
||||||
|
returns the number of bytes that would be written to the
|
||||||
|
output char.
|
||||||
|
|
||||||
|
If count is zero, 0 is returned and nothing is written to
|
||||||
|
utf8char.
|
||||||
|
|
||||||
|
If mbchar is NULL or points to an empty string, the
|
||||||
|
function returns 1 and a null byte is written to utf8char.
|
||||||
|
|
||||||
|
If mbchar contains an invalid multi-byte character, -1 is returned.
|
||||||
|
|
||||||
|
|
||||||
|
3.2 ldap_x_mbs_to_utf8s - Convert a multi-byte string to a UTF-8 string.
|
||||||
|
|
||||||
|
int ldap_x_mbs_to_utf8s (char *utf8str, const char *mbstr, size_t count,
|
||||||
|
size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count))
|
||||||
|
|
||||||
|
utf8str (OUT) Points to a buffer to receive the converted UTF-8 string.
|
||||||
|
May be NULL.
|
||||||
|
|
||||||
|
mbstr (IN) Address of the null-terminated multi-byte input string.
|
||||||
|
|
||||||
|
count (IN) The size of the output buffer in bytes.
|
||||||
|
|
||||||
|
f_mbstowcs (IN) The function to use for converting a multibyte string
|
||||||
|
to a wide character string. If NULL, the local ANSI
|
||||||
|
C routine mbstowcs is used.
|
||||||
|
|
||||||
|
Return Value:
|
||||||
|
If successful, the function returns the length in
|
||||||
|
bytes of the UTF-8 output string, excluding the null
|
||||||
|
terminator, if present.
|
||||||
|
|
||||||
|
If utf8str is NULL, count is ignored and the function
|
||||||
|
returns the number of bytes required for the output string,
|
||||||
|
excluding the NULL.
|
||||||
|
|
||||||
|
If count is zero, 0 is returned and nothing is written to utf8str.
|
||||||
|
|
||||||
|
If mbstr is NULL or points to an empty string, the
|
||||||
|
function returns 1 and a null byte is written to utf8str.
|
||||||
|
|
||||||
|
If mbstr contains an invalid multi-byte character, -1 is returned.
|
||||||
|
|
||||||
|
If the returned value is equal to count, the entire null-terminated
|
||||||
|
string would not fit in the output buffer.
|
||||||
|
|
||||||
|
|
||||||
|
3.3 ldap_x_utf8_to_mb - Convert a UTF-8 character to a multi-byte character.
|
||||||
|
|
||||||
|
int ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
|
||||||
|
int (*f_wctomb)(char *mbchar, wchar_t wchar) )
|
||||||
|
|
||||||
|
mbchar (OUT) Points to a byte buffer to receive the converted multi-byte
|
||||||
|
character. May be NULL.
|
||||||
|
|
||||||
|
utf8char (IN) Address of the UTF-8 character sequence.
|
||||||
|
|
||||||
|
f_wctomb (IN) The function to use for converting a wide character
|
||||||
|
to a multibyte character. If NULL, the local
|
||||||
|
ANSI C routine wctomb is used.
|
||||||
|
|
||||||
|
|
||||||
|
Return Value:
|
||||||
|
If successful, the function returns the length in
|
||||||
|
bytes of the multi-byte output character.
|
||||||
|
|
||||||
|
If utf8char is NULL or points to an empty string, the
|
||||||
|
function returns 1 and a null byte is written to mbchar.
|
||||||
|
|
||||||
|
If utf8char contains an invalid UTF-8 sequence, -1 is returned.
|
||||||
|
|
||||||
|
|
||||||
|
3.4 ldap_x_utf8s_to_mbs - Convert a UTF-8 string to a multi-byte string.
|
||||||
|
|
||||||
|
|
||||||
|
int ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
|
||||||
|
size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
|
||||||
|
|
||||||
|
mbstr (OUT) Points to a byte buffer to receive the converted
|
||||||
|
multi-byte string. May be NULL.
|
||||||
|
|
||||||
|
utf8str (IN) Address of the null-terminated UTF-8 string to convert.
|
||||||
|
|
||||||
|
count (IN) The size of the output buffer in bytes.
|
||||||
|
|
||||||
|
f_wcstombs (IN) The function to use for converting a wide character
|
||||||
|
string to a multibyte string. If NULL, the local
|
||||||
|
ANSI C routine wcstombs is used.
|
||||||
|
|
||||||
|
Return Value:
|
||||||
|
If successful, the function returns the number of bytes
|
||||||
|
written to mbstr, excluding the null termination
|
||||||
|
character, if any.
|
||||||
|
|
||||||
|
If mbstr is NULL, count is ignored and the function
|
||||||
|
returns the number of bytes required for the output string,
|
||||||
|
excluding the NULL.
|
||||||
|
|
||||||
|
If count is zero, 0 is returned and nothing is written to
|
||||||
|
mbstr.
|
||||||
|
|
||||||
|
If utf8str is NULL or points to an empty string, the
|
||||||
|
function returns 1 and a null byte is written to mbstr.
|
||||||
|
|
||||||
|
If an invalid UTF-8 character is encountered, the
|
||||||
|
function returns -1.
|
||||||
|
|
||||||
|
The output string will be null terminated if there is space for it in
|
||||||
|
the output buffer.
|
||||||
|
|
||||||
|
|
86
include/ldap_utf8.h
Normal file
86
include/ldap_utf8.h
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
/* $OpenLDAP$ */
|
||||||
|
/* $Novell: /ldap/src/cldap/include/ldap_utf8.h,v 1.3 2000/12/04 20:23:20 dsteck Exp $ */
|
||||||
|
/*
|
||||||
|
* Copyright 2000 The OpenLDAP Foundation, Redwood City, California, USA
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms are permitted only
|
||||||
|
* as authorized by the OpenLDAP Public License. A copy of this
|
||||||
|
* license is available at http://www.OpenLDAP.org/license.html or
|
||||||
|
* in file LICENSE in the top-level directory of the distribution.
|
||||||
|
*/
|
||||||
|
/******************************************************************************
|
||||||
|
* This notice applies to changes, created by or for Novell, Inc.,
|
||||||
|
* to preexisting works for which notices appear elsewhere in this file.
|
||||||
|
*
|
||||||
|
* Copyright (C) 2000 Novell, Inc. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND TREATIES.
|
||||||
|
* USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT TO VERSION
|
||||||
|
* 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS AVAILABLE AT
|
||||||
|
* HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" IN THE
|
||||||
|
* TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION OF THIS
|
||||||
|
* WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP PUBLIC
|
||||||
|
* LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT THE
|
||||||
|
* PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
|
||||||
|
******************************************************************************/
|
||||||
|
|
||||||
|
#ifndef _LDAP_UTF8_H
|
||||||
|
#define _LDAP_UTF8_H
|
||||||
|
|
||||||
|
LDAP_BEGIN_DECL
|
||||||
|
|
||||||
|
/*
|
||||||
|
* UTF-8 Utility Routines (in utf-8.c)
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Value with bit 31 set; it lies outside the 31-bit ISO-10646 code space. */
#define LDAP_UCS4_INVALID (0x80000000U)

/*
 * LDAP_MAX_UTF8_LEN is 3 or 6 depending on size of wchar_t
 * (3 when sizeof(wchar_t) is 2, 6 when it is 4).
 * The expansion is fully parenthesized so the macro can be used safely
 * inside larger expressions (e.g. as a divisor or multiplicand).
 */
#define LDAP_MAX_UTF8_LEN ( sizeof(wchar_t) * 3/2 )
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* UTF-8 Conversion Routines. (in utf-8-conv.c)
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* UTF-8 character to Wide Char */
|
||||||
|
LDAP_F(int)
|
||||||
|
ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char );
|
||||||
|
|
||||||
|
/* UTF-8 string to Wide Char string */
|
||||||
|
LDAP_F(int)
|
||||||
|
ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count );
|
||||||
|
|
||||||
|
/* Wide Char to UTF-8 character */
|
||||||
|
LDAP_F(int)
|
||||||
|
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count );
|
||||||
|
|
||||||
|
/* Wide Char string to UTF-8 string */
|
||||||
|
LDAP_F(int)
|
||||||
|
ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count );
|
||||||
|
|
||||||
|
|
||||||
|
/* UTF-8 character to MultiByte character */
|
||||||
|
LDAP_F(int)
|
||||||
|
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
|
||||||
|
int (*f_wctomb)(char *mbchar, wchar_t wchar) );
|
||||||
|
|
||||||
|
/* UTF-8 string to MultiByte string */
|
||||||
|
LDAP_F(int)
|
||||||
|
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
|
||||||
|
size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) );
|
||||||
|
|
||||||
|
/* MultiByte character to UTF-8 character */
|
||||||
|
LDAP_F(int)
|
||||||
|
ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
|
||||||
|
int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) );
|
||||||
|
|
||||||
|
/* MultiByte string to UTF-8 string */
|
||||||
|
LDAP_F(int)
|
||||||
|
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
|
||||||
|
size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) );
|
||||||
|
|
||||||
|
LDAP_END_DECL
|
||||||
|
|
||||||
|
#endif /* _LDAP_UTF8_H */
|
@ -18,7 +18,7 @@ SRCS = bind.c open.c result.c error.c compare.c search.c \
|
|||||||
request.c os-ip.c url.c sortctrl.c vlvctrl.c \
|
request.c os-ip.c url.c sortctrl.c vlvctrl.c \
|
||||||
init.c options.c print.c string.c util-int.c schema.c \
|
init.c options.c print.c string.c util-int.c schema.c \
|
||||||
charray.c tls.c dn.c os-local.c dnssrv.c \
|
charray.c tls.c dn.c os-local.c dnssrv.c \
|
||||||
utf-8.c
|
utf-8.c utf-8-conv.c
|
||||||
OBJS = bind.lo open.lo result.lo error.lo compare.lo search.lo \
|
OBJS = bind.lo open.lo result.lo error.lo compare.lo search.lo \
|
||||||
controls.lo messages.lo references.lo extended.lo cyrus.lo \
|
controls.lo messages.lo references.lo extended.lo cyrus.lo \
|
||||||
modify.lo add.lo modrdn.lo delete.lo abandon.lo ufn.lo cache.lo \
|
modify.lo add.lo modrdn.lo delete.lo abandon.lo ufn.lo cache.lo \
|
||||||
@ -28,7 +28,7 @@ OBJS = bind.lo open.lo result.lo error.lo compare.lo search.lo \
|
|||||||
request.lo os-ip.lo url.lo sortctrl.lo vlvctrl.lo \
|
request.lo os-ip.lo url.lo sortctrl.lo vlvctrl.lo \
|
||||||
init.lo options.lo print.lo string.lo util-int.lo schema.lo \
|
init.lo options.lo print.lo string.lo util-int.lo schema.lo \
|
||||||
charray.lo tls.lo dn.lo os-local.lo dnssrv.lo \
|
charray.lo tls.lo dn.lo os-local.lo dnssrv.lo \
|
||||||
utf-8.lo
|
utf-8.lo utf-8-conv.lo
|
||||||
|
|
||||||
LDAP_INCDIR= ../../include
|
LDAP_INCDIR= ../../include
|
||||||
LDAP_LIBDIR= ../../libraries
|
LDAP_LIBDIR= ../../libraries
|
||||||
|
464
libraries/libldap/utf-8-conv.c
Normal file
464
libraries/libldap/utf-8-conv.c
Normal file
@ -0,0 +1,464 @@
|
|||||||
|
/* $OpenLDAP$ */
|
||||||
|
/*
|
||||||
|
* Copyright 2000 The OpenLDAP Foundation, All Rights Reserved.
|
||||||
|
* COPYING RESTRICTIONS APPLY, see COPYRIGHT file
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* $Novell: /ldap/src/cldap/libraries/libldap/utfconv.c,v 1.3 2000/12/11 19:35:37 dsteck Exp $ */
|
||||||
|
/******************************************************************************
|
||||||
|
* Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
|
||||||
|
*
|
||||||
|
* THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
|
||||||
|
* TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
|
||||||
|
* TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
|
||||||
|
* AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
|
||||||
|
* IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
|
||||||
|
* OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
|
||||||
|
* PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
|
||||||
|
* THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
|
||||||
|
******************************************************************************/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* UTF-8 Conversion Routines
|
||||||
|
*
|
||||||
|
* These routines convert between Wide Character and UTF-8,
|
||||||
|
* or between MultiByte and UTF-8 encodings.
|
||||||
|
*
|
||||||
|
* Both single character and string versions of the functions are provided.
|
||||||
|
* All functions return -1 if the character or string cannot be converted.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "portable.h"

#include <stdio.h>
#include <limits.h>				/* For MB_LEN_MAX */
#include <ac/stdlib.h>			/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>			/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
|
||||||
|
|
||||||
|
/*
 * Data-bit masks for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes (1..6).  Entry 0 is a placeholder so that the
 * length can be used directly as the index.  The table is read-only.
 */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
|
||||||
|
|
||||||
|
|
||||||
|
/*-----------------------------------------------------------------------------
|
||||||
|
UTF-8 Format Summary
|
||||||
|
|
||||||
|
ASCII chars 7 bits
|
||||||
|
0xxxxxxx
|
||||||
|
|
||||||
|
2-character UTF-8 sequence: 11 bits
|
||||||
|
110xxxxx 10xxxxxx
|
||||||
|
|
||||||
|
3-character UTF-8 16 bits
|
||||||
|
1110xxxx 10xxxxxx 10xxxxxx
|
||||||
|
|
||||||
|
4-char UTF-8 21 bits
|
||||||
|
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
|
|
||||||
|
5-char UTF-8 26 bits
|
||||||
|
111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
|
|
||||||
|
6-char UTF-8 31 bits
|
||||||
|
1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
|
|
||||||
|
Unicode address space (0 - 0x10FFFF) 21 bits
|
||||||
|
ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits
|
||||||
|
|
||||||
|
Note: This code does not prevent UTF-8 sequences which are longer than
|
||||||
|
necessary from being decoded.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*------------------------------------------------------------------------------
|
||||||
|
Convert a UTF-8 character to a wide char.
|
||||||
|
Return the length of the UTF-8 input character in bytes.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
|
||||||
|
{
|
||||||
|
int utflen, i;
|
||||||
|
wchar_t ch;
|
||||||
|
|
||||||
|
/* If input ptr is NULL, treat it as empty string. */
|
||||||
|
if (utf8char == NULL)
|
||||||
|
utf8char = "";
|
||||||
|
|
||||||
|
/* Get UTF-8 sequence length from 1st byte */
|
||||||
|
utflen = UTF8_CHARLEN(utf8char);
|
||||||
|
|
||||||
|
if( utflen==0 || utflen > LDAP_MAX_UTF8_LEN )
|
||||||
|
return -1; /* Invalid input */
|
||||||
|
|
||||||
|
/* First byte minus length tag */
|
||||||
|
ch = (wchar_t)(utf8char[0] & mask[utflen]);
|
||||||
|
|
||||||
|
for(i=1; i < utflen; i++)
|
||||||
|
{
|
||||||
|
/* Subsequent bytes must start with 10 */
|
||||||
|
if ((utf8char[i] & 0xc0) != 0x80)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
ch <<= 6; /* 6 bits of data in each subsequent byte */
|
||||||
|
ch |= (wchar_t)(utf8char[i] & 0x3f);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wchar)
|
||||||
|
*wchar = ch;
|
||||||
|
|
||||||
|
return utflen;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a wide char string.
   No more than 'count' wide chars will be written to the output buffer.

   wcstr    (OUT) Buffer for the wide char string.  May be NULL, in which
                  case 'count' is ignored and only the required length is
                  computed.
   utf8str  (IN)  Null-terminated UTF-8 string.  NULL is treated as "".
   count    (IN)  Size of wcstr in wide chars.

   Return the size of the converted string in wide chars, excl null terminator.
   Return -1 if an invalid UTF-8 sequence is encountered.
   The output is null terminated only if there is room; a return value equal
   to 'count' therefore means the terminator did not fit.
*/
int
ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
{
	size_t wclen = 0;
	int utflen;
	wchar_t ch;

	/* If input ptr is NULL, treat it as empty string. */
	if (utf8str == NULL)
		utf8str = "";

	/* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
	while ( *utf8str && (wcstr==NULL || wclen<count) )
	{
		/* Decode one character with the single-character routine rather
		   than repeating its validation logic here. */
		utflen = ldap_x_utf8_to_wc( &ch, utf8str );

		if( utflen == -1 )
			return -1;					/* Invalid input */

		if (wcstr)
			wcstr[wclen] = ch;

		utf8str += utflen;		/* Move to next UTF-8 character */
		wclen++;				/* Count number of wide chars stored/required */
	}

	/* Add null terminator if there's room in the buffer. */
	if (wcstr && wclen < count)
		wcstr[wclen] = 0;

	return wclen;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*------------------------------------------------------------------------------
   Return the number of bytes needed to encode 'wchar' in UTF-8 (1..6),
   or -1 if the value is negative or does not fit in 31 bits.
*/
static int
wc_utf8_len ( wchar_t wchar )
{
	if( wchar < 0 )
		return -1;
	if( wchar < 0x80 )
		return 1;
	if( wchar < 0x800 )
		return 2;
	if( wchar < 0x10000 )
		return 3;
	if( wchar < 0x200000 )
		return 4;
	if( wchar < 0x4000000 )
		return 5;
	if( wchar < 0x80000000 )
		return 6;
	return -1;
}

/*------------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.
   No more than 'count' bytes will be written to the output buffer.

   utf8char  (OUT) Buffer for the UTF-8 bytes.  May be NULL, in which case
                   'count' is ignored and only the required length is
                   returned.  The output is not null terminated.
   wchar     (IN)  The wide character to convert.
   count     (IN)  Size of utf8char in bytes.

   Return the length of the converted UTF-8 character in bytes.
   Return 0, writing nothing, if the character does not fit in 'count' bytes.
   Return -1 if wchar cannot be represented in UTF-8.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag for the first byte, indexed by sequence length. */
	static const unsigned char lead[] = { 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	unsigned long bits;
	int len, i;

	len = wc_utf8_len( wchar );
	if (len == -1)				/* Invalid wide character */
		return -1;

	if (utf8char == NULL)		/* Just report the required length. */
		return len;

	if (count < (size_t)len)	/* Never write a partial character. */
		return 0;

	/* Fill the continuation bytes from last to first, 6 bits at a time;
	   whatever remains belongs in the first byte under its length tag. */
	bits = (unsigned long)wchar;
	for (i = len - 1; i > 0; i--) {
		utf8char[i] = (char)(0x80 | (bits & 0x3f));
		bits >>= 6;
	}
	utf8char[0] = (char)(lead[len] | bits);

	return len;
}
|
||||||
|
|
||||||
|
|
||||||
|
/*-----------------------------------------------------------------------------
|
||||||
|
Convert a wide char string to a UTF-8 string.
|
||||||
|
No more than 'count' bytes will be written to the output buffer.
|
||||||
|
Return the # of bytes written to the output buffer, excl null terminator.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
|
||||||
|
{
|
||||||
|
int len = 0;
|
||||||
|
int n;
|
||||||
|
char *p = utf8str;
|
||||||
|
wchar_t empty = 0; /* To avoid use of L"" construct */
|
||||||
|
|
||||||
|
if (wcstr == NULL) /* Treat input ptr NULL as an empty string */
|
||||||
|
wcstr = ∅
|
||||||
|
|
||||||
|
if (utf8str == NULL) /* Just compute size of output, excl null */
|
||||||
|
{
|
||||||
|
while (*wcstr)
|
||||||
|
{
|
||||||
|
/* Get UTF-8 size of next wide char */
|
||||||
|
n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
|
||||||
|
if (n == -1)
|
||||||
|
return -1;
|
||||||
|
len += n;
|
||||||
|
}
|
||||||
|
|
||||||
|
return len;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Do the actual conversion. */
|
||||||
|
|
||||||
|
n = 1; /* In case of empty wcstr */
|
||||||
|
while (*wcstr)
|
||||||
|
{
|
||||||
|
n = ldap_x_wc_to_utf8( p, *wcstr++, count);
|
||||||
|
|
||||||
|
if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */
|
||||||
|
break;
|
||||||
|
|
||||||
|
p += n;
|
||||||
|
count -= n; /* Space left in output buffer */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If not enough room for last character, pad remainder with null
|
||||||
|
so that return value = original count, indicating buffer full. */
|
||||||
|
if (n == 0)
|
||||||
|
{
|
||||||
|
while (count--)
|
||||||
|
*p++ = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Add a null terminator if there's room. */
|
||||||
|
else if (count)
|
||||||
|
*p = 0;
|
||||||
|
|
||||||
|
if (n == -1) /* Conversion encountered invalid wide char. */
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
/* Return the number of bytes written to output buffer, excl null. */
|
||||||
|
return (p - utf8str);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    (OUT) Buffer for the multibyte character.  May be NULL, in
                   which case the result is written to a local scratch
                   buffer and only its size is reported.
   utf8char  (IN)  Start of the UTF-8 sequence.
   f_wctomb  (IN)  Wide char to multibyte converter.  If NULL, the local
                   ANSI C routine wctomb is used.

   Return the size of the converted character in bytes, as reported by
   f_wctomb.  Return -1 if the UTF-8 sequence is invalid.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* Scratch output for the mbchar == NULL case.  wctomb may write up to
	   MB_LEN_MAX bytes, so size the buffer for that, but never smaller
	   than the 6 bytes it has always provided to caller-supplied
	   converters. */
	char tmp[MB_LEN_MAX > 6 ? MB_LEN_MAX : 6];

	if (f_wctomb == NULL)		/* If no conversion function was given... */
		f_wctomb = wctomb;		/*    use the local ANSI C function */

	/* First convert UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char);

	if (n == -1)
		return -1;		/* Invalid UTF-8 character */

	/* Then convert the wide char to a multibyte char */
	if (mbchar == NULL)
		n = f_wctomb( tmp, wchar );
	else
		n = f_wctomb( mbchar, wchar);

	return n;
}
|
||||||
|
|
||||||
|
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string.
   No more than 'count' bytes will be written to the output buffer.

   mbstr       Output buffer; passed straight through to f_wcstombs.
   utf8str     Input UTF-8 string.  NULL is treated like an empty string.
   count       Maximum number of bytes that may be stored in mbstr.
   f_wcstombs  Wide string -> multibyte string converter.  If NULL, the
               ANSI C wcstombs() is used.

   Return the size of the converted string in bytes, excl null terminator
   (the value returned by f_wcstombs, converted to int).  Returns 0 for a
   NULL or empty input string, and -1 if memory cannot be allocated or
   ldap_x_utf8s_to_wcs() fails.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wcs;
	size_t wcsize;
	int n;

	if (f_wcstombs == NULL)		/* If no conversion function was given... */
		f_wcstombs = wcstombs;	/* use the local ANSI C function */

	if (utf8str == NULL || *utf8str == 0)	/* NULL or empty input string */
	{
		/* Honour the 'count' limit: only store the terminator when the
		   caller has granted room for at least one byte. */
		if (mbstr && count)
			*mbstr = 0;
		return 0;
	}

	/* Allocate memory for the maximum size wchar string that we could get:
	   one wide char per input byte, plus one for the terminator. */
	wcsize = strlen(utf8str) + 1;
	wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
	if (wcs == NULL)
		return -1;				/* Memory allocation failure. */

	/* First convert the UTF-8 string to a wide char string */
	n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize );

	/* Then convert wide char string to multi-byte string.  A failure
	   value of (size_t)-1 from f_wcstombs becomes -1 when narrowed. */
	if (n != -1)
		n = (int) f_wcstombs( mbstr, wcs, count );

	LDAP_FREE(wcs);

	return n;
}
|
||||||
|
|
||||||
|
/*-----------------------------------------------------------------------------
|
||||||
|
Convert a MultiByte character to a UTF-8 character.
|
||||||
|
'mbsize' indicates the number of bytes of 'mbchar' to check.
|
||||||
|
Returns the number of bytes written to the output character.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
|
||||||
|
int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
|
||||||
|
{
|
||||||
|
wchar_t wchar;
|
||||||
|
int n;
|
||||||
|
|
||||||
|
if (f_mbtowc == NULL) /* If no conversion function was given... */
|
||||||
|
f_mbtowc = mbtowc; /* use the local ANSI C function */
|
||||||
|
|
||||||
|
if (mbsize == 0) /* 0 is not valid. */
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
if (mbchar == NULL || *mbchar == 0)
|
||||||
|
{
|
||||||
|
if (utf8char)
|
||||||
|
*utf8char = 0;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* First convert the MB char to a Wide Char */
|
||||||
|
n = f_mbtowc( &wchar, mbchar, mbsize);
|
||||||
|
|
||||||
|
if (n == -1)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
/* Convert the Wide Char to a UTF-8 character. */
|
||||||
|
n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
|
||||||
|
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string.
   No more than 'count' bytes will be written to the output buffer.

   utf8str     Output buffer; passed straight to ldap_x_wcs_to_utf8s().
   mbstr       Input multibyte string.  NULL is treated as an empty string.
   count       Output size limit, forwarded to ldap_x_wcs_to_utf8s().
   f_mbstowcs  Multibyte string -> wide string converter.  If NULL, the
               ANSI C mbstowcs() is used.

   Return the size of the converted string in bytes, excl null terminator
   (the value returned by ldap_x_wcs_to_utf8s()).  Returns -1 if memory
   cannot be allocated or f_mbstowcs reports an error.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	wchar_t *wide;
	size_t nwide;
	int rc;

	/* Fall back to the local ANSI C converter when none was supplied. */
	if (f_mbstowcs == NULL)
		f_mbstowcs = mbstowcs;

	/* A NULL input behaves exactly like "". */
	if (mbstr == NULL)
		mbstr = "";

	/* Worst case is one wide char per input byte, plus the terminator. */
	nwide = strlen(mbstr) + 1;
	wide = (wchar_t *)LDAP_MALLOC( nwide * sizeof(wchar_t) );
	if (wide == NULL)
		return -1;

	/* Step 1: multibyte string -> wide char string. */
	rc = f_mbstowcs( wide, mbstr, nwide );

	/* Step 2: wide char string -> UTF-8, unless step 1 failed. */
	if (rc != -1)
		rc = ldap_x_wcs_to_utf8s( utf8str, wide, count );

	LDAP_FREE(wide);

	return rc;
}
|
Loading…
Reference in New Issue
Block a user