mirror of
https://git.openldap.org/openldap/openldap.git
synced 2025-01-18 11:05:48 +08:00
Add UTF-8 wc/mb conversion routines contributed by Novell.
/****************************************************************************** * Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved. * * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. ******************************************************************************/
This commit is contained in:
parent
2f8f8b588e
commit
5082731e24
291
doc/devel/utfconv.txt
Normal file
291
doc/devel/utfconv.txt
Normal file
@ -0,0 +1,291 @@
|
||||
Dec 5, 2000
|
||||
Dave Steck
|
||||
Novell, Inc.
|
||||
|
||||
UTF-8 Conversion Functions
|
||||
|
||||
|
||||
1. Strings in the LDAP C SDK should be encoded in UTF-8 format.
|
||||
However, most platforms do not provide APIs for converting to
|
||||
this format. If they do, they are platform-specific.
|
||||
|
||||
As a result, most applications (knowingly or not) use local strings
|
||||
with LDAP functions. This works fine for 7-bit ASCII characters,
|
||||
but will fail with 8-bit European characters, Asian characters, etc.
|
||||
|
||||
We propose adding the following platform-independent conversion functions
|
||||
to the OpenLDAP SDK. There are 4 functions for converting between UTF-8
|
||||
and wide characters, and 4 functions for converting between UTF-8 and
|
||||
multibyte characters.
|
||||
|
||||
For multibyte to UTF-8 conversions, charset translation is necessary.
|
||||
While a full charset translator is not practical or appropriate for the
|
||||
LDAP SDK, we can pass the translator function in as an argument.
|
||||
A NULL for this argument will use the ANSI C functions mbtowc, mbstowcs,
|
||||
wctomb, and wcstombs.
|
||||
|
||||
2. UTF-8 <--> Wide Character conversions
|
||||
|
||||
The following new conversion routines will be added, following the pattern of
|
||||
the ANSI C conversion routines (mbtowc, mbstowcs, etc). These routines use
|
||||
the wchar_t type. wchar_t is 2 bytes on some systems and 4 bytes on others.
|
||||
However the advantage of using wchar_t is that all the standard wide character
|
||||
string functions may be used on these strings: wcslen, wcscpy, etc.
|
||||
|
||||
int ldap_x_utf8_to_wc - Convert a single UTF-8 encoded character to a wide character.
|
||||
int ldap_x_utf8s_to_wcs - Convert a UTF-8 string to a wide character string.
|
||||
int ldap_x_wc_to_utf8 - Convert a single wide character to a UTF-8 sequence.
|
||||
int ldap_x_wcs_to_utf8s - Convert a wide character string to a UTF-8 string.
|
||||
|
||||
|
||||
2.1 ldap_x_utf8_to_wc - Convert a single UTF-8 encoded character to a wide character.
|
||||
|
||||
int ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
|
||||
|
||||
wchar (OUT) Points to a wide character code to receive the
|
||||
converted character.
|
||||
|
||||
utf8char (IN) Address of the UTF8 sequence of bytes.
|
||||
|
||||
Return Value:
|
||||
If successful, the function returns the length in
|
||||
bytes of the UTF-8 input character.
|
||||
|
||||
If utf8char is NULL or points to an empty string, the
|
||||
function returns 1 and a NULL is written to wchar.
|
||||
|
||||
If utf8char contains an invalid UTF-8 sequence -1 is returned.
|
||||
|
||||
|
||||
2.2 ldap_x_utf8s_to_wcs - Convert a UTF-8 string to a wide character string.
|
||||
|
||||
int ldap_x_utf8s_to_wcs (wchar_t *wcstr, const char *utf8str, size_t count)
|
||||
|
||||
wcstr (OUT) Points to a wide char buffer to receive the
|
||||
converted wide char string. The output string will be
|
||||
null terminated if there is space for it in the
|
||||
buffer.
|
||||
|
||||
utf8str (IN) Address of the null-terminated UTF-8 string to convert.
|
||||
|
||||
count (IN) The number of UTF-8 characters to convert, or
|
||||
equivalently, the size of the output buffer in wide
|
||||
characters.
|
||||
|
||||
Return Value:
|
||||
If successful, the function returns the number of wide
|
||||
characters written to wcstr, excluding the null termination
|
||||
character, if any.
|
||||
|
||||
If wcstr is NULL, the function returns the number of wide
|
||||
characters required to contain the converted string,
|
||||
excluding the null termination character.
|
||||
|
||||
If an invalid UTF-8 sequence is encountered, the
|
||||
function returns -1.
|
||||
|
||||
If the return value equals count, there was not enough space to fit the
|
||||
string and the null terminator in the buffer.
|
||||
|
||||
|
||||
2.3 ldap_x_wc_to_utf8 - Convert a single wide character to a UTF-8 sequence.
|
||||
|
||||
int ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
|
||||
|
||||
utf8char (OUT) Points to a byte array to receive the converted UTF-8
|
||||
string.
|
||||
|
||||
wchar (IN) The wide character to convert.
|
||||
|
||||
count (IN) The maximum number of bytes to write to the output
|
||||
buffer. Normally set this to LDAP_MAX_UTF8_LEN, which
|
||||
is defined as 3 or 6 depending on the size of wchar_t.
|
||||
A partial character will not be written.
|
||||
|
||||
Return Value:
|
||||
If successful, the function returns the length in bytes of
|
||||
the converted UTF-8 output character.
|
||||
|
||||
If wchar is the null character (0), the function returns 1 and a null byte is
|
||||
written to utf8char.
|
||||
|
||||
If wchar cannot be converted to a UTF-8 character, the
|
||||
function returns -1.
|
||||
|
||||
|
||||
2.4 ldap_x_wcs_to_utf8s - Convert a wide character string to a UTF-8 string.
|
||||
|
||||
int ldap_x_wcs_to_utf8s (char *utf8str, const wchar_t *wcstr, size_t count)
|
||||
|
||||
utf8str (OUT) Points to a byte array to receive the converted
|
||||
UTF-8 string. The output string will be null
|
||||
terminated if there is space for it in the
|
||||
buffer.
|
||||
|
||||
|
||||
wcstr (IN) Address of the null-terminated wide char string to convert.
|
||||
|
||||
count (IN) The size of the output buffer in bytes.
|
||||
|
||||
Return Value:
|
||||
If successful, the function returns the number of bytes
|
||||
written to utf8str, excluding the null termination
|
||||
character, if any.
|
||||
|
||||
If utf8str is NULL, the function returns the number of
|
||||
bytes required to contain the converted string, excluding
|
||||
the null termination character. The 'count' parameter is ignored.
|
||||
|
||||
If the function encounters a wide character that cannot
|
||||
be mapped to a UTF-8 sequence, the function returns -1.
|
||||
|
||||
If the return value equals count, there was not enough space to fit
|
||||
the string and the null terminator in the buffer.
|
||||
|
||||
|
||||
|
||||
3. Multi-byte <--> UTF-8 Conversions
|
||||
|
||||
These functions convert the string in a two-step process, from multibyte
|
||||
to Wide, then from Wide to UTF8, or vice versa. This conversion requires a
|
||||
charset translation routine, which is passed in as an argument.
|
||||
|
||||
ldap_x_mb_to_utf8 - Convert a multi-byte character to a UTF-8 character.
|
||||
ldap_x_mbs_to_utf8s - Convert a multi-byte string to a UTF-8 string.
|
||||
ldap_x_utf8_to_mb - Convert a UTF-8 character to a multi-byte character.
|
||||
ldap_x_utf8s_to_mbs - Convert a UTF-8 string to a multi-byte string.
|
||||
|
||||
3.1 ldap_x_mb_to_utf8 - Convert a multi-byte character to a UTF-8 character.
|
||||
|
||||
int ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize, int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
|
||||
|
||||
utf8char (OUT) Points to a byte buffer to receive the converted
|
||||
UTF-8 character. May be NULL. The output is not
|
||||
null-terminated.
|
||||
|
||||
mbchar (IN) Address of a sequence of bytes forming a multibyte character.
|
||||
|
||||
mbsize (IN) The maximum number of bytes of the mbchar argument to
|
||||
check. This should normally be MB_CUR_MAX.
|
||||
|
||||
f_mbtowc (IN) The function to use for converting a multibyte
|
||||
character to a wide character. If NULL, the local
|
||||
ANSI C routine mbtowc is used.
|
||||
|
||||
Return Value:
|
||||
If successful, the function returns the length in bytes of
|
||||
the UTF-8 output character.
|
||||
|
||||
If utf8char is NULL, count is ignored and the function
|
||||
returns the number of bytes that would be written to the
|
||||
output char.
|
||||
|
||||
If count is zero, 0 is returned and nothing is written to
|
||||
utf8char.
|
||||
|
||||
If mbchar is NULL or points to an empty string, the
|
||||
function returns 1 and a null byte is written to utf8char.
|
||||
|
||||
If mbchar contains an invalid multi-byte character, -1 is returned.
|
||||
|
||||
|
||||
3.2 ldap_x_mbs_to_utf8s - Convert a multi-byte string to a UTF-8 string.
|
||||
|
||||
int ldap_x_mbs_to_utf8s (char *utf8str, const char *mbstr, size_t count,
|
||||
size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count))
|
||||
|
||||
utf8str (OUT) Points to a buffer to receive the converted UTF-8 string.
|
||||
May be NULL.
|
||||
|
||||
mbstr (IN) Address of the null-terminated multi-byte input string.
|
||||
|
||||
count (IN) The size of the output buffer in bytes.
|
||||
|
||||
f_mbstowcs (IN) The function to use for converting a multibyte string
|
||||
to a wide character string. If NULL, the local ANSI
|
||||
C routine mbstowcs is used.
|
||||
|
||||
Return Value:
|
||||
If successful, the function returns the length in
|
||||
bytes of the UTF-8 output string, excluding the null
|
||||
terminator, if present.
|
||||
|
||||
If utf8str is NULL, count is ignored and the function
|
||||
returns the number of bytes required for the output string,
|
||||
excluding the NULL.
|
||||
|
||||
If count is zero, 0 is returned and nothing is written to utf8str.
|
||||
|
||||
If mbstr is NULL or points to an empty string, the
|
||||
function returns 0 and a null byte is written to utf8str.
|
||||
|
||||
If mbstr contains an invalid multi-byte character, -1 is returned.
|
||||
|
||||
If the returned value is equal to count, the entire null-terminated
|
||||
string would not fit in the output buffer.
|
||||
|
||||
|
||||
3.3 ldap_x_utf8_to_mb - Convert a UTF-8 character to a multi-byte character.
|
||||
|
||||
int ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
|
||||
int (*f_wctomb)(char *mbchar, wchar_t wchar) )
|
||||
|
||||
mbchar (OUT) Points to a byte buffer to receive the converted multi-byte
|
||||
character. May be NULL.
|
||||
|
||||
utf8char (IN) Address of the UTF-8 character sequence.
|
||||
|
||||
f_wctomb (IN) The function to use for converting a wide character
|
||||
to a multibyte character. If NULL, the local
|
||||
ANSI C routine wctomb is used.
|
||||
|
||||
|
||||
Return Value:
|
||||
If successful, the function returns the length in
|
||||
bytes of the multi-byte output character.
|
||||
|
||||
If utf8char is NULL or points to an empty string, the
|
||||
function returns 1 and a null byte is written to mbchar.
|
||||
|
||||
If utf8char contains an invalid UTF-8 sequence, -1 is returned.
|
||||
|
||||
|
||||
3.4 ldap_x_utf8s_to_mbs - Convert a UTF-8 string to a multi-byte string.
|
||||
|
||||
|
||||
int ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
|
||||
size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
|
||||
|
||||
mbstr (OUT) Points to a byte buffer to receive the converted
|
||||
multi-byte string. May be NULL.
|
||||
|
||||
utf8str (IN) Address of the null-terminated UTF-8 string to convert.
|
||||
|
||||
count (IN) The size of the output buffer in bytes.
|
||||
|
||||
f_wcstombs (IN) The function to use for converting a wide character
|
||||
string to a multibyte string. If NULL, the local
|
||||
ANSI C routine wcstombs is used.
|
||||
|
||||
Return Value:
|
||||
If successful, the function returns the number of bytes
|
||||
written to mbstr, excluding the null termination
|
||||
character, if any.
|
||||
|
||||
If mbstr is NULL, count is ignored and the function
|
||||
returns the number of bytes required for the output string,
|
||||
excluding the NULL.
|
||||
|
||||
If count is zero, 0 is returned and nothing is written to
|
||||
mbstr.
|
||||
|
||||
If utf8str is NULL or points to an empty string, the
|
||||
function returns 0 and a null byte is written to mbstr.
|
||||
|
||||
If an invalid UTF-8 character is encountered, the
|
||||
function returns -1.
|
||||
|
||||
The output string will be null terminated if there is space for it in
|
||||
the output buffer.
|
||||
|
||||
|
86
include/ldap_utf8.h
Normal file
86
include/ldap_utf8.h
Normal file
@ -0,0 +1,86 @@
|
||||
/* $OpenLDAP$ */
|
||||
/* $Novell: /ldap/src/cldap/include/ldap_utf8.h,v 1.3 2000/12/04 20:23:20 dsteck Exp $ */
|
||||
/*
|
||||
* Copyright 2000 The OpenLDAP Foundation, Redwood City, California, USA
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms are permitted only
|
||||
* as authorized by the OpenLDAP Public License. A copy of this
|
||||
* license is available at http://www.OpenLDAP.org/license.html or
|
||||
* in file LICENSE in the top-level directory of the distribution.
|
||||
*/
|
||||
/******************************************************************************
|
||||
* This notice applies to changes, created by or for Novell, Inc.,
|
||||
* to preexisting works for which notices appear elsewhere in this file.
|
||||
*
|
||||
* Copyright (C) 2000 Novell, Inc. All Rights Reserved.
|
||||
*
|
||||
* THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND TREATIES.
|
||||
* USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT TO VERSION
|
||||
* 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS AVAILABLE AT
|
||||
* HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" IN THE
|
||||
* TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION OF THIS
|
||||
* WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP PUBLIC
|
||||
* LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT THE
|
||||
* PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
|
||||
******************************************************************************/
|
||||
|
||||
#ifndef _LDAP_UTF8_H
|
||||
#define _LDAP_UTF8_H
|
||||
|
||||
LDAP_BEGIN_DECL
|
||||
|
||||
/*
|
||||
* UTF-8 Utility Routines (in utf-8.c)
|
||||
*/
|
||||
|
||||
#define LDAP_UCS4_INVALID (0x80000000U)
|
||||
|
||||
/* LDAP_MAX_UTF8_LEN is 3 or 6 depending on size of wchar_t */
|
||||
/* Parenthesized so the macro expands safely inside larger expressions
   (e.g. as the right-hand operand of '/' or '%'). */
#define LDAP_MAX_UTF8_LEN ( sizeof(wchar_t) * 3/2 )
|
||||
|
||||
|
||||
/*
|
||||
* UTF-8 Conversion Routines. (in utfconv.c)
|
||||
*/
|
||||
|
||||
/* UTF-8 character to Wide Char */
|
||||
LDAP_F(int)
|
||||
ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char );
|
||||
|
||||
/* UTF-8 string to Wide Char string */
|
||||
LDAP_F(int)
|
||||
ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count );
|
||||
|
||||
/* Wide Char to UTF-8 character */
|
||||
LDAP_F(int)
|
||||
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count );
|
||||
|
||||
/* Wide Char string to UTF-8 string */
|
||||
LDAP_F(int)
|
||||
ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count );
|
||||
|
||||
|
||||
/* UTF-8 character to MultiByte character */
|
||||
LDAP_F(int)
|
||||
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
|
||||
int (*f_wctomb)(char *mbchar, wchar_t wchar) );
|
||||
|
||||
/* UTF-8 string to MultiByte string */
|
||||
LDAP_F(int)
|
||||
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
|
||||
size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) );
|
||||
|
||||
/* MultiByte character to UTF-8 character */
|
||||
LDAP_F(int)
|
||||
ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
|
||||
int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) );
|
||||
|
||||
/* MultiByte string to UTF-8 string */
|
||||
LDAP_F(int)
|
||||
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
|
||||
size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) );
|
||||
|
||||
LDAP_END_DECL
|
||||
|
||||
#endif /* _LDAP_UTF8_H */
|
@ -18,7 +18,7 @@ SRCS = bind.c open.c result.c error.c compare.c search.c \
|
||||
request.c os-ip.c url.c sortctrl.c vlvctrl.c \
|
||||
init.c options.c print.c string.c util-int.c schema.c \
|
||||
charray.c tls.c dn.c os-local.c dnssrv.c \
|
||||
utf-8.c
|
||||
utf-8.c utf-8-conv.c
|
||||
OBJS = bind.lo open.lo result.lo error.lo compare.lo search.lo \
|
||||
controls.lo messages.lo references.lo extended.lo cyrus.lo \
|
||||
modify.lo add.lo modrdn.lo delete.lo abandon.lo ufn.lo cache.lo \
|
||||
@ -28,7 +28,7 @@ OBJS = bind.lo open.lo result.lo error.lo compare.lo search.lo \
|
||||
request.lo os-ip.lo url.lo sortctrl.lo vlvctrl.lo \
|
||||
init.lo options.lo print.lo string.lo util-int.lo schema.lo \
|
||||
charray.lo tls.lo dn.lo os-local.lo dnssrv.lo \
|
||||
utf-8.lo
|
||||
utf-8.lo utf-8-conv.lo
|
||||
|
||||
LDAP_INCDIR= ../../include
|
||||
LDAP_LIBDIR= ../../libraries
|
||||
|
464
libraries/libldap/utf-8-conv.c
Normal file
464
libraries/libldap/utf-8-conv.c
Normal file
@ -0,0 +1,464 @@
|
||||
/* $OpenLDAP$ */
|
||||
/*
|
||||
* Copyright 2000 The OpenLDAP Foundation, All Rights Reserved.
|
||||
* COPYING RESTRICTIONS APPLY, see COPYRIGHT file
|
||||
*/
|
||||
|
||||
/* $Novell: /ldap/src/cldap/libraries/libldap/utfconv.c,v 1.3 2000/12/11 19:35:37 dsteck Exp $ */
|
||||
/******************************************************************************
|
||||
* Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
|
||||
*
|
||||
* THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
|
||||
* TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
|
||||
* TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
|
||||
* AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
|
||||
* IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
|
||||
* OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
|
||||
* PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
|
||||
* THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
|
||||
******************************************************************************/
|
||||
|
||||
/*
|
||||
* UTF-8 Conversion Routines
|
||||
*
|
||||
* These routines convert between Wide Character and UTF-8,
|
||||
* or between MultiByte and UTF-8 encodings.
|
||||
*
|
||||
* Both single character and string versions of the functions are provided.
|
||||
* All functions return -1 if the character or string cannot be converted.
|
||||
*/
|
||||
|
||||
#include "portable.h"

#include <limits.h>		/* for MB_LEN_MAX */
#include <stdio.h>
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
|
||||
|
||||
/* Masks that strip the length tag from the first byte of a UTF-8 sequence,
   indexed by the sequence length in bytes (1..6); entry 0 is unused.
   Read-only lookup table, hence const. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
|
||||
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
UTF-8 Format Summary
|
||||
|
||||
ASCII chars 7 bits
|
||||
0xxxxxxx
|
||||
|
||||
2-character UTF-8 sequence: 11 bits
|
||||
110xxxxx 10xxxxxx
|
||||
|
||||
3-character UTF-8 16 bits
|
||||
1110xxxx 10xxxxxx 10xxxxxx
|
||||
|
||||
4-char UTF-8 21 bits
|
||||
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
|
||||
5-char UTF-8 26 bits
|
||||
111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
|
||||
6-char UTF-8 31 bits
|
||||
1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||
|
||||
Unicode address space (0 - 0x10FFFF) 21 bits
|
||||
ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits
|
||||
|
||||
Note: This code does not prevent UTF-8 sequences which are longer than
|
||||
necessary from being decoded.
|
||||
*/
|
||||
|
||||
/*------------------------------------------------------------------------------
|
||||
Convert a UTF-8 character to a wide char.
|
||||
Return the length of the UTF-8 input character in bytes.
|
||||
*/
|
||||
int
|
||||
ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
|
||||
{
|
||||
int utflen, i;
|
||||
wchar_t ch;
|
||||
|
||||
/* If input ptr is NULL, treat it as empty string. */
|
||||
if (utf8char == NULL)
|
||||
utf8char = "";
|
||||
|
||||
/* Get UTF-8 sequence length from 1st byte */
|
||||
utflen = UTF8_CHARLEN(utf8char);
|
||||
|
||||
if( utflen==0 || utflen > LDAP_MAX_UTF8_LEN )
|
||||
return -1; /* Invalid input */
|
||||
|
||||
/* First byte minus length tag */
|
||||
ch = (wchar_t)(utf8char[0] & mask[utflen]);
|
||||
|
||||
for(i=1; i < utflen; i++)
|
||||
{
|
||||
/* Subsequent bytes must start with 10 */
|
||||
if ((utf8char[i] & 0xc0) != 0x80)
|
||||
return -1;
|
||||
|
||||
ch <<= 6; /* 6 bits of data in each subsequent byte */
|
||||
ch |= (wchar_t)(utf8char[i] & 0x3f);
|
||||
}
|
||||
|
||||
if (wchar)
|
||||
*wchar = ch;
|
||||
|
||||
return utflen;
|
||||
}
|
||||
|
||||
/*-----------------------------------------------------------------------------
   Convert a null-terminated UTF-8 string to a wide char string.

   wcstr    (OUT) Buffer receiving the wide chars.  If NULL, nothing is
                  written, 'count' is ignored, and the function only counts
                  the wide chars the whole input would need.
   utf8str  (IN)  UTF-8 input; NULL is treated as an empty string.
   count    (IN)  Capacity of 'wcstr' in wide chars.  No more than 'count'
                  wide chars are written.

   Returns the number of wide chars stored (or required), excluding the null
   terminator, or -1 if an invalid UTF-8 sequence is encountered.  The output
   is null terminated only if there is room; a return value equal to 'count'
   means the terminator did not fit.
*/
int
ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
{
	size_t wclen = 0;
	int utflen;
	wchar_t ch;

	/* If input ptr is NULL, treat it as empty string. */
	if (utf8str == NULL)
		utf8str = "";

	/* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
	while ( *utf8str && (wcstr==NULL || wclen<count) )
	{
		/* Decode with the single-character routine so that both entry
		   points always apply exactly the same validation rules. */
		utflen = ldap_x_utf8_to_wc( &ch, utf8str );

		if ( utflen == -1 )
			return -1;	/* Invalid input */

		if (wcstr)
			wcstr[wclen] = ch;

		utf8str += utflen;	/* Move to next UTF-8 character */
		wclen++;		/* Count number of wide chars stored/required */
	}

	/* Add null terminator if there's room in the buffer. */
	if (wcstr && wclen < count)
		wcstr[wclen] = 0;

	return (int)wclen;
}
|
||||
|
||||
|
||||
/* Number of bytes needed to encode 'code' in UTF-8 (1..6), or -1 if the
   value lies outside the 31-bit range 0 .. 0x7FFFFFFF. */
static int
utf8_encoded_len ( unsigned long code )
{
	if ( code < 0x80UL )
		return 1;
	if ( code < 0x800UL )
		return 2;
	if ( code < 0x10000UL )
		return 3;
	if ( code < 0x200000UL )
		return 4;
	if ( code < 0x4000000UL )
		return 5;
	if ( code < 0x80000000UL )
		return 6;
	return -1;
}

/*------------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.

   utf8char (OUT) Buffer receiving the UTF-8 bytes (not null terminated).
                  If NULL, nothing is written, 'count' is ignored, and only
                  the required length is computed.
   wchar    (IN)  The wide character to convert.
   count    (IN)  Capacity of 'utf8char' in bytes.  No more than 'count'
                  bytes are written, and a partial character is never
                  written.

   Returns the length of the UTF-8 character in bytes; 0 if the character
   does not fit in 'count' bytes; -1 if 'wchar' is negative or otherwise
   outside the 31-bit encodable range.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag OR-ed into the first byte, indexed by sequence length. */
	static const unsigned char lead[] =
		{ 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };

	/* Work in unsigned arithmetic: a negative wchar_t converts to a value
	   >= 0x80000000 and is therefore rejected by utf8_encoded_len(), and
	   right shifts are well defined whatever the signedness of wchar_t. */
	unsigned long code = (unsigned long)wchar;
	int len = utf8_encoded_len( code );
	int i;

	if ( len < 0 )			/* Invalid wide character */
		return -1;

	if ( utf8char == NULL )		/* Caller only wants the length */
		return len;

	if ( count < (size_t)len )	/* Won't fit: write nothing */
		return 0;

	/* Fill the continuation bytes from last to first, 6 bits apiece. */
	for ( i = len - 1; i > 0; i-- )
	{
		utf8char[i] = (char)( 0x80 | ( code & 0x3f ) );
		code >>= 6;
	}

	/* What remains of 'code' is the payload of the first byte. */
	utf8char[0] = (char)( lead[len] | code );

	return len;
}
|
||||
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Convert a wide char string to a UTF-8 string.
|
||||
No more than 'count' bytes will be written to the output buffer.
|
||||
Return the # of bytes written to the output buffer, excl null terminator.
|
||||
*/
|
||||
int
|
||||
ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
|
||||
{
|
||||
int len = 0;
|
||||
int n;
|
||||
char *p = utf8str;
|
||||
wchar_t empty = 0; /* To avoid use of L"" construct */
|
||||
|
||||
if (wcstr == NULL) /* Treat input ptr NULL as an empty string */
|
||||
wcstr = ∅
|
||||
|
||||
if (utf8str == NULL) /* Just compute size of output, excl null */
|
||||
{
|
||||
while (*wcstr)
|
||||
{
|
||||
/* Get UTF-8 size of next wide char */
|
||||
n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
|
||||
if (n == -1)
|
||||
return -1;
|
||||
len += n;
|
||||
}
|
||||
|
||||
return len;
|
||||
}
|
||||
|
||||
|
||||
/* Do the actual conversion. */
|
||||
|
||||
n = 1; /* In case of empty wcstr */
|
||||
while (*wcstr)
|
||||
{
|
||||
n = ldap_x_wc_to_utf8( p, *wcstr++, count);
|
||||
|
||||
if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */
|
||||
break;
|
||||
|
||||
p += n;
|
||||
count -= n; /* Space left in output buffer */
|
||||
}
|
||||
|
||||
/* If not enough room for last character, pad remainder with null
|
||||
so that return value = original count, indicating buffer full. */
|
||||
if (n == 0)
|
||||
{
|
||||
while (count--)
|
||||
*p++ = 0;
|
||||
}
|
||||
|
||||
/* Add a null terminator if there's room. */
|
||||
else if (count)
|
||||
*p = 0;
|
||||
|
||||
if (n == -1) /* Conversion encountered invalid wide char. */
|
||||
return -1;
|
||||
|
||||
/* Return the number of bytes written to output buffer, excl null. */
|
||||
return (p - utf8str);
|
||||
}
|
||||
|
||||
|
||||
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    (OUT) Buffer receiving the multibyte character.  May be NULL,
                   in which case the result is written to a local scratch
                   buffer and only its length is reported.
   utf8char  (IN)  Start of the UTF-8 sequence.
   f_wctomb  (IN)  Wide char to multibyte converter; if NULL, the ANSI C
                   wctomb is used.

   Returns whatever the converter returns for the decoded wide character
   (for wctomb: the size of the multibyte character in bytes, or -1), or -1
   if 'utf8char' is not a valid UTF-8 sequence.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
	int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* Scratch output for the mbchar==NULL case.  wctomb may store up to
	   MB_LEN_MAX bytes, which exceeds 6 on some platforms, so size the
	   buffer accordingly (never smaller than the historical 6 bytes). */
	char tmp[ MB_LEN_MAX > 6 ? MB_LEN_MAX : 6 ];

	if (f_wctomb == NULL)		/* If no conversion function was given... */
		f_wctomb = wctomb;	/*    use the local ANSI C function */

	/* First convert UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char);

	if (n == -1)
		return -1;		/* Invalid UTF-8 character */

	/* Then convert the wide char to a multibyte char */
	if (mbchar == NULL)
		n = f_wctomb( tmp, wchar );
	else
		n = f_wctomb( mbchar, wchar);

	return n;
}
|
||||
|
||||
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string.

   mbstr       (OUT) Buffer receiving the multibyte string.  It is handed
                     unchanged to the converter, so NULL handling for
                     non-empty input is up to that function.
   utf8str     (IN)  Null-terminated UTF-8 input.
   count       (IN)  Capacity of 'mbstr' in bytes.  No more than 'count'
                     bytes are written.
   f_wcstombs  (IN)  Wide string to multibyte string converter; if NULL,
                     the ANSI C wcstombs is used.

   Returns the converter's result for the decoded wide string (for wcstombs:
   the size of the converted string in bytes, excl null terminator), 0 if
   'utf8str' is NULL or empty, or -1 on invalid UTF-8 or allocation failure.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
	size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wcs;
	size_t wcsize;
	int n;

	if (f_wcstombs == NULL)		/* If no conversion function was given... */
		f_wcstombs = wcstombs;	/*    use the local ANSI C function */

	if (utf8str == NULL || *utf8str == 0)	/* NULL or empty input string */
	{
		/* Honour the 'count' limit: a zero-length buffer has no room
		   even for the terminator, so write nothing in that case. */
		if (mbstr && count)
			*mbstr = 0;
		return 0;
	}

	/* Allocate memory for the maximum size wchar string that we could get.
	   Each wide char consumes at least one input byte. */
	wcsize = strlen(utf8str) + 1;
	wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
	if (wcs == NULL)
		return -1;		/* Memory allocation failure. */

	/* First convert the UTF-8 string to a wide char string */
	n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);

	/* Then convert wide char string to multi-byte string */
	if (n != -1)
	{
		n = (int)f_wcstombs(mbstr, wcs, count);
	}

	LDAP_FREE(wcs);

	return n;
}
|
||||
|
||||
/*-----------------------------------------------------------------------------
|
||||
Convert a MultiByte character to a UTF-8 character.
|
||||
'mbsize' indicates the number of bytes of 'mbchar' to check.
|
||||
Returns the number of bytes written to the output character.
|
||||
*/
|
||||
int
|
||||
ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
|
||||
int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
|
||||
{
|
||||
wchar_t wchar;
|
||||
int n;
|
||||
|
||||
if (f_mbtowc == NULL) /* If no conversion function was given... */
|
||||
f_mbtowc = mbtowc; /* use the local ANSI C function */
|
||||
|
||||
if (mbsize == 0) /* 0 is not valid. */
|
||||
return -1;
|
||||
|
||||
if (mbchar == NULL || *mbchar == 0)
|
||||
{
|
||||
if (utf8char)
|
||||
*utf8char = 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
/* First convert the MB char to a Wide Char */
|
||||
n = f_mbtowc( &wchar, mbchar, mbsize);
|
||||
|
||||
if (n == -1)
|
||||
return -1;
|
||||
|
||||
/* Convert the Wide Char to a UTF-8 character. */
|
||||
n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
|
||||
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string.

   utf8str     (OUT) Buffer receiving the UTF-8 string; passed through to
                     ldap_x_wcs_to_utf8s.
   mbstr       (IN)  Null-terminated multibyte input; NULL is treated as an
                     empty string.
   count       (IN)  Capacity of 'utf8str' in bytes.
   f_mbstowcs  (IN)  Multibyte string to wide string converter; if NULL,
                     the ANSI C mbstowcs is used.

   Returns the result of ldap_x_wcs_to_utf8s for the intermediate wide
   string, or -1 if the allocation fails or the converter reports -1.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
	size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	const char *src = ( mbstr != NULL ) ? mbstr : "";
	size_t nwide;
	wchar_t *wide;
	int rc;

	/* Fall back to the local ANSI C converter when none was supplied. */
	if ( f_mbstowcs == NULL )
		f_mbstowcs = mbstowcs;

	/* One wide char per input byte, plus the terminator, is the most the
	   intermediate string can need. */
	nwide = strlen(src) + 1;
	wide = (wchar_t *)LDAP_MALLOC( nwide * sizeof(wchar_t) );
	if ( wide == NULL )
		return -1;

	/* Step 1: multibyte -> wide. */
	rc = f_mbstowcs( wide, src, nwide );

	/* Step 2: wide -> UTF-8, unless step 1 failed. */
	if ( rc != -1 )
		rc = ldap_x_wcs_to_utf8s( utf8str, wide, count );

	LDAP_FREE(wide);

	return rc;
}
|
Loading…
Reference in New Issue
Block a user