openldap/libraries/libldap/utf-8.c

/* $OpenLDAP$ */
/*
 * Copyright 1998-2000 The OpenLDAP Foundation, All Rights Reserved.
 * COPYING RESTRICTIONS APPLY, see COPYRIGHT file
 */

/*
 * Basic UTF-8 routines
 *
 * These routines are "dumb".  Though they understand UTF-8,
 * they don't grok Unicode.  That is, they can push bits,
 * but don't have a clue what the bits represent.  That's
 * good enough for use with the LDAP Client SDK.
 *
 * These routines are not optimized.
 */

#include "portable.h"

#include <stdio.h>

#include <ac/stdlib.h>

#include <ac/socket.h>
#include <ac/string.h>
#include <ac/time.h>

#include "ldap-int.h"
#include "ldap_defaults.h"

/*
 * ISASCII(uc): non-zero when the byte value uc lies in the US-ASCII
 * range 0x00-0x7f.  Bytes with the high bit set are part of a
 * multi-byte UTF-8 sequence and are not ASCII.
 */
#undef ISASCII
#define ISASCII(uc)	((uc) < 0x80)
/* UCS4_INVALID: value returned by ldap_utf8_to_ucs4() for malformed input */
#undef UCS4_INVALID
#define UCS4_INVALID	0x80000000U

/*
 * Basic UTF-8 routines
 */

/*
 * return the number of bytes required to hold the
 * NULL-terminated UTF-8 string INCLUDING the
 * termination.
 */
ber_len_t ldap_utf8_bytes( const char * p )
{
	ber_len_t bytes = 0;

	if( p == NULL ) return bytes;

	while( p[bytes++] ) {
		/* EMPTY */ ;
	}

	return bytes;
}

/*
 * Return the number of UTF-8 characters in the NUL-terminated
 * string p, not counting the terminator.  Characters are counted
 * by stepping with LDAP_UTF8_NEXT(); sequences are not validated.
 * A NULL pointer yields 0, matching ldap_utf8_bytes().
 */
ber_len_t ldap_utf8_chars( const char * p )
{
	/* could be optimized and could check for invalid sequences */
	ber_len_t chars = 0;

	if( p == NULL ) return chars;

	for( ; *p != '\0'; p = LDAP_UTF8_NEXT(p) ) {
		chars++;
	}

	return chars;
}

/*
 * Return the sequence length (1-6) announced by the lead byte at p,
 * or 0 when that byte cannot start a character (a continuation
 * byte 10xxxxxx, or 0xfe/0xff).
 *
 * Only the first byte is examined; trailing bytes are not checked.
 */
int ldap_utf8_charlen( const char * p )
{
	unsigned lead = * (const unsigned char *) p;
	unsigned bit;
	int ones = 0;

	/* count the run of 1 bits starting at the most significant bit */
	for( bit = 0x80; bit != 0 && ( lead & bit ) != 0; bit >>= 1 ) {
		ones++;
	}

	switch( ones ) {
	case 0:
		/* 0xxxxxxx: single byte */
		return 1;

	case 2:
	case 3:
	case 4:
	case 5:
	case 6:
		/* 110xxxxx .. 1111110x: the run length is the sequence length */
		return ones;

	default:
		/* INVALID: lone continuation byte (1), or 0xfe/0xff (7, 8) */
		return 0;
	}
}

/*
 * Convert the UTF-8 character at p to its UCS-4 value; useful for
 * comparisons.
 *
 * The sequence length comes from LDAP_UTF8_CHARLEN().  Returns
 * UCS4_INVALID when that length is 0 or when any trailing byte is
 * not a continuation byte (10xxxxxx).
 */
ber_int_t ldap_utf8_to_ucs4( const char * p )
{
	/* explicit cast: char and unsigned char pointers are not compatible */
	const unsigned char *c = (const unsigned char *) p;
	ber_int_t ch;
	int len, i;
	/* payload bits of the lead byte, indexed by sequence length */
	static const unsigned char mask[] = {
		0, 0x7f, 0x1F, 0x0F, 0x07, 0x03, 0x01 };

	len = LDAP_UTF8_CHARLEN(p);

	if( len == 0 ) return UCS4_INVALID;

	ch = c[0] & mask[len];

	for( i = 1; i < len; i++ ) {
		if ( ( c[i] & 0xc0 ) != 0x80 ) {
			return UCS4_INVALID;
		}

		/* each continuation byte contributes six payload bits */
		ch <<= 6;
		ch |= c[i] & 0x3f;
	}

	return ch;
}

/*
 * Convert the UCS-4 value c to UTF-8, storing the encoded bytes
 * followed by a NUL terminator in buf.
 *
 * buf must have room for up to 7 bytes (6 encoded bytes plus NUL).
 * Returns the number of encoded bytes written, excluding the NUL.
 * Returns 0 when buf is NULL; for a negative c, only the NUL is
 * stored and 0 is returned.
 */
int ldap_ucs4_to_utf8( ber_int_t c, char *buf )
{
	int len = 0;
	unsigned char *p = (unsigned char *) buf;

	if( buf == NULL ) return 0;

	if ( c < 0 ) {
		/* not a valid Unicode character */

	} else if( c < 0x80 ) {
		/* 7 bits: 0xxxxxxx */
		p[len++] = c;

	} else if( c < 0x800 ) {
		/* 11 bits: 110xxxxx 10xxxxxx */
		p[len++] = 0xc0 | ( c >> 6 );
		p[len++] = 0x80 | ( c & 0x3F );

	} else if( c < 0x10000 ) {
		/* 16 bits: three bytes */
		p[len++] = 0xe0 | ( c >> 12 );
		p[len++] = 0x80 | ( (c >> 6) & 0x3F );
		p[len++] = 0x80 | ( c & 0x3F );

	} else if( c < 0x200000 ) {
		/* 21 bits: four bytes */
		p[len++] = 0xf0 | ( c >> 18 );
		p[len++] = 0x80 | ( (c >> 12) & 0x3F );
		p[len++] = 0x80 | ( (c >> 6) & 0x3F );
		p[len++] = 0x80 | ( c & 0x3F );

	} else if( c < 0x4000000 ) {
		/* 26 bits: five bytes (limit is 2^26, not 2^22) */
		p[len++] = 0xf8 | ( c >> 24 );
		p[len++] = 0x80 | ( (c >> 18) & 0x3F );
		p[len++] = 0x80 | ( (c >> 12) & 0x3F );
		p[len++] = 0x80 | ( (c >> 6) & 0x3F );
		p[len++] = 0x80 | ( c & 0x3F );

	} else /* if( c < 0x80000000 ) */ {
		/* 31 bits: six bytes */
		p[len++] = 0xfc | ( c >> 30 );
		p[len++] = 0x80 | ( (c >> 24) & 0x3F );
		p[len++] = 0x80 | ( (c >> 18) & 0x3F );
		p[len++] = 0x80 | ( (c >> 12) & 0x3F );
		p[len++] = 0x80 | ( (c >> 6) & 0x3F );
		p[len++] = 0x80 | ( c & 0x3F );
	}

	buf[len] = '\0';
	return len;
}

/*
 * Advance to the next UTF-8 character.
 *
 * Ignores the length announced by the lead byte and instead relies
 * on continuation markers (10xxxxxx) to find the start of the next
 * character.  This allows resyncing after invalid input, provided
 * the start of the next character appears within the 6 bytes
 * examined.
 *
 * Returns a pointer to the first byte after p that is not a
 * continuation byte, or p+6 if bytes 1..5 are all continuation bytes.
 */
char* ldap_utf8_next( const char * p )
{
	int i;
	const unsigned char *u = (const unsigned char *) p;

	if( LDAP_UTF8_ISASCII(u) ) {
		return (char *) &p[1];
	}

	for( i = 1; i < 6; i++ ) {
		/* parentheses required: != binds tighter than & */
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			break;
		}
	}

	return (char *) &p[i];
}

/*
 * Step back to the previous UTF-8 character.
 *
 * Ignores the length of the multibyte character and instead relies
 * on continuation markers (10xxxxxx) to find the start of the
 * previous character.  This allows resyncing after invalid input,
 * provided the start of the previous character appears within the
 * 6 bytes examined.
 *
 * Returns a pointer to the nearest byte before p that is not a
 * continuation byte, or p-6 if bytes -1..-5 are all continuation
 * bytes.  The caller must ensure those bytes are readable.
 */
char* ldap_utf8_prev( const char * p )
{
	int i;
	const unsigned char *u = (const unsigned char *) p;

	for( i = -1; i > -6; i-- ) {
		/* parentheses required: != binds tighter than & */
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			break;
		}
	}

	return (char *) &p[i];
}

/*
 * Copy one UTF-8 character from src to dst, returning the
 * number of bytes copied.  dst is not NUL-terminated.
 *
 * Ignores the length announced by the lead byte and instead relies
 * on continuation markers (10xxxxxx) to find the end of the
 * character.  This allows resyncing after invalid input, provided
 * the start of the next character appears within the 6 bytes
 * examined.  At most 6 bytes are written to dst.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	int i;
	const unsigned char *u = (const unsigned char *) src;

	/* the first byte is always copied */
	dst[0] = src[0];

	if( LDAP_UTF8_ISASCII(u) ) {
		return 1;
	}

	for( i = 1; i < 6; i++ ) {
		/* parentheses required: != binds tighter than & */
		if ( ( u[i] & 0xc0 ) != 0x80 ) {
			break;
		}
		dst[i] = src[i];
	}

	return i;
}

/*
 * UTF-8 ctype routines
 * Only deals with single-byte characters < 0x80 (i.e. US-ASCII)
 */

/* Return the result of ISASCII() applied to the first byte of p. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned char *first = (const unsigned char *) p;
	unsigned value = *first;

	return ISASCII(value);
}

/* Return 1 when the first byte of p is a decimal digit '0'-'9', else 0. */
int ldap_utf8_isdigit( const char * p )
{
	unsigned first = * (const unsigned char *) p;

	return ISASCII(first) && first >= '0' && first <= '9';
}

/*
 * Return 1 when the first byte of p is a hexadecimal digit
 * ('0'-'9', 'A'-'F' or 'a'-'f'), else 0.
 */
int ldap_utf8_isxdigit( const char * p )
{
	unsigned first = * (const unsigned char *) p;

	if( !ISASCII(first) ) {
		return 0;
	}

	if( first >= '0' && first <= '9' ) {
		return 1;
	}

	if( first >= 'A' && first <= 'F' ) {
		return 1;
	}

	return first >= 'a' && first <= 'f';
}

/*
 * Return 1 when the first byte of p is one of the white-space
 * characters space, \t, \n, \r, \v or \f, else 0.
 */
int ldap_utf8_isspace( const char * p )
{
	unsigned first = * (const unsigned char *) p;

	if( !ISASCII(first) ) {
		return 0;
	}

	return first == ' '
		|| first == '\t'
		|| first == '\n'
		|| first == '\r'
		|| first == '\v'
		|| first == '\f';
}

#ifndef UTF8_ALPHA_CTYPE
/*
 * These are not needed by the C SDK and are
 * not "good enough" for general use.
 */
/* Return 1 when the first byte of p is a letter 'A'-'Z' or 'a'-'z', else 0. */
int ldap_utf8_isalpha( const char * p )
{
	unsigned first = * (const unsigned char *) p;

	if( !ISASCII(first) ) {
		return 0;
	}

	if( first >= 'A' && first <= 'Z' ) {
		return 1;
	}

	return first >= 'a' && first <= 'z';
}

/*
 * Return 1 when the first byte of p is a digit '0'-'9' or a
 * letter 'A'-'Z' / 'a'-'z', else 0.
 */
int ldap_utf8_isalnum( const char * p )
{
	unsigned first = * (const unsigned char *) p;

	if( !ISASCII(first) ) {
		return 0;
	}

	if( first >= '0' && first <= '9' ) {
		return 1;
	}

	if( first >= 'A' && first <= 'Z' ) {
		return 1;
	}

	return first >= 'a' && first <= 'z';
}

/* Return 1 when the first byte of p is a lower-case letter 'a'-'z', else 0. */
int ldap_utf8_islower( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	/* use the file-local ISASCII() like the sibling ctype routines */
	if(!ISASCII(c)) return 0;

	return ( c >= 'a' && c <= 'z' );
}

/* Return 1 when the first byte of p is an upper-case letter 'A'-'Z', else 0. */
int ldap_utf8_isupper( const char * p )
{
	unsigned first = * (const unsigned char *) p;

	return ISASCII(first) && first >= 'A' && first <= 'Z';
}
#endif


/*
 * UTF-8 string routines
 */

/*
 * Like strcspn(), but works on UTF-8 characters and returns a
 * number of bytes, not characters: the length of the initial part
 * of str containing no character from set.  Characters are
 * compared by their ldap_utf8_to_ucs4() values.
 */
ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
{
	const char *cstr;

	for( cstr = str; *cstr != '\0'; cstr = LDAP_UTF8_NEXT(cstr) ) {
		const char *cset;

		/* stop at the end of set; never scan past its terminator */
		for( cset = set; *cset != '\0'; cset = LDAP_UTF8_NEXT(cset) ) {
			if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
				return cstr - str;
			}
		}
	}

	/* no character of set occurs in str */
	return cstr - str;
}

/* like strspn() but returns number of bytes, not characters */
ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
{
	const char *cstr;

	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
		const char *cset;

		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
			if( *cset == '\0' ) {
				return cstr - str;
			}

			if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
				break;
			} 
		}
	}

	return cstr - str;
}

/*
 * Like strpbrk() (and, with a one-character set, strchr()), but
 * works on UTF-8 characters.  Returns a pointer to the first
 * character of str that also occurs in set, or NULL if there is
 * none.  Characters are compared by their ldap_utf8_to_ucs4() values.
 */
char *(ldap_utf8_strpbrk)( const char *str, const char *set )
{
	const char *cstr;

	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
		const char *cset;

		/* stop at the end of set; never scan past its terminator */
		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
			if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {
				return (char *) cstr;
			}
		}
	}

	return NULL;
}

/*
 * Like strtok_r(), not strtok(): split str into tokens separated
 * by any UTF-8 character found in sep.
 *
 * Pass the string in str on the first call and NULL afterwards;
 * *last carries the scan position between calls.  The input is
 * modified: the first byte of the separator that ends a token is
 * overwritten with NUL.
 *
 * Returns a pointer to the next token, or NULL when last is NULL
 * or no token remains.  *last is set to NULL once the string is
 * exhausted.
 */
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
{
	char *begin;
	char *end;

	if( last == NULL ) return NULL;

	begin = str ? str : *last;

	/* a previous call already exhausted the string */
	if( begin == NULL ) return NULL;

	/* skip leading separators */
	begin += ldap_utf8_strspn( begin, sep );

	if( *begin == '\0' ) {
		*last = NULL;
		return NULL;
	}

	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

	if( *end != '\0' ) {
		/* find the resume point before the separator is overwritten */
		char *next = LDAP_UTF8_NEXT( end );
		*end = '\0';
		end = next;
	}

	*last = end;
	return begin;
}
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`/* $OpenLDAP$ */`
			`/*`
Add comments. 2000-01-23 10:39:55 +08:00			`* Copyright 1998-2000 The OpenLDAP Foundation, All Rights Reserved.`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`* COPYING RESTRICTIONS APPLY, see COPYRIGHT file`
			`*/`

			`/*`
			`* Basic UTF-8 routines`
			`*`
Add comments. 2000-01-23 10:39:55 +08:00			`* These routines are "dumb". Though they understand UTF-8,`
			`* they don't grok Unicode. That is, they can push bits,`
			`* but don't have a clue what the bits represent. That's`
			`* good enough for use with the LDAP Client SDK.`
			`*`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`* These routines are not optimized.`
			`*/`

			`#include "portable.h"`

			`#include <stdio.h>`

			`#include <ac/stdlib.h>`

			`#include <ac/socket.h>`
			`#include <ac/string.h>`
			`#include <ac/time.h>`

			`#include "ldap-int.h"`
			`#include "ldap_defaults.h"`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`#undef ISASCII`
			`#define ISASCII(uc) ((uc) < 0x100)`
			`#undef UCS4_INVALID`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`#define UCS4_INVALID 0x80000000U`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
Add comments. 2000-01-23 10:39:55 +08:00			`/*`
			`* Basic UTF-8 routines`
			`*/`

Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`/*`
			`* return the number of bytes required to hold the`
			`* NULL-terminated UTF-8 string INCLUDING the`
			`* termination.`
			`*/`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`ber_len_t ldap_utf8_bytes( const char * p )`
			`{`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`ber_len_t bytes = 0;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`if( p == NULL ) return bytes;`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`while( p[bytes++] ) {`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`/* EMPTY */ ;`
			`}`

Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`return bytes;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`}`

			`ber_len_t ldap_utf8_chars( const char * p )`
			`{`
Fix charlen and add getc 2000-01-22 11:40:54 +08:00			`/* could be optimized and could check for invalid sequences */`
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`ber_len_t chars=0;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`for( ; *p ; p=LDAP_UTF8_NEXT(p) ) {`
			`chars++;`
Fix charlen and add getc 2000-01-22 11:40:54 +08:00			`};`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
Fix charlen and add getc 2000-01-22 11:40:54 +08:00			`return chars;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`}`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`/*`
			`* Returns length indicated by first byte.`
			`*`
			`* This function should use a table lookup.`
			`*/`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`int ldap_utf8_charlen( const char * p )`
			`{`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`unsigned c = * (const unsigned char *) p;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`if ((c & 0xfe ) == 0xfc) {`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`return 6;`
			`}`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
			`if ((c & 0xfc ) == 0xf8) {`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`return 5;`
			`}`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
			`if ((c & 0xf8 ) == 0xf0) {`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`return 4;`
			`}`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
			`if ((c & 0xf0 ) == 0xe0) {`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`return 3;`
			`}`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
			`if ((c & 0xe0 ) == 0xc0) {`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`return 2;`
			`}`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`if ((c & 0x80 ) == 0x80) {`
			`/* INVALID */`
			`return 0;`
			`}`

			`return 1;`
			`}`

Add comments. 2000-01-23 10:39:55 +08:00			`/* conv UTF-8 to UCS-4, useful for comparisons */`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`ber_int_t ldap_utf8_to_ucs4( const char * p )`
			`{`
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`const unsigned char *c = p;`
			`ber_int_t ch;`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`int len, i;`
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`static unsigned char mask[] = {`
			`0, 0x7f, 0x1F, 0x0F, 0x07, 0x03, 0x01 };`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`len = LDAP_UTF8_CHARLEN(p);`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`if( len == 0 ) return UCS4_INVALID;`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`ch = c[0] & mask[len];`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00
			`for(i=1; i < len; i++) {`
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`if ((c[i] & 0xc0) != 0x80) {`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`return UCS4_INVALID;`
			`}`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`ch <<= 6;`
			`ch \|= c[i] & 0x3f;`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`}`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`return ch;`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`}`

Add comments. 2000-01-23 10:39:55 +08:00			`/* conv UCS-4 to UTF-8, not used */`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`int ldap_ucs4_to_utf8( ber_int_t c, char *buf )`
			`{`
			`int len=0;`
			`unsigned char* p = buf;`
			`if(buf == NULL) return 0;`

			`if ( c < 0 ) {`
			`/* not a valid Unicode character */`

			`} else if( c < 0x80 ) {`
			`p[len++] = c;`

			`} else if( c < 0x800 ) {`
			`p[len++] = 0xc0 \| ( c >> 6 );`
			`p[len++] = 0x80 \| ( c & 0x3F );`

			`} else if( c < 0x10000 ) {`
			`p[len++] = 0xe0 \| ( c >> 12 );`
			`p[len++] = 0x80 \| ( (c >> 6) & 0x3F );`
			`p[len++] = 0x80 \| ( c & 0x3F );`

			`} else if( c < 0x200000 ) {`
			`p[len++] = 0xf0 \| ( c >> 18 );`
			`p[len++] = 0x80 \| ( (c >> 12) & 0x3F );`
			`p[len++] = 0x80 \| ( (c >> 6) & 0x3F );`
			`p[len++] = 0x80 \| ( c & 0x3F );`

			`} else if( c < 0x400000 ) {`
			`p[len++] = 0xf8 \| ( c >> 24 );`
			`p[len++] = 0x80 \| ( (c >> 18) & 0x3F );`
			`p[len++] = 0x80 \| ( (c >> 12) & 0x3F );`
			`p[len++] = 0x80 \| ( (c >> 6) & 0x3F );`
			`p[len++] = 0x80 \| ( c & 0x3F );`

			`} else /* if( c < 0x80000000 ) */ {`
			`p[len++] = 0xfc \| ( c >> 30 );`
			`p[len++] = 0x80 \| ( (c >> 24) & 0x3F );`
			`p[len++] = 0x80 \| ( (c >> 18) & 0x3F );`
			`p[len++] = 0x80 \| ( (c >> 12) & 0x3F );`
			`p[len++] = 0x80 \| ( (c >> 6) & 0x3F );`
			`p[len++] = 0x80 \| ( c & 0x3F );`
			`}`

			`buf[len] = '\0';`
			`return len;`
			`}`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`/*`
			`* Advance to the next UTF-8 character`
			`*`
			`* Ignores length of multibyte character, instead rely on`
			`* continuation markers to find start of next character.`
			`* This allows for "resyncing" of when invalid characters`
			`* are provided provided the start of the next character`
			`* is appears within the 6 bytes examined.`
			`*/`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`char* ldap_utf8_next( const char * p )`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`{`
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`int i;`
			`const unsigned char *u = p;`

			`if( LDAP_UTF8_ISASCII(u) ) {`
			`return (char *) &p[1];`
			`}`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`for( i=1; i<6; i++ ) {`
			`if ( u[i] & 0xC0 != 0x80 ) {`
			`return (char *) &p[i];`
			`}`
			`}`

			`return (char *) &p[i];`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`}`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`/*`
			`* Advance to the previous UTF-8 character`
			`*`
			`* Ignores length of multibyte character, instead rely on`
			`* continuation markers to find start of next character.`
			`* This allows for "resyncing" of when invalid characters`
			`* are provided provided the start of the next character`
			`* is appears within the 6 bytes examined.`
			`*/`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`char* ldap_utf8_prev( const char * p )`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`{`
			`int i;`
Add additional UTF-8 routines 2000-01-23 02:48:37 +08:00			`const unsigned char *u = p;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`for( i=-1; i>-6 ; i-- ) {`
			`if ( u[i] & 0xC0 != 0x80 ) {`
			`return (char *) &p[i];`
			`}`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`}`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`return (char *) &p[i];`
			`}`

			`/*`
			`* Copy one UTF-8 character from src to dst returning`
			`* number of bytes copied.`
			`*`
			`* Ignores length of multibyte character, instead rely on`
			`* continuation markers to find start of next character.`
			`* This allows for "resyncing" of when invalid characters`
			`* are provided provided the start of the next character`
			`* is appears within the 6 bytes examined.`
			`*/`
			`int ldap_utf8_copy( char* dst, const char *src )`
			`{`
			`int i;`
			`const unsigned char *u = src;`

			`dst[0] = src[0];`

			`if( LDAP_UTF8_ISASCII(u) ) {`
			`return 1;`
			`}`

			`for( i=1; i<6; i++ ) {`
			`if ( u[i] & 0xC0 != 0x80 ) {`
			`return i;`
			`}`
			`dst[i] = src[i];`
			`}`

			`return i;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`}`

Add comments. 2000-01-23 10:39:55 +08:00			`/*`
			`* UTF-8 ctype routines`
			`* Only deals with characters < 0x100 (ie: US-ASCII)`
			`*/`

Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`int ldap_utf8_isascii( const char * p )`
			`{`
			`unsigned c = * (const unsigned char *) p;`
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`return ISASCII(c);`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`}`

			`int ldap_utf8_isdigit( const char * p )`
			`{`
			`unsigned c = * (const unsigned char *) p;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`if(!ISASCII(c)) return 0;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
			`return c >= '0' && c <= '9';`
			`}`

			`int ldap_utf8_isxdigit( const char * p )`
			`{`
			`unsigned c = * (const unsigned char *) p;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`if(!ISASCII(c)) return 0;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
			`return ( c >= '0' && c <= '9' )`
			`\|\| ( c >= 'A' && c <= 'F' )`
			`\|\| ( c >= 'a' && c <= 'f' );`
			`}`

Add comments. 2000-01-23 10:39:55 +08:00			`int ldap_utf8_isspace( const char * p )`
			`{`
			`unsigned c = * (const unsigned char *) p;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`if(!ISASCII(c)) return 0;`
Add comments. 2000-01-23 10:39:55 +08:00
			`switch(c) {`
			`case ' ':`
			`case '\t':`
			`case '\n':`
			`case '\r':`
			`case '\v':`
			`case '\f':`
			`return 1;`
			`}`

			`return 0;`
			`}`

			`#ifndef UTF8_ALPHA_CTYPE`
			`/*`
			`* These are not needed by the C SDK and are`
			`* not "good enough" for general use.`
			`*/`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00			`int ldap_utf8_isalpha( const char * p )`
			`{`
			`unsigned c = * (const unsigned char *) p;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`if(!ISASCII(c)) return 0;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
			`return ( c >= 'A' && c <= 'Z' )`
			`\|\| ( c >= 'a' && c <= 'z' );`
			`}`

			`int ldap_utf8_isalnum( const char * p )`
			`{`
			`unsigned c = * (const unsigned char *) p;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`if(!ISASCII(c)) return 0;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
			`return ( c >= '0' && c <= '9' )`
			`\|\| ( c >= 'A' && c <= 'Z' )`
			`\|\| ( c >= 'a' && c <= 'z' );`
			`}`

			`int ldap_utf8_islower( const char * p )`
			`{`
			`unsigned c = * (const unsigned char *) p;`

			`if(!UTF8_ISASCII(c)) return 0;`

			`return ( c >= 'a' && c <= 'z' );`
			`}`

			`int ldap_utf8_isupper( const char * p )`
			`{`
			`unsigned c = * (const unsigned char *) p;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`if(!ISASCII(c)) return 0;`
Initial UTF-8 routines. 2000-01-22 09:55:34 +08:00
			`return ( c >= 'A' && c <= 'Z' );`
			`}`
Add comments to UTF-8 declarations. Add US ASCII optimizations macros. #ifdef out unused routines Ready to hack getdn.c and others to support UTF-8 2000-01-23 13:35:38 +08:00			`#endif`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00
Add comments. 2000-01-23 10:39:55 +08:00
			`/*`
			`* UTF-8 string routines`
			`*/`

			`/* like strcspn() but returns number of bytes, not characters */`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`ber_len_t (ldap_utf8_strcspn)( const char str, const char set )`
			`{`
			`const char *cstr;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`for( cstr = str; *cstr != '\0'; cstr = LDAP_UTF8_NEXT(cstr) ) {`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`const char *cset;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`for( cset = set; ; cset = LDAP_UTF8_NEXT(cset) ) {`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {`
			`return cstr - str;`
			`}`
			`}`
			`}`

			`return cstr - str;`
			`}`

Add comments. 2000-01-23 10:39:55 +08:00			`/* like strspn() but returns number of bytes, not characters */`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`ber_len_t (ldap_utf8_strspn)( const char str, const char set )`
			`{`
			`const char *cstr;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`const char *cset;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`for( cset = set; ; LDAP_UTF8_INCR(cset) ) {`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`if( *cset == '\0' ) {`
			`return cstr - str;`
			`}`

			`if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {`
			`break;`
			`}`
			`}`
			`}`

			`return cstr - str;`
			`}`

Add comments. 2000-01-23 10:39:55 +08:00			`/* like strpbrk(), replaces strchr() as well */`
We'll need ldap_utf8_strpbrk() as well. 2000-01-23 05:03:21 +08:00			`char (ldap_utf8_strpbrk)( const char str, const char *set )`
			`{`
			`int len;`
			`const char *cstr;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {`
We'll need ldap_utf8_strpbrk() as well. 2000-01-23 05:03:21 +08:00			`const char *cset;`

Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`for( cset = set; ; LDAP_UTF8_INCR(cset) ) {`
We'll need ldap_utf8_strpbrk() as well. 2000-01-23 05:03:21 +08:00			`if( ldap_utf8_to_ucs4( cstr ) == ldap_utf8_to_ucs4( cset ) ) {`
Add comments to UTF-8 declarations. Add US ASCII optimizations macros. #ifdef out unused routines Ready to hack getdn.c and others to support UTF-8 2000-01-23 13:35:38 +08:00			`return (char *) cstr;`
We'll need ldap_utf8_strpbrk() as well. 2000-01-23 05:03:21 +08:00			`}`
			`}`
			`}`

			`return NULL;`
			`}`

Add comments. 2000-01-23 10:39:55 +08:00			`/* like strtok_r(), not strtok() */`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`char (ldap_utf8_strtok)(char str, const char sep, char *last)`
			`{`
			`char *begin;`
			`char *end;`

			`if( last == NULL ) return NULL;`

			`begin = str ? str : *last;`

			`begin += ldap_utf8_strspn( begin, sep );`

			`if( *begin == '\0' ) {`
			`*last = NULL;`
			`return NULL;`
			`}`

			`end = &begin[ ldap_utf8_strcpn( begin, sep ) ];`

			`if( *end != '\0' ) {`
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`char *next = LDAP_UTF8_NEXT( end );`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`*end = '\0';`
Add some robustness to UTF-8 routines. 2000-01-24 02:43:30 +08:00			`end = next;`
Add initial (untested) implementations of ldap_utf8_strtok, ldap_utf8_strcspn, ldap_utf8_strtok. 2000-01-23 04:55:43 +08:00			`}`

			`*last = end;`
			`return begin;`
			`}`