mirror of
https://git.openldap.org/openldap/openldap.git
synced 2025-01-30 13:30:57 +08:00
Added code for approximate matching in UTF8bvnormalize() and changed to use
this in approxMatch etc in schema_init.c
This commit is contained in:
parent
5d347b6153
commit
94983da942
@ -141,6 +141,7 @@ LDAP_LUNICODE_F(void) ucstr2upper(
|
||||
#define LDAP_UTF8_CASEFOLD 0x1U
|
||||
#define LDAP_UTF8_ARG1NFC 0x2U
|
||||
#define LDAP_UTF8_ARG2NFC 0x4U
|
||||
#define LDAP_UTF8_APPROX 0x8U
|
||||
|
||||
LDAP_LUNICODE_F(char *) UTF8normalize(
|
||||
struct berval *,
|
||||
|
@ -245,12 +245,14 @@ char * UTF8normalize(
|
||||
struct berval * UTF8bvnormalize(
|
||||
struct berval *bv,
|
||||
struct berval *newbv,
|
||||
unsigned casefold )
|
||||
unsigned flags )
|
||||
{
|
||||
int i, j, len, clen, outpos, ucsoutlen, outsize, last;
|
||||
char *out, *s;
|
||||
unsigned long *ucs, *p, *ucsout;
|
||||
|
||||
|
||||
unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
|
||||
unsigned approx = flags & LDAP_UTF8_APPROX;
|
||||
static unsigned char mask[] = {
|
||||
0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
|
||||
|
||||
@ -361,20 +363,28 @@ struct berval * UTF8bvnormalize(
|
||||
}
|
||||
/* normalize ucs of length p - ucs */
|
||||
uccanondecomp( ucs, p - ucs, &ucsout, &ucsoutlen );
|
||||
ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
|
||||
/* convert ucs to utf-8 and store in out */
|
||||
for ( j = 0; j < ucsoutlen; j++ ) {
|
||||
/* allocate more space if not enough room for
|
||||
6 bytes and terminator */
|
||||
if ( outsize - outpos < 7 ) {
|
||||
outsize = ucsoutlen - j + outpos + 6;
|
||||
out = (char *) realloc( out, outsize );
|
||||
if ( out == NULL ) {
|
||||
free( ucs );
|
||||
return NULL;
|
||||
if ( approx ) {
|
||||
for ( j = 0; j < ucsoutlen; j++ ) {
|
||||
if ( ucsout[j] < 0x80 ) {
|
||||
out[outpos++] = ucsout[j];
|
||||
}
|
||||
}
|
||||
outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
|
||||
} else {
|
||||
ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
|
||||
/* convert ucs to utf-8 and store in out */
|
||||
for ( j = 0; j < ucsoutlen; j++ ) {
|
||||
/* allocate more space if not enough room for
|
||||
6 bytes and terminator */
|
||||
if ( outsize - outpos < 7 ) {
|
||||
outsize = ucsoutlen - j + outpos + 6;
|
||||
out = (char *) realloc( out, outsize );
|
||||
if ( out == NULL ) {
|
||||
free( ucs );
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
|
||||
}
|
||||
}
|
||||
|
||||
if ( i == len ) {
|
||||
|
@ -646,30 +646,6 @@ err:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Strip characters with the 8th bit set */
|
||||
static char *
|
||||
strip8bitChars(
|
||||
char *in )
|
||||
{
|
||||
char *p = in, *q;
|
||||
|
||||
if( in == NULL ) {
|
||||
return NULL;
|
||||
}
|
||||
while( *p ) {
|
||||
if( *p & 0x80 ) {
|
||||
q = p;
|
||||
while( *++q & 0x80 ) {
|
||||
/* empty */
|
||||
}
|
||||
p = AC_MEMCPY(p, q, strlen(q) + 1);
|
||||
} else {
|
||||
p++;
|
||||
}
|
||||
}
|
||||
return in;
|
||||
}
|
||||
|
||||
#ifndef SLAPD_APPROX_OLDSINGLESTRING
|
||||
|
||||
#if defined(SLAPD_APPROX_INITIALS)
|
||||
@ -689,31 +665,27 @@ approxMatch(
|
||||
struct berval *value,
|
||||
void *assertedValue )
|
||||
{
|
||||
char *val, *nval, *assertv, **values, **words, *c;
|
||||
struct berval *nval, *assertv;
|
||||
char *val, **values, **words, *c;
|
||||
int i, count, len, nextchunk=0, nextavail=0;
|
||||
size_t avlen;
|
||||
|
||||
/* Yes, this is necessary */
|
||||
nval = UTF8normalize( value, LDAP_UTF8_NOCASEFOLD );
|
||||
nval = UTF8bvnormalize( value, NULL, LDAP_UTF8_APPROX );
|
||||
if( nval == NULL ) {
|
||||
*matchp = 1;
|
||||
return LDAP_SUCCESS;
|
||||
}
|
||||
strip8bitChars( nval );
|
||||
|
||||
/* Yes, this is necessary */
|
||||
assertv = UTF8normalize( ((struct berval *)assertedValue),
|
||||
LDAP_UTF8_NOCASEFOLD );
|
||||
assertv = UTF8bvnormalize( ((struct berval *)assertedValue), NULL, LDAP_UTF8_APPROX );
|
||||
if( assertv == NULL ) {
|
||||
ch_free( nval );
|
||||
ber_bvfree( nval );
|
||||
*matchp = 1;
|
||||
return LDAP_SUCCESS;
|
||||
}
|
||||
strip8bitChars( assertv );
|
||||
avlen = strlen( assertv );
|
||||
|
||||
/* Isolate how many words there are */
|
||||
for( c=nval,count=1; *c; c++ ) {
|
||||
for ( c = nval->bv_val, count = 1; *c; c++ ) {
|
||||
c = strpbrk( c, SLAPD_APPROX_DELIMITER );
|
||||
if ( c == NULL ) break;
|
||||
*c = '\0';
|
||||
@ -723,7 +695,7 @@ approxMatch(
|
||||
/* Get a phonetic copy of each word */
|
||||
words = (char **)ch_malloc( count * sizeof(char *) );
|
||||
values = (char **)ch_malloc( count * sizeof(char *) );
|
||||
for( c=nval,i=0; i<count; i++,c+=strlen(c)+1 ) {
|
||||
for ( c = nval->bv_val, i = 0; i < count; i++, c += strlen(c) + 1 ) {
|
||||
words[i] = c;
|
||||
values[i] = phonetic(c);
|
||||
}
|
||||
@ -731,8 +703,8 @@ approxMatch(
|
||||
/* Work through the asserted value's words, to see if at least some
|
||||
of the words are there, in the same order. */
|
||||
len = 0;
|
||||
while ( (size_t) nextchunk < avlen ) {
|
||||
len = strcspn( assertv + nextchunk, SLAPD_APPROX_DELIMITER);
|
||||
while ( (ber_len_t) nextchunk < assertv->bv_len ) {
|
||||
len = strcspn( assertv->bv_val + nextchunk, SLAPD_APPROX_DELIMITER);
|
||||
if( len == 0 ) {
|
||||
nextchunk++;
|
||||
continue;
|
||||
@ -741,7 +713,7 @@ approxMatch(
|
||||
else if( len == 1 ) {
|
||||
/* Single letter words need to at least match one word's initial */
|
||||
for( i=nextavail; i<count; i++ )
|
||||
if( !strncasecmp( assertv+nextchunk, words[i], 1 )) {
|
||||
if( !strncasecmp( assertv->bv_val + nextchunk, words[i], 1 )) {
|
||||
nextavail=i+1;
|
||||
break;
|
||||
}
|
||||
@ -749,8 +721,8 @@ approxMatch(
|
||||
#endif
|
||||
else {
|
||||
/* Isolate the next word in the asserted value and phonetic it */
|
||||
assertv[nextchunk+len] = '\0';
|
||||
val = phonetic( assertv + nextchunk );
|
||||
assertv->bv_val[nextchunk+len] = '\0';
|
||||
val = phonetic( assertv->bv_val + nextchunk );
|
||||
|
||||
/* See if this phonetic chunk is in the remaining words of *value */
|
||||
for( i=nextavail; i<count; i++ ){
|
||||
@ -781,13 +753,13 @@ approxMatch(
|
||||
}
|
||||
|
||||
/* Cleanup allocs */
|
||||
free( assertv );
|
||||
ber_bvfree( assertv );
|
||||
for( i=0; i<count; i++ ) {
|
||||
ch_free( values[i] );
|
||||
}
|
||||
ch_free( values );
|
||||
ch_free( words );
|
||||
ch_free( nval );
|
||||
ber_bvfree( nval );
|
||||
|
||||
return LDAP_SUCCESS;
|
||||
}
|
||||
@ -802,18 +774,18 @@ approxIndexer(
|
||||
BerVarray values,
|
||||
BerVarray *keysp )
|
||||
{
|
||||
char *val, *c;
|
||||
char *c;
|
||||
int i,j, len, wordcount, keycount=0;
|
||||
struct berval *newkeys;
|
||||
struct berval *val, *newkeys;
|
||||
BerVarray keys=NULL;
|
||||
|
||||
for( j=0; values[j].bv_val != NULL; j++ ) {
|
||||
/* Yes, this is necessary */
|
||||
val = UTF8normalize( &values[j], LDAP_UTF8_NOCASEFOLD );
|
||||
strip8bitChars( val );
|
||||
val = UTF8bvnormalize( &values[j], NULL, LDAP_UTF8_APPROX );
|
||||
assert( val != NULL && val->bv_val != NULL );
|
||||
|
||||
/* Isolate how many words there are. There will be a key for each */
|
||||
for( wordcount=0,c=val; *c; c++) {
|
||||
for( wordcount = 0, c = val->bv_val; *c; c++) {
|
||||
len = strcspn(c, SLAPD_APPROX_DELIMITER);
|
||||
if( len >= SLAPD_APPROX_WORDLEN ) wordcount++;
|
||||
c+= len;
|
||||
@ -829,7 +801,7 @@ approxIndexer(
|
||||
keys = newkeys;
|
||||
|
||||
/* Get a phonetic copy of each word */
|
||||
for( c=val,i=0; i<wordcount; c+=len+1 ) {
|
||||
for( c = val->bv_val, i = 0; i < wordcount; c += len + 1 ) {
|
||||
len = strlen( c );
|
||||
if( len < SLAPD_APPROX_WORDLEN ) continue;
|
||||
ber_str2bv( phonetic( c ), 0, 0, &keys[keycount] );
|
||||
@ -837,7 +809,7 @@ approxIndexer(
|
||||
i++;
|
||||
}
|
||||
|
||||
free( val );
|
||||
ber_bvfree( val );
|
||||
}
|
||||
keys[keycount].bv_val = NULL;
|
||||
*keysp = keys;
|
||||
@ -855,23 +827,23 @@ approxFilter(
|
||||
void * assertValue,
|
||||
BerVarray *keysp )
|
||||
{
|
||||
char *val, *c;
|
||||
char *c;
|
||||
int i, count, len;
|
||||
struct berval *val;
|
||||
BerVarray keys;
|
||||
|
||||
/* Yes, this is necessary */
|
||||
val = UTF8normalize( ((struct berval *)assertValue),
|
||||
LDAP_UTF8_NOCASEFOLD );
|
||||
if( val == NULL ) {
|
||||
val = UTF8bvnormalize( ((struct berval *)assertValue), NULL, LDAP_UTF8_APPROX );
|
||||
if( val == NULL || val->bv_val == NULL ) {
|
||||
keys = (struct berval *)ch_malloc( sizeof(struct berval) );
|
||||
keys[0].bv_val = NULL;
|
||||
*keysp = keys;
|
||||
ber_bvfree( val );
|
||||
return LDAP_SUCCESS;
|
||||
}
|
||||
strip8bitChars( val );
|
||||
|
||||
/* Isolate how many words there are. There will be a key for each */
|
||||
for( count=0,c=val; *c; c++) {
|
||||
for( count = 0,c = val->bv_val; *c; c++) {
|
||||
len = strcspn(c, SLAPD_APPROX_DELIMITER);
|
||||
if( len >= SLAPD_APPROX_WORDLEN ) count++;
|
||||
c+= len;
|
||||
@ -883,14 +855,14 @@ approxFilter(
|
||||
keys = (struct berval *)ch_malloc( (count + 1) * sizeof(struct berval) );
|
||||
|
||||
/* Get a phonetic copy of each word */
|
||||
for( c=val,i=0; i<count; c+=len+1 ) {
|
||||
for( c = val->bv_val, i = 0; i < count; c += len + 1 ) {
|
||||
len = strlen(c);
|
||||
if( len < SLAPD_APPROX_WORDLEN ) continue;
|
||||
ber_str2bv( phonetic( c ), 0, 0, &keys[i] );
|
||||
i++;
|
||||
}
|
||||
|
||||
free( val );
|
||||
ber_bvfree( val );
|
||||
|
||||
keys[count].bv_val = NULL;
|
||||
*keysp = keys;
|
||||
|
Loading…
Reference in New Issue
Block a user