mirror of
https://git.openldap.org/openldap/openldap.git
synced 2024-12-21 03:10:25 +08:00
122 lines
4.0 KiB
Plaintext
122 lines
4.0 KiB
Plaintext
#
|
|
# $Id: README,v 1.1 1999/09/21 15:45:17 mleisher Exp $
|
|
#
|
|
# Copyright 1997, 1998, 1999 Computing Research Labs,
|
|
# New Mexico State University
|
|
#
|
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
|
# copy of this software and associated documentation files (the "Software"),
|
|
# to deal in the Software without restriction, including without limitation
|
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
# and/or sell copies of the Software, and to permit persons to whom the
|
|
# Software is furnished to do so, subject to the following conditions:
|
|
#
|
|
# The above copyright notice and this permission notice shall be included in
|
|
# all copies or substantial portions of the Software.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
|
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
|
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
|
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
#
|
|
|
|
Unicode and Boyer-Moore Searching
|
|
Version 0.2
|
|
|
|
UTBM (Unicode Tuned Boyer-Moore) is a simple package that provides tuned
|
|
Boyer-Moore searches on Unicode UCS2 text (handles high and low surrogates).
|
|
|
|
---------------------------------------------------------------------------
|
|
|
|
Assumptions:
|
|
|
|
o Search pattern and text already normalized in some fashion.
|
|
|
|
o Upper, lower, and title case conversions are one-to-one.
|
|
|
|
o For conversions between upper, lower, and title case, UCS2 characters
|
|
always convert to other UCS2 characters, and UTF-16 characters always
|
|
convert to other UTF-16 characters.
|
|
|
|
Flags:
|
|
|
|
UTBM provides three processing flags:
|
|
|
|
o UTBM_CASEFOLD - search in a case-insensitive manner.
|
|
|
|
o UTBM_IGNORE_NONSPACING - ignore non-spacing characters in the pattern and
|
|
the text.
|
|
|
|
o UTBM_SPACE_COMPRESS - view as a *single space*, sequential groups of
|
|
U+2028, U+2029, '\n', '\r', '\t', and any
|
|
character identified as a space by the Unicode
|
|
support on the platform.
|
|
|
|
This flag also causes all characters identified
|
|
as control by the Unicode support on the
|
|
platform to be ignored (except for '\n', '\r',
|
|
and '\t').
|
|
|
|
---------------------------------------------------------------------------
|
|
|
|
Before using UTBM
|
|
-----------------
|
|
Before UTBM is used, some functions need to be created. The "utbmstub.c" file
|
|
contains stubs that need to be rewritten so they work with the Unicode support
|
|
on the platform on which this package is being used.
|
|
|
|
Using UTBM
|
|
----------
|
|
|
|
Sample pseudo-code fragment.
|
|
|
|
utbm_pattern_t pat;
|
|
ucs2_t *pattern, *text;
|
|
unsigned long patternlen, textlen;
|
|
unsigned long flags, match_start, match_end;
|
|
|
|
/*
|
|
* Allocate the dynamic storage needed for a search pattern.
|
|
*/
|
|
pat = utbm_create_pattern();
|
|
|
|
/*
|
|
* Set the search flags desired.
|
|
*/
|
|
flags = UTBM_CASEFOLD|UTBM_IGNORE_NONSPACING;
|
|
|
|
/*
|
|
* Compile the search pattern.
|
|
*/
|
|
utbm_compile(pattern, patternlen, flags, pat);
|
|
|
|
/*
|
|
* Find the first occurrence of the search pattern in the text.
|
|
*/
|
|
if (utbm_exec(pat, text, textlen, &match_start, &match_end))
|
|
printf("MATCH: %lu %lu\n", match_start, match_end);
|
|
|
|
/*
|
|
* Free the dynamic storage used for the search pattern.
|
|
*/
|
|
utbm_free_pattern(pat);
|
|
|
|
---------------------------------------------------------------------------
|
|
|
|
Mark Leisher <mleisher@crl.nmsu.edu>
|
|
2 May 1997
|
|
|
|
===========================================================================
|
|
|
|
CHANGES
|
|
-------
|
|
|
|
Version: 0.2
|
|
Date : 21 September 1999
|
|
==========================
|
|
1. Added copyright stuff and put in CVS.
|
|
|