mirror of
https://git.openldap.org/openldap/openldap.git
synced 2025-01-06 10:46:21 +08:00
213 lines
6.1 KiB
Plaintext
213 lines
6.1 KiB
Plaintext
|
#
|
||
|
# $Id: README,v 1.3 1999/09/21 15:47:43 mleisher Exp $
|
||
|
#
|
||
|
# Copyright 1997, 1998, 1999 Computing Research Labs,
|
||
|
# New Mexico State University
|
||
|
#
|
||
|
# Permission is hereby granted, free of charge, to any person obtaining a
|
||
|
# copy of this software and associated documentation files (the "Software"),
|
||
|
# to deal in the Software without restriction, including without limitation
|
||
|
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||
|
# and/or sell copies of the Software, and to permit persons to whom the
|
||
|
# Software is furnished to do so, subject to the following conditions:
|
||
|
#
|
||
|
# The above copyright notice and this permission notice shall be included in
|
||
|
# all copies or substantial portions of the Software.
|
||
|
#
|
||
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||
|
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||
|
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||
|
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||
|
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||
|
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||
|
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||
|
#
|
||
|
|
||
|
|
||
|
Unicode and Regular Expressions
|
||
|
Version 0.5
|
||
|
|
||
|
This is a simple regular expression package for matching against Unicode text
|
||
|
in UCS2 form. The implementation of this URE package is a variation on the
|
||
|
RE->DFA algorithm done by Mark Hopkins (markh@csd4.csd.uwm.edu). Mark
|
||
|
Hopkins' algorithm had the virtue of being very simple, so it was used as a
|
||
|
model.
|
||
|
|
||
|
---------------------------------------------------------------------------
|
||
|
|
||
|
Assumptions:
|
||
|
|
||
|
o Regular expression and text are already normalized.
|
||
|
|
||
|
o Conversion to lower case assumes a 1-1 mapping.
|
||
|
|
||
|
Definitions:
|
||
|
|
||
|
Separator - any one of U+2028, U+2029, '\n', '\r'.
|
||
|
|
||
|
Operators:
|
||
|
. - match any character.
|
||
|
* - match zero or more of the last subexpression.
|
||
|
+ - match one or more of the last subexpression.
|
||
|
? - match zero or one of the last subexpression.
|
||
|
() - subexpression grouping.
|
||
|
|
||
|
Notes:
|
||
|
|
||
|
o The "." operator normally does not match separators, but a flag is
|
||
|
available for the ure_exec() function that will allow this operator to
|
||
|
match a separator.
|
||
|
|
||
|
Literals and Constants:
|
||
|
|
||
|
c - literal UCS2 character.
|
||
|
\x.... - hexadecimal number of up to 4 digits.
|
||
|
\X.... - hexadecimal number of up to 4 digits.
|
||
|
\u.... - hexadecimal number of up to 4 digits.
|
||
|
\U.... - hexadecimal number of up to 4 digits.
|
||
|
|
||
|
Character classes:
|
||
|
|
||
|
[...] - Character class.
|
||
|
[^...] - Negated character class.
|
||
|
\pN1,N2,...,Nn - Character properties class.
|
||
|
\PN1,N2,...,Nn - Negated character properties class.
|
||
|
|
||
|
POSIX character classes recognized:
|
||
|
|
||
|
:alnum:
|
||
|
:alpha:
|
||
|
:cntrl:
|
||
|
:digit:
|
||
|
:graph:
|
||
|
:lower:
|
||
|
:print:
|
||
|
:punct:
|
||
|
:space:
|
||
|
:upper:
|
||
|
:xdigit:
|
||
|
|
||
|
Notes:
|
||
|
|
||
|
o Character property classes are \p or \P followed by a comma-separated
|
||
|
list of integers between 1 and 32. These integers are references to
|
||
|
the following character properties:
|
||
|
|
||
|
N Character Property
|
||
|
--------------------------
|
||
|
1 _URE_NONSPACING
|
||
|
2 _URE_COMBINING
|
||
|
3 _URE_NUMDIGIT
|
||
|
4 _URE_NUMOTHER
|
||
|
5 _URE_SPACESEP
|
||
|
6 _URE_LINESEP
|
||
|
7 _URE_PARASEP
|
||
|
8 _URE_CNTRL
|
||
|
9 _URE_PUA
|
||
|
10 _URE_UPPER
|
||
|
11 _URE_LOWER
|
||
|
12 _URE_TITLE
|
||
|
13 _URE_MODIFIER
|
||
|
14 _URE_OTHERLETTER
|
||
|
15 _URE_DASHPUNCT
|
||
|
16 _URE_OPENPUNCT
|
||
|
17 _URE_CLOSEPUNCT
|
||
|
18 _URE_OTHERPUNCT
|
||
|
19 _URE_MATHSYM
|
||
|
20 _URE_CURRENCYSYM
|
||
|
21 _URE_OTHERSYM
|
||
|
22 _URE_LTR
|
||
|
23 _URE_RTL
|
||
|
24 _URE_EURONUM
|
||
|
25 _URE_EURONUMSEP
|
||
|
26 _URE_EURONUMTERM
|
||
|
27 _URE_ARABNUM
|
||
|
28 _URE_COMMONSEP
|
||
|
29 _URE_BLOCKSEP
|
||
|
30 _URE_SEGMENTSEP
|
||
|
31 _URE_WHITESPACE
|
||
|
32 _URE_OTHERNEUT
|
||
|
|
||
|
o Character classes can contain literals, constants, and character
|
||
|
property classes. Example:
|
||
|
|
||
|
[abc\U10A\p1,3,4]
|
||
|
|
||
|
---------------------------------------------------------------------------
|
||
|
|
||
|
Before using URE
|
||
|
----------------
|
||
|
Before URE is used, two functions need to be created: one to check if a
|
||
|
character matches a set of URE character properties, and one to convert a
|
||
|
character to lower case.
|
||
|
|
||
|
Stubs for these functions are located in the urestubs.c file.
|
||
|
|
||
|
Using URE
|
||
|
---------
|
||
|
|
||
|
Sample pseudo-code fragment.
|
||
|
|
||
|
ure_buffer_t rebuf;
|
||
|
ure_dfa_t dfa;
|
||
|
ucs2_t *re, *text;
|
||
|
unsigned long relen, textlen;
|
||
|
unsigned long match_start, match_end;
|
||
|
|
||
|
/*
|
||
|
* Allocate the dynamic storage needed to compile regular expressions.
|
||
|
*/
|
||
|
rebuf = ure_buffer_create();
|
||
|
|
||
|
for each regular expression in a list {
|
||
|
re = next regular expression;
|
||
|
relen = length(re);
|
||
|
|
||
|
/*
|
||
|
* Compile the regular expression with the case insensitive flag
|
||
|
* turned on.
|
||
|
*/
|
||
|
dfa = ure_compile(re, relen, 1, rebuf);
|
||
|
|
||
|
/*
|
||
|
* Look for the first match in some text. The matching will be done
|
||
|
* in a case insensitive manner because the expression was compiled
|
||
|
* with the case insensitive flag on.
|
||
|
*/
|
||
|
if (ure_exec(dfa, 0, text, textlen, &match_start, &match_end))
|
||
|
printf("MATCH: %lu %lu\n", match_start, match_end);
|
||
|
|
||
|
/*
|
||
|
* Look for the first match in some text, ignoring non-spacing
|
||
|
* characters.
|
||
|
*/
|
||
|
if (ure_exec(dfa, URE_IGNORE_NONSPACING, text, textlen,
|
||
|
&match_start, &match_end))
|
||
|
printf("MATCH: %lu %lu\n", match_start, match_end);
|
||
|
|
||
|
/*
|
||
|
* Free the DFA.
|
||
|
*/
|
||
|
ure_free_dfa(dfa);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Free the dynamic storage used for compiling the expressions.
|
||
|
*/
|
||
|
ure_free_buffer(rebuf);
|
||
|
|
||
|
---------------------------------------------------------------------------
|
||
|
|
||
|
Mark Leisher <mleisher@crl.nmsu.edu>
|
||
|
29 March 1997
|
||
|
|
||
|
===========================================================================
|
||
|
|
||
|
CHANGES
|
||
|
-------
|
||
|
|
||
|
Version: 0.5
|
||
|
Date : 21 September 1999
|
||
|
==========================
|
||
|
1. Added copyright stuff and put in CVS.
|