mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-18 18:44:06 +08:00
August 13, 2002
Use parser of OpenFTS v0.33. -- Teodor Sigaev
This commit is contained in:
parent
1276356268
commit
2860041bf0
@ -4,6 +4,11 @@ a searchable data type (textual) with indexed access.
|
||||
All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov
|
||||
(oleg@sai.msu.su).
|
||||
|
||||
CHANGES:
|
||||
|
||||
August 13, 2002
|
||||
Use parser of OpenFTS v0.33.
|
||||
|
||||
IMPORTANT NOTICE:
|
||||
|
||||
This is a first step of our work on integration of OpenFTS
|
||||
|
@ -2,28 +2,33 @@
|
||||
#define __DEFLEX_H__
|
||||
|
||||
/* remember !!!! */
|
||||
#define LASTNUM 19
|
||||
#define LASTNUM 23
|
||||
|
||||
#define LATWORD 1
|
||||
#define NONLATINWORD 2
|
||||
#define CYRWORD 2
|
||||
#define UWORD 3
|
||||
#define EMAIL 4
|
||||
#define FURL 5
|
||||
#define HOST 6
|
||||
#define FLOAT 7
|
||||
#define FINT 8
|
||||
#define PARTWORD 9
|
||||
#define NONLATINPARTWORD 10
|
||||
#define LATPARTWORD 11
|
||||
#define SPACE 12
|
||||
#define SYMTAG 13
|
||||
#define HTTP 14
|
||||
#define DEFISWORD 15
|
||||
#define DEFISLATWORD 16
|
||||
#define DEFISNONLATINWORD 17
|
||||
#define SCIENTIFIC 7
|
||||
#define VERSIONNUMBER 8
|
||||
#define PARTHYPHENWORD 9
|
||||
#define CYRPARTHYPHENWORD 10
|
||||
#define LATPARTHYPHENWORD 11
|
||||
#define SPACE 12
|
||||
#define TAG 13
|
||||
#define HTTP 14
|
||||
#define HYPHENWORD 15
|
||||
#define LATHYPHENWORD 16
|
||||
#define CYRHYPHENWORD 17
|
||||
#define URI 18
|
||||
#define FILEPATH 19
|
||||
#define DECIMAL 20
|
||||
#define SIGNEDINT 21
|
||||
#define UNSIGNEDINT 22
|
||||
#define HTMLENTITY 23
|
||||
|
||||
extern const char *descr[];
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -689,9 +689,9 @@ SELECT count(*) FROM test_txtidx WHERE a ## '(eq|yt)&(wR|qh)';
|
||||
select txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
|
||||
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
|
||||
<i <b> wow < jqw <> qwerty');
|
||||
txt2txtidx
|
||||
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
|
||||
txt2txtidx
|
||||
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|
||||
'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' 'readline-4' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
|
||||
(1 row)
|
||||
|
||||
select txtidxsize(txt2txtidx('345 qw'));
|
||||
@ -705,7 +705,7 @@ select txtidxsize(txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.e
|
||||
<i <b> wow < jqw <> qwerty'));
|
||||
txtidxsize
|
||||
------------
|
||||
52
|
||||
53
|
||||
(1 row)
|
||||
|
||||
insert into test_txtidx (a) values ('345 qwerty');
|
||||
|
@ -75,19 +75,23 @@ static MAPDICT mapdict[] = {
|
||||
{NODICT, NODICT}, /* EMAIL */
|
||||
{NODICT, NODICT}, /* FURL */
|
||||
{NODICT, NODICT}, /* HOST */
|
||||
{NODICT, NODICT}, /* FLOAT */
|
||||
{NODICT, NODICT}, /* FINT */
|
||||
{BYLOCALE, DEFAULTDICT}, /* PARTWORD */
|
||||
{BYLOCALE, NODICT}, /* NONLATINPARTWORD */
|
||||
{DEFAULTDICT, NODICT}, /* LATPARTWORD */
|
||||
{NODICT, NODICT}, /* SCIENTIFIC */
|
||||
{NODICT, NODICT}, /* VERSIONNUMBER */
|
||||
{BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */
|
||||
{BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */
|
||||
{DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */
|
||||
{STOPLEXEM, NODICT}, /* SPACE */
|
||||
{STOPLEXEM, NODICT}, /* SYMTAG */
|
||||
{STOPLEXEM, NODICT}, /* TAG */
|
||||
{STOPLEXEM, NODICT}, /* HTTP */
|
||||
{BYLOCALE, DEFAULTDICT}, /* DEFISWORD */
|
||||
{DEFAULTDICT, NODICT}, /* DEFISLATWORD */
|
||||
{BYLOCALE, NODICT}, /* DEFISNONLATINWORD */
|
||||
{BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */
|
||||
{DEFAULTDICT, NODICT}, /* LATHYPHENWORD */
|
||||
{BYLOCALE, NODICT}, /* CYRHYPHENWORD */
|
||||
{NODICT, NODICT}, /* URI */
|
||||
{NODICT, NODICT} /* FILEPATH */
|
||||
{NODICT, NODICT}, /* FILEPATH */
|
||||
{NODICT, NODICT}, /* DECIMAL */
|
||||
{NODICT, NODICT}, /* SIGNEDINT */
|
||||
{NODICT, NODICT}, /* UNSIGNEDINT */
|
||||
{STOPLEXEM, NODICT} /* HTMLENTITY */
|
||||
};
|
||||
|
||||
static bool inited = false;
|
||||
|
@ -5,18 +5,17 @@
|
||||
|
||||
/* postgres allocation function */
|
||||
#include "postgres.h"
|
||||
#define free pfree
|
||||
#define malloc palloc
|
||||
#define free pfree
|
||||
#define malloc palloc
|
||||
#define realloc repalloc
|
||||
|
||||
#ifdef strdup
|
||||
#undef strdup
|
||||
#endif
|
||||
#define strdup pstrdup
|
||||
|
||||
#define strdup pstrdup
|
||||
|
||||
char *token = NULL; /* pointer to token */
|
||||
char *s = NULL; /* for returning full defis-word */
|
||||
char *s = NULL; /* to return WHOLE hyphenated-word */
|
||||
|
||||
YY_BUFFER_STATE buf = NULL; /* buffer to parse; needed for parsing from a string */
|
||||
|
||||
@ -57,21 +56,21 @@ int bytestoread = 0; /* for limiting read from filehandle */
|
||||
%option nounput
|
||||
%option noyywrap
|
||||
|
||||
|
||||
/* parser's state for parsing defis-word */
|
||||
/* parser's state for parsing hyphenated-word */
|
||||
%x DELIM
|
||||
/* parser's state for parsing URL*/
|
||||
%x URL
|
||||
%x SERVER
|
||||
|
||||
/* parser's state for parsing filepath */
|
||||
|
||||
/* parser's state for parsing TAGS */
|
||||
%x INTAG
|
||||
%x QINTAG
|
||||
%x INCOMMENT
|
||||
%x INSCRIPT
|
||||
|
||||
/* NONLATIN char */
|
||||
NONLATINALNUM [0-9\200-\377]
|
||||
NONLATINALPHA [\200-\377]
|
||||
/* cyrillic koi8 char */
|
||||
CYRALNUM [0-9\200-\377]
|
||||
CYRALPHA [\200-\377]
|
||||
ALPHA [a-zA-Z\200-\377]
|
||||
ALNUM [0-9a-zA-Z\200-\377]
|
||||
|
||||
@ -81,66 +80,59 @@ URI [-_[:alnum:]/%,\.;=&?#]+
|
||||
|
||||
%%
|
||||
|
||||
"<"[[:alpha:]] { BEGIN INTAG;
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SYMTAG;
|
||||
}
|
||||
"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
|
||||
|
||||
"</"[[:alpha:]] { BEGIN INTAG;
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SYMTAG;
|
||||
}
|
||||
|
||||
"<>" {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SYMTAG;
|
||||
}
|
||||
|
||||
"<"[^>[:alpha:]] {
|
||||
<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
|
||||
BEGIN INITIAL;
|
||||
*tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SPACE;
|
||||
}
|
||||
|
||||
<INTAG>"\"" { BEGIN QINTAG;
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SYMTAG;
|
||||
}
|
||||
"<!--" { BEGIN INCOMMENT; }
|
||||
|
||||
<QINTAG>"\\\"" {
|
||||
<INCOMMENT>"-->" {
|
||||
BEGIN INITIAL;
|
||||
*tsearch_yytext=' '; *(tsearch_yytext+1) = '\0';
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SYMTAG;
|
||||
return SPACE;
|
||||
}
|
||||
|
||||
<QINTAG>"\"" { BEGIN INTAG;
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SYMTAG;
|
||||
}
|
||||
|
||||
<QINTAG>.|\n {
|
||||
"<"[\![:alpha:]] { BEGIN INTAG; }
|
||||
|
||||
"</"[[:alpha:]] { BEGIN INTAG; }
|
||||
|
||||
<INTAG>"\"" { BEGIN QINTAG; }
|
||||
|
||||
<QINTAG>"\\\"" ;
|
||||
|
||||
<QINTAG>"\"" { BEGIN INTAG; }
|
||||
|
||||
<INTAG>">" {
|
||||
BEGIN INITIAL;
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SYMTAG;
|
||||
*tsearch_yytext=' ';
|
||||
token = tsearch_yytext;
|
||||
tokenlen = 1;
|
||||
return TAG;
|
||||
}
|
||||
|
||||
<INTAG>">" { BEGIN INITIAL;
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SYMTAG;
|
||||
}
|
||||
<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ;
|
||||
|
||||
<INTAG>.|\n {
|
||||
\&(quot|amp|nbsp|lt|gt)\; {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SYMTAG;
|
||||
return HTMLENTITY;
|
||||
}
|
||||
|
||||
\&\#[0-9][0-9]?[0-9]?\; {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return HTMLENTITY;
|
||||
}
|
||||
|
||||
[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
|
||||
token = tsearch_yytext;
|
||||
@ -148,22 +140,34 @@ URI [-_[:alnum:]/%,\.;=&?#]+
|
||||
return EMAIL;
|
||||
}
|
||||
|
||||
<DELIM,INITIAL>[0-9] /* digit's and point (might be a version) */ {
|
||||
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return FINT;
|
||||
return SCIENTIFIC;
|
||||
}
|
||||
|
||||
<DELIM,INITIAL>[0-9]+[0-9\.]*[0-9] /* digit's and point (might be a version) */ {
|
||||
token = tsearch_yytext;
|
||||
[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return FINT;
|
||||
return VERSIONNUMBER;
|
||||
}
|
||||
|
||||
[+-]?[0-9\.]+[eE][+-]?[0-9]+ /* float */ {
|
||||
[+-]?[0-9]+\.[0-9]+ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return DECIMAL;
|
||||
}
|
||||
|
||||
[+-][0-9]+ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return FLOAT;
|
||||
return SIGNEDINT;
|
||||
}
|
||||
|
||||
<DELIM,INITIAL>[0-9]+ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return UNSIGNEDINT;
|
||||
}
|
||||
|
||||
http"://" {
|
||||
@ -208,52 +212,58 @@ ftp"://" {
|
||||
return FILEPATH;
|
||||
}
|
||||
|
||||
({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */ {
|
||||
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
|
||||
BEGIN DELIM;
|
||||
if (s) { free(s); s=NULL; }
|
||||
s = strdup( tsearch_yytext );
|
||||
tokenlen = tsearch_yyleng;
|
||||
yyless( 0 );
|
||||
token = s;
|
||||
return DEFISNONLATINWORD;
|
||||
return CYRHYPHENWORD;
|
||||
}
|
||||
|
||||
([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */ {
|
||||
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
|
||||
BEGIN DELIM;
|
||||
if (s) { free(s); s=NULL; }
|
||||
tokenlen = tsearch_yyleng;
|
||||
s = strdup( tsearch_yytext );
|
||||
tokenlen = tsearch_yyleng;
|
||||
yyless( 0 );
|
||||
token = s;
|
||||
return DEFISLATWORD;
|
||||
return LATHYPHENWORD;
|
||||
}
|
||||
|
||||
({ALNUM}+-)+{ALPHA}+ /* composite-word */ {
|
||||
({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
|
||||
BEGIN DELIM;
|
||||
if (s) { free(s); s=NULL; }
|
||||
s = strdup( tsearch_yytext );
|
||||
tokenlen = tsearch_yyleng;
|
||||
yyless( 0 );
|
||||
token = s;
|
||||
return DEFISWORD;
|
||||
return HYPHENWORD;
|
||||
}
|
||||
|
||||
<DELIM>{NONLATINALNUM}+ /* one word in composite-word */ {
|
||||
token = tsearch_yytext;
|
||||
<DELIM>\+?[0-9]+\.[0-9]+ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return NONLATINPARTWORD;
|
||||
return DECIMAL;
|
||||
}
|
||||
|
||||
<DELIM>[[:alnum:]]+ /* one word in composite-word */ {
|
||||
<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return LATPARTWORD;
|
||||
return CYRPARTHYPHENWORD;
|
||||
}
|
||||
|
||||
<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return LATPARTHYPHENWORD;
|
||||
}
|
||||
|
||||
<DELIM>{ALNUM}+ /* one word in composite-word */ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return PARTWORD;
|
||||
return PARTHYPHENWORD;
|
||||
}
|
||||
|
||||
<DELIM>- {
|
||||
@ -264,17 +274,16 @@ ftp"://" {
|
||||
|
||||
<DELIM,SERVER,URL>.|\n /* return in basic state */ {
|
||||
BEGIN INITIAL;
|
||||
tokenlen = tsearch_yyleng;
|
||||
yyless( 0 );
|
||||
}
|
||||
|
||||
{NONLATINALNUM}+ /* normal word */ {
|
||||
{CYRALPHA}+ /* normal word */ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return NONLATINWORD;
|
||||
return CYRWORD;
|
||||
}
|
||||
|
||||
[[:alnum:]]+ /* normal word */ {
|
||||
[[:alpha:]]+ /* normal word */ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return LATWORD;
|
||||
@ -286,7 +295,13 @@ ftp"://" {
|
||||
return UWORD;
|
||||
}
|
||||
|
||||
.|\n {
|
||||
[ \r\n\t]+ {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SPACE;
|
||||
}
|
||||
|
||||
. {
|
||||
token = tsearch_yytext;
|
||||
tokenlen = tsearch_yyleng;
|
||||
return SPACE;
|
||||
|
Loading…
Reference in New Issue
Block a user