2003-07-21 18:27:44 +08:00
|
|
|
%{
|
|
|
|
#include "postgres.h"
|
|
|
|
|
|
|
|
#include "deflex.h"
|
|
|
|
#include "parser.h"
|
|
|
|
#include "common.h"
|
|
|
|
|
|
|
|
/*
 * Avoid exit() on fatal scanner errors: any three-argument
 * fprintf(file, fmt, msg) call in this file is rewritten into
 * ts_error(ERROR, fmt, msg); the "file" argument is dropped.
 *
 * NOTE(review): this presumably targets the fprintf() in the fatal-error
 * routine that flex generates, and relies on ts_error(ERROR, ...) not
 * returning -- confirm against the generated scanner and ts_error().
 */
#define fprintf(file, fmt, msg) ts_error(ERROR, fmt, msg)
|
|
|
|
|
|
|
|
char *token = NULL; /* pointer to token */
|
2003-12-05 22:27:42 +08:00
|
|
|
int tokenlen;
|
2004-06-29 00:19:09 +08:00
|
|
|
static char *s = NULL; /* to return WHOLE hyphenated-word */
|
2003-07-21 18:27:44 +08:00
|
|
|
|
|
|
|
YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
|
|
|
|
|
2004-06-29 00:19:09 +08:00
|
|
|
/*
 * Growable buffer in which the text of an HTML tag, comment or script
 * element is accumulated piece by piece (see addTag() / startTag()).
 */
typedef struct {
	int tlen;	/* allocated size of str */
	int clen;	/* bytes currently stored in str, not counting the '\0' */
	char *str;	/* malloc'ed, NUL-terminated storage; NULL until first use */
} TagStorage;

/* The single accumulator shared by all tag rules; starts out empty. */
static TagStorage ts={0,0,NULL};
|
|
|
|
|
|
|
|
static void
|
|
|
|
addTag() {
|
|
|
|
while( ts.clen+tsearch2_yyleng+1 > ts.tlen ) {
|
|
|
|
ts.tlen*=2;
|
|
|
|
ts.str=realloc(ts.str,ts.tlen);
|
|
|
|
if (!ts.str)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OUT_OF_MEMORY),
|
|
|
|
errmsg("out of memory")));
|
|
|
|
}
|
|
|
|
memcpy(ts.str+ts.clen,tsearch2_yytext,tsearch2_yyleng);
|
|
|
|
ts.clen+=tsearch2_yyleng;
|
|
|
|
ts.str[ts.clen]='\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
startTag() {
|
|
|
|
if ( ts.str==NULL ) {
|
|
|
|
ts.tlen=tsearch2_yyleng+1;
|
|
|
|
ts.str=malloc(ts.tlen);
|
|
|
|
if (!ts.str)
|
|
|
|
ereport(ERROR,
|
|
|
|
(errcode(ERRCODE_OUT_OF_MEMORY),
|
|
|
|
errmsg("out of memory")));
|
|
|
|
}
|
|
|
|
ts.clen=0;
|
|
|
|
ts.str[0]='\0';
|
|
|
|
addTag();
|
|
|
|
}
|
|
|
|
|
2003-07-21 18:27:44 +08:00
|
|
|
%}
|
|
|
|
|
|
|
|
%option 8bit
%option never-interactive
%option nodefault
%option nounput
%option noyywrap

/* parser's state for parsing a hyphenated word */
%x DELIM

/* parser's states for parsing a URL */
%x URL
%x SERVER

/* parser's states for parsing TAGS */
%x INTAG
%x QINTAG
%x INCOMMENT
%x INSCRIPT

/* cyrillic koi8 chars: every byte with the high bit set (\200-\377) */
CYRALNUM [0-9\200-\377]
CYRALPHA [\200-\377]
ALPHA [a-zA-Z\200-\377]
ALNUM [0-9a-zA-Z\200-\377]

/* dot-separated labels ending in an alphabetic label, and the URL remainder */
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
URI [-_[:alnum:]/%,\.;=&?#]+
|
|
|
|
|
|
|
|
%%
|
|
|
|
|
2004-06-29 00:19:09 +08:00
|
|
|
"<"[Ss][Cc][Rr][Ii][Pp][Tt] {
	/* opening of a script element: collect everything up to </script> */
	BEGIN INSCRIPT;
	startTag();
}

<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
	/* end of the script element: hand out the accumulated text as one TAG */
	BEGIN INITIAL;
	addTag();
	token = ts.str;
	tokenlen = ts.clen;
	return TAG;
}

"<!--" {
	/* opening of a comment: collect everything up to --> */
	BEGIN INCOMMENT;
	startTag();
}

<INCOMMENT>"-->" {
	/* end of the comment: hand out the accumulated text as one TAG */
	BEGIN INITIAL;
	addTag();
	token = ts.str;
	tokenlen = ts.clen;
	return TAG;
}

"<"[\![:alpha:]] {
	/* opening tag or <!... declaration */
	BEGIN INTAG;
	startTag();
}

"</"[[:alpha:]] {
	/* closing tag */
	BEGIN INTAG;
	startTag();
}

<INTAG>"\"" {
	/* start of a double-quoted section inside a tag */
	BEGIN QINTAG;
	addTag();
}

<QINTAG>"\\\"" {
	/* backslash-escaped quote does not end the quoted section */
	addTag();
}

<QINTAG>"\"" {
	/* end of the double-quoted section */
	BEGIN INTAG;
	addTag();
}

<INTAG>">" {
	/* end of the tag: hand out the accumulated text as one TAG */
	BEGIN INITIAL;
	addTag();
	token = ts.str;
	tokenlen = ts.clen;
	return TAG;
}

<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n {
	/* any other character inside a tag/comment/script is just accumulated */
	addTag();
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
|
|
|
\&(quot|amp|nbsp|lt|gt)\; {
	/* named HTML entity */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTMLENTITY;
}

\&\#[0-9][0-9]?[0-9]?\; {
	/* numeric HTML entity with one to three decimal digits */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTMLENTITY;
}

[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return EMAIL;
}

[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SCIENTIFIC;
}

[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
	/* at least three dot-separated digit groups, e.g. 1.2.3 */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return VERSIONNUMBER;
}

[+-]?[0-9]+\.[0-9]+ {
	/* optionally signed number with a fractional part */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return DECIMAL;
}

[+-][0-9]+ {
	/* integer with an explicit sign */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SIGNEDINT;
}

<DELIM,INITIAL>[0-9]+ {
	/* plain digit string; also recognised inside a hyphenated word */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return UNSIGNEDINT;
}
|
|
|
|
|
|
|
|
http"://" {
	/* protocol prefix: what follows is scanned in the URL state */
	BEGIN URL;
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTTP;
}

ftp"://" {
	/* reported with the same HTTP token type as http:// */
	BEGIN URL;
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HTTP;
}

<URL,INITIAL>{HOSTNAME}[/:]{URI} {
	/*
	 * Whole URL (host plus path).  Return a private copy of it, then push
	 * the text back with yyless(0) so that the SERVER-state rules return
	 * the host and the path as separate tokens.  The copy and tokenlen
	 * must be taken before yyless(0) discards the match.
	 */
	BEGIN SERVER;
	free(s);
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return FURL;
}

<SERVER,URL,INITIAL>{HOSTNAME} {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return HOST;
}

<SERVER>[/:]{URI} {
	/* the part of the URL that follows the host */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return URI;
}

[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return FILEPATH;
}
|
|
|
|
|
|
|
|
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
	/*
	 * Return a private copy of the WHOLE hyphenated word, then push the
	 * text back with yyless(0) so that the DELIM-state rules return its
	 * parts one by one.  The copy and tokenlen must be taken before
	 * yyless(0) discards the match.
	 */
	BEGIN DELIM;
	free(s);
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return CYRHYPHENWORD;
}

([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
	/* same scheme as above, for words made of [[:alpha:]] parts */
	BEGIN DELIM;
	free(s);
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return LATHYPHENWORD;
}

({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
	/* same scheme as above, for words made of mixed ALNUM parts */
	BEGIN DELIM;
	free(s);
	s = strdup( tsearch2_yytext );
	if (!s)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch2_yyleng;
	yyless( 0 );
	token = s;
	return HYPHENWORD;
}
|
|
|
|
|
|
|
|
<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return VERSIONNUMBER;
}

<DELIM>\+?[0-9]+\.[0-9]+ {
	/* only '+' is accepted as a sign here; '-' has its own DELIM rule */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return DECIMAL;
}

<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return CYRPARTHYPHENWORD;
}

<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return LATPARTHYPHENWORD;
}

<DELIM>{ALNUM}+ /* one word in composite-word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return PARTHYPHENWORD;
}

<DELIM>- {
	/* the hyphen between the parts is reported as SPACE */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SPACE;
}

<DELIM,SERVER,URL>.|\n /* return in basic state */ {
	/* nothing above matched: rescan this character in the INITIAL state */
	BEGIN INITIAL;
	yyless( 0 );
}
|
|
|
|
|
|
|
|
{CYRALPHA}+ /* normal word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return CYRWORD;
}

[[:alpha:]]+ /* normal word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return LATWORD;
}

{ALNUM}+ /* normal word */ {
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return UWORD;
}

[ \r\n\t]+ {
	/* a run of whitespace is one SPACE token */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SPACE;
}

. {
	/* any other single character is also reported as SPACE */
	token = tsearch2_yytext;
	tokenlen = tsearch2_yyleng;
	return SPACE;
}
|
|
|
|
|
|
|
|
%%
|
|
|
|
|
|
|
|
/* clearing after parsing from string */
|
2003-12-05 22:27:42 +08:00
|
|
|
void tsearch2_end_parse() {
|
2003-07-21 18:27:44 +08:00
|
|
|
if (s) { free(s); s=NULL; }
|
|
|
|
tsearch2_yy_delete_buffer( buf );
|
|
|
|
buf = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* start parse from string */
|
2003-12-05 22:27:42 +08:00
|
|
|
void tsearch2_start_parse_str(char* str, int limit) {
|
2003-12-05 23:37:51 +08:00
|
|
|
if (buf) tsearch2_end_parse();
|
2003-07-21 18:27:44 +08:00
|
|
|
buf = tsearch2_yy_scan_bytes( str, limit );
|
|
|
|
tsearch2_yy_switch_to_buffer( buf );
|
|
|
|
BEGIN INITIAL;
|
|
|
|
}
|
2004-06-29 00:19:09 +08:00
|
|
|
|