2001-10-13 07:19:09 +08:00
|
|
|
%{
|
2002-09-05 08:43:07 +08:00
|
|
|
#include "postgres.h"
|
|
|
|
|
2001-10-13 07:19:09 +08:00
|
|
|
#include "deflex.h"
|
|
|
|
#include "parser.h"
|
|
|
|
|
2003-05-30 06:30:02 +08:00
|
|
|
/* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
|
|
|
|
#define fprintf(file, fmt, msg) ereport(ERROR, (errmsg_internal("%s", msg)))
|
|
|
|
|
2001-10-13 07:19:09 +08:00
|
|
|
char *token = NULL; /* start of the token most recently returned by a rule; its length is in tokenlen */
|
2002-08-15 11:02:08 +08:00
|
|
|
char *s = NULL; /* strdup'ed copy of a WHOLE hyphenated word or full URL; released in end_parse() */
|
2001-10-13 07:19:09 +08:00
|
|
|
|
|
|
|
YY_BUFFER_STATE buf = NULL; /* scan buffer set up by start_parse_str(); needed for parsing from a string */
|
|
|
|
|
|
|
|
%}
|
|
|
|
|
2002-07-31 00:33:08 +08:00
|
|
|
%option 8bit
|
|
|
|
%option never-interactive
|
2004-02-25 06:06:32 +08:00
|
|
|
%option nodefault
|
2002-07-31 00:33:08 +08:00
|
|
|
%option nounput
|
|
|
|
%option noyywrap
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
|
/* parser's state for parsing hyphenated-word */
|
2001-10-13 07:19:09 +08:00
|
|
|
%x DELIM
|
|
|
|
/* parser's states for parsing URLs */
|
|
|
|
%x URL
|
|
|
|
%x SERVER
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
|
/* parser's state for parsing TAGS */
|
2001-10-13 07:19:09 +08:00
|
|
|
%x INTAG
|
|
|
|
%x QINTAG
|
2002-08-15 11:02:08 +08:00
|
|
|
%x INCOMMENT
|
|
|
|
%x INSCRIPT
|
2001-10-13 07:19:09 +08:00
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
|
/* cyrillic koi8 char */
|
|
|
|
CYRALNUM [0-9\200-\377]
|
|
|
|
CYRALPHA [\200-\377]
|
2001-10-13 07:19:09 +08:00
|
|
|
ALPHA [a-zA-Z\200-\377]
|
|
|
|
ALNUM [0-9a-zA-Z\200-\377]
|
|
|
|
|
|
|
|
|
|
|
|
HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
|
|
|
|
URI [-_[:alnum:]/%,\.;=&?#]+
|
|
|
|
|
|
|
|
%%
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
"<"[Ss][Cc][Rr][Ii][Pp][Tt]	{
	/* start of a <script element: switch to the state that skips its body */
	BEGIN INSCRIPT;
}

<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">"	{
	/*
	 * End of the script element.  The whole element is reported as a
	 * single blank: the matched text is overwritten with " " and
	 * NUL-terminated, so the reported length must be 1 (not the length
	 * of the closing tag), matching what the <INTAG>">" rule does.
	 */
	BEGIN INITIAL;
	tsearch_yytext[0] = ' ';
	tsearch_yytext[1] = '\0';
	token = tsearch_yytext;
	tokenlen = 1;
	return SPACE;
}
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
"<!--"	{
	/* start of an HTML comment: switch to the state that skips its body */
	BEGIN INCOMMENT;
}

<INCOMMENT>"-->"	{
	/*
	 * End of the comment.  The whole comment is reported as a single
	 * blank: the matched text is overwritten with " " and NUL-terminated,
	 * so the reported length must be 1 (not the length of "-->"),
	 * matching what the <INTAG>">" rule does.
	 */
	BEGIN INITIAL;
	tsearch_yytext[0] = ' ';
	tsearch_yytext[1] = '\0';
	token = tsearch_yytext;
	tokenlen = 1;
	return SPACE;
}
|
|
|
|
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
"<"[\![:alpha:]]	{
	/* "<" followed by "!" or a letter: start of a tag */
	BEGIN INTAG;
}

"</"[[:alpha:]]	{
	/* "</" followed by a letter: start of a closing tag */
	BEGIN INTAG;
}

<INTAG>"\""	{
	/* opening double quote inside a tag */
	BEGIN QINTAG;
}

<QINTAG>"\\\""	/* backslash-escaped quote: stay inside the quoted text */ ;

<QINTAG>"\""	{
	/* closing double quote: back to plain tag text */
	BEGIN INTAG;
}

<INTAG>">"	{
	/* end of the tag: the whole tag is reported as a one-character blank */
	BEGIN INITIAL;
	tsearch_yytext[0] = ' ';
	token = tsearch_yytext;
	tokenlen = 1;
	return TAG;
}

<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n	/* skip everything else inside tags, comments and scripts */ ;
|
|
|
|
|
|
|
|
\&(quot|amp|nbsp|lt|gt)\;	{
	/* one of the listed named entities */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return HTMLENTITY;
}

\&\#[0-9][0-9]?[0-9]?\;	{
	/* numeric entity with one to three decimal digits */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return HTMLENTITY;
}
|
|
|
|
|
|
|
|
[-_\.[:alnum:]]+@{HOSTNAME} /* e-mail address */	{
	/* local part, "@", then a host name as defined by HOSTNAME */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return EMAIL;
}
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
|
[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */	{
	/* number with an exponent part introduced by e, E, d or D */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return SCIENTIFIC;
}

[0-9]+\.[0-9]+\.[0-9\.]*[0-9]	{
	/* digits separated by at least two dots, ending in a digit */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return VERSIONNUMBER;
}

[+-]?[0-9]+\.[0-9]+	{
	/* optionally signed number with a fractional part */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return DECIMAL;
}

[+-][0-9]+	{
	/* integer with an explicit sign */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return SIGNEDINT;
}

<DELIM,INITIAL>[0-9]+	{
	/* plain run of digits; also recognized inside a hyphenated word */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return UNSIGNEDINT;
}
|
|
|
|
|
|
|
|
http"://"	{
	/* protocol prefix: what follows is scanned in the URL state */
	BEGIN URL;
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return HTTP;
}

ftp"://"	{
	/* ftp prefix is reported with the same token type as http */
	BEGIN URL;
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return HTTP;
}
|
|
|
|
|
|
|
|
<URL,INITIAL>{HOSTNAME}[/:]{URI}	{
	/*
	 * Host name followed by a path: report the whole match as FURL from a
	 * private copy, then push the text back with yyless(0) so the SERVER
	 * state rules rescan it as separate HOST and URI tokens.
	 */
	BEGIN SERVER;
	free(s);
	s = strdup(tsearch_yytext);
	if (s == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch_yyleng;	/* taken before yyless(0) */
	yyless(0);
	token = s;
	return FURL;
}
|
|
|
|
|
|
|
|
<SERVER,URL,INITIAL>{HOSTNAME}	{
	/* bare host name */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return HOST;
}

<SERVER>[/:]{URI}	{
	/* path part, recognized only in the SERVER state */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return URI;
}
|
|
|
|
|
|
|
|
[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+	{
	/* path-like text containing at least one "/" */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return FILEPATH;
}
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
|
({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */	{
	/*
	 * Hyphenated word made of CYRALPHA parts: report the whole word from
	 * a private copy, then push the text back with yyless(0) so the DELIM
	 * state rules rescan it part by part.
	 */
	BEGIN DELIM;
	free(s);
	s = strdup(tsearch_yytext);
	if (s == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch_yyleng;	/* taken before yyless(0) */
	yyless(0);
	token = s;
	return CYRHYPHENWORD;
}
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
|
([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */	{
	/*
	 * Hyphenated word made of [:alpha:] parts: report the whole word from
	 * a private copy, then push the text back with yyless(0) so the DELIM
	 * state rules rescan it part by part.
	 */
	BEGIN DELIM;
	free(s);
	s = strdup(tsearch_yytext);
	if (s == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch_yyleng;	/* taken before yyless(0) */
	yyless(0);
	token = s;
	return LATHYPHENWORD;
}
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
|
({ALNUM}+-)+{ALNUM}+ /* composite-word */	{
	/*
	 * Hyphenated word made of ALNUM parts: report the whole word from a
	 * private copy, then push the text back with yyless(0) so the DELIM
	 * state rules rescan it part by part.
	 */
	BEGIN DELIM;
	free(s);
	s = strdup(tsearch_yytext);
	if (s == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
	tokenlen = tsearch_yyleng;	/* taken before yyless(0) */
	yyless(0);
	token = s;
	return HYPHENWORD;
}
|
|
|
|
|
|
|
|
<DELIM>\+?[0-9]+\.[0-9]+	{
	/* decimal number inside a hyphenated word; only a "+" sign is accepted here */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return DECIMAL;
}

<DELIM>{CYRALPHA}+ /* one word in composite-word */	{
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return CYRPARTHYPHENWORD;
}

<DELIM>[[:alpha:]]+ /* one word in composite-word */	{
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return LATPARTHYPHENWORD;
}

<DELIM>{ALNUM}+ /* one word in composite-word */	{
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return PARTHYPHENWORD;
}

<DELIM>-	{
	/* the hyphen between two parts is reported as a blank token */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return SPACE;
}
|
|
|
|
|
|
|
|
<DELIM,SERVER,URL>.|\n /* return to the basic state */	{
	/*
	 * Nothing else matched in the sub-state: switch back to INITIAL and
	 * push the character back so it is rescanned there.  No token is
	 * returned from this rule.
	 */
	BEGIN INITIAL;
	yyless(0);
}
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
|
{CYRALPHA}+ /* normal word */	{
	/* word made only of CYRALPHA characters */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return CYRWORD;
}

[[:alpha:]]+ /* normal word */	{
	/* word made only of [:alpha:] characters */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return LATWORD;
}

{ALNUM}+ /* normal word */	{
	/* any other run of ALNUM characters */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return UWORD;
}
|
|
|
|
|
2002-08-15 11:02:08 +08:00
|
|
|
[ \r\n\t]+	{
	/* run of blanks, tabs, carriage returns and newlines */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return SPACE;
}

.	{
	/* any single character no other rule accepted is also reported as SPACE */
	tokenlen = tsearch_yyleng;
	token = tsearch_yytext;
	return SPACE;
}
|
|
|
|
|
|
|
|
%%
|
|
|
|
|
|
|
|
/* clearing after parsing from string */
|
|
|
|
void end_parse() {
|
|
|
|
if (s) { free(s); s=NULL; }
|
|
|
|
tsearch_yy_delete_buffer( buf );
|
|
|
|
buf = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* start parse from string */
|
|
|
|
void start_parse_str(char* str, int limit) {
|
|
|
|
if (buf) end_parse();
|
|
|
|
buf = tsearch_yy_scan_bytes( str, limit );
|
|
|
|
tsearch_yy_switch_to_buffer( buf );
|
|
|
|
BEGIN INITIAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
|