2003-08-04 08:43:34 +08:00
|
|
|
/*
|
|
|
|
* default word parser
|
2003-07-21 18:27:44 +08:00
|
|
|
* Teodor Sigaev <teodor@sigaev.ru>
|
|
|
|
*/
|
|
|
|
#include "postgres.h"
|
2005-05-07 01:24:55 +08:00
|
|
|
|
2003-07-21 18:27:44 +08:00
|
|
|
#include "utils/builtins.h"
|
|
|
|
|
|
|
|
#include "dict.h"
|
|
|
|
#include "wparser.h"
|
|
|
|
#include "common.h"
|
|
|
|
#include "ts_cfg.h"
|
|
|
|
#include "wordparser/parser.h"
|
|
|
|
#include "wordparser/deflex.h"
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_lextype);
|
2003-08-04 08:43:34 +08:00
|
|
|
Datum prsd_lextype(PG_FUNCTION_ARGS);
|
|
|
|
|
|
|
|
Datum
|
|
|
|
prsd_lextype(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 1; i <= LASTNUM; i++)
|
|
|
|
{
|
|
|
|
descr[i - 1].lexid = i;
|
|
|
|
descr[i - 1].alias = pstrdup(tok_alias[i]);
|
|
|
|
descr[i - 1].descr = pstrdup(lex_descr[i]);
|
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
descr[LASTNUM].lexid = 0;
|
2003-07-21 18:27:44 +08:00
|
|
|
|
|
|
|
PG_RETURN_POINTER(descr);
|
|
|
|
}
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_start);
|
2003-08-04 08:43:34 +08:00
|
|
|
Datum prsd_start(PG_FUNCTION_ARGS);
|
|
|
|
Datum
|
|
|
|
prsd_start(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2005-11-23 02:17:34 +08:00
|
|
|
PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_getlexeme);
|
2003-08-04 08:43:34 +08:00
|
|
|
Datum prsd_getlexeme(PG_FUNCTION_ARGS);
|
|
|
|
Datum
|
|
|
|
prsd_getlexeme(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2005-11-23 02:17:34 +08:00
|
|
|
TParser *p = (TParser *) PG_GETARG_POINTER(0);
|
2003-08-04 08:43:34 +08:00
|
|
|
char **t = (char **) PG_GETARG_POINTER(1);
|
|
|
|
int *tlen = (int *) PG_GETARG_POINTER(2);
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2005-11-23 02:17:34 +08:00
|
|
|
if (!TParserGet(p))
|
2005-11-21 20:27:57 +08:00
|
|
|
PG_RETURN_INT32(0);
|
|
|
|
|
2005-11-23 02:17:34 +08:00
|
|
|
*t = p->lexeme;
|
2005-11-21 20:27:57 +08:00
|
|
|
*tlen = p->lenbytelexeme;
|
|
|
|
|
|
|
|
PG_RETURN_INT32(p->type);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_end);
|
2003-08-04 08:43:34 +08:00
|
|
|
Datum prsd_end(PG_FUNCTION_ARGS);
|
|
|
|
Datum
|
|
|
|
prsd_end(PG_FUNCTION_ARGS)
|
|
|
|
{
|
2005-11-23 02:17:34 +08:00
|
|
|
TParser *p = (TParser *) PG_GETARG_POINTER(0);
|
|
|
|
|
|
|
|
TParserClose(p);
|
2003-07-21 18:27:44 +08:00
|
|
|
PG_RETURN_VOID();
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Token-type classification used by the headline code.
 *
 * The numbers are lexeme type ids; NOTE(review): presumably the ids
 * declared in wordparser/deflex.h -- confirm before renaming them to
 * symbolic constants.  Every macro may evaluate its argument more than
 * once, so pass only side-effect-free expressions.
 */
#define HTMLHLIDIGNORE(x) \
	( (x) == 5 || (x) == 15 || (x) == 16 || (x) == 17 )

/* same id set as HTMLHLIDIGNORE */
#define COMPLEXTOKEN(x)	HTMLHLIDIGNORE(x)

#define LEAVETOKEN(x)	( (x) == 12 )

/* same id set as LEAVETOKEN */
#define ENDPUNCTOKEN(x)	LEAVETOKEN(x)

#define TS_IDIGNORE(x) \
	( (x) == 12 || (x) == 13 || (x) == 14 || (x) == 23 )

/* HTMLHLIDIGNORE plus id 13 */
#define HLIDIGNORE(x)	( (x) == 13 || HTMLHLIDIGNORE(x) )

/* tokens that do not count as words when measuring a headline */
#define NONWORDTOKEN(x) ( LEAVETOKEN(x) || HLIDIGNORE(x) )

/* tokens a headline fragment should not end on */
#define NOENDTOKEN(x) \
	( NONWORDTOKEN(x) || TS_IDIGNORE(x) || \
	  (x) == 7 || (x) == 8 || (x) == 20 || (x) == 21 || (x) == 22 )
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
typedef struct
|
|
|
|
{
|
|
|
|
HLWORD *words;
|
|
|
|
int len;
|
|
|
|
} hlCheck;
|
2003-07-21 18:27:44 +08:00
|
|
|
|
|
|
|
static bool
|
2003-08-04 08:43:34 +08:00
|
|
|
checkcondition_HL(void *checkval, ITEM * val)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < ((hlCheck *) checkval)->len; i++)
|
|
|
|
{
|
|
|
|
if (((hlCheck *) checkval)->words[i].item == val)
|
2003-07-21 18:27:44 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static bool
|
2003-08-04 08:43:34 +08:00
|
|
|
hlCover(HLPRSTEXT * prs, QUERYTYPE * query, int *p, int *q)
|
|
|
|
{
|
|
|
|
int i,
|
|
|
|
j;
|
|
|
|
ITEM *item = GETQUERY(query);
|
|
|
|
int pos = *p;
|
|
|
|
|
2008-10-18 01:40:14 +08:00
|
|
|
*q = -1;
|
2003-08-04 08:43:34 +08:00
|
|
|
*p = 0x7fffffff;
|
|
|
|
|
|
|
|
for (j = 0; j < query->size; j++)
|
|
|
|
{
|
|
|
|
if (item->type != VAL)
|
|
|
|
{
|
2003-07-21 18:27:44 +08:00
|
|
|
item++;
|
|
|
|
continue;
|
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
for (i = pos; i < prs->curwords; i++)
|
|
|
|
{
|
|
|
|
if (prs->words[i].item == item)
|
|
|
|
{
|
|
|
|
if (i > *q)
|
2003-07-21 18:27:44 +08:00
|
|
|
*q = i;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
item++;
|
|
|
|
}
|
|
|
|
|
2008-10-18 01:40:14 +08:00
|
|
|
if (*q < 0)
|
2003-07-21 18:27:44 +08:00
|
|
|
return false;
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
item = GETQUERY(query);
|
|
|
|
for (j = 0; j < query->size; j++)
|
|
|
|
{
|
|
|
|
if (item->type != VAL)
|
|
|
|
{
|
2003-07-21 18:27:44 +08:00
|
|
|
item++;
|
|
|
|
continue;
|
|
|
|
}
|
2003-08-04 08:43:34 +08:00
|
|
|
for (i = *q; i >= pos; i--)
|
|
|
|
{
|
|
|
|
if (prs->words[i].item == item)
|
|
|
|
{
|
|
|
|
if (i < *p)
|
|
|
|
*p = i;
|
2003-07-21 18:27:44 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
item++;
|
2003-08-04 08:43:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (*p <= *q)
|
|
|
|
{
|
2004-04-02 07:44:38 +08:00
|
|
|
hlCheck ch;
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2004-04-02 07:44:38 +08:00
|
|
|
ch.words = &(prs->words[*p]);
|
|
|
|
ch.len = *q - *p + 1;
|
2003-08-04 08:43:34 +08:00
|
|
|
if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
|
2003-07-21 18:27:44 +08:00
|
|
|
return true;
|
2003-08-04 08:43:34 +08:00
|
|
|
else
|
|
|
|
{
|
2003-07-21 18:27:44 +08:00
|
|
|
(*p)++;
|
2003-08-04 08:43:34 +08:00
|
|
|
return hlCover(prs, query, p, q);
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
PG_FUNCTION_INFO_V1(prsd_headline);
|
2003-08-04 08:43:34 +08:00
|
|
|
Datum prsd_headline(PG_FUNCTION_ARGS);
|
|
|
|
Datum
|
|
|
|
prsd_headline(PG_FUNCTION_ARGS)
|
|
|
|
{
|
|
|
|
HLPRSTEXT *prs = (HLPRSTEXT *) PG_GETARG_POINTER(0);
|
|
|
|
text *opt = (text *) PG_GETARG_POINTER(1); /* can't be toasted */
|
|
|
|
QUERYTYPE *query = (QUERYTYPE *) PG_GETARG_POINTER(2); /* can't be toasted */
|
|
|
|
|
2003-07-21 18:27:44 +08:00
|
|
|
/* from opt + start and and tag */
|
2003-08-04 08:43:34 +08:00
|
|
|
int min_words = 15;
|
|
|
|
int max_words = 35;
|
|
|
|
int shortword = 3;
|
|
|
|
|
|
|
|
int p = 0,
|
|
|
|
q = 0;
|
|
|
|
int bestb = -1,
|
|
|
|
beste = -1;
|
|
|
|
int bestlen = -1;
|
2004-08-29 13:07:03 +08:00
|
|
|
int pose = 0,
|
|
|
|
posb,
|
2003-08-04 08:43:34 +08:00
|
|
|
poslen,
|
|
|
|
curlen;
|
|
|
|
|
|
|
|
int i;
|
2004-08-29 13:07:03 +08:00
|
|
|
int highlight = 0;
|
2003-08-04 08:43:34 +08:00
|
|
|
|
|
|
|
/* config */
|
|
|
|
prs->startsel = NULL;
|
|
|
|
prs->stopsel = NULL;
|
|
|
|
if (opt)
|
|
|
|
{
|
|
|
|
Map *map,
|
|
|
|
*mptr;
|
|
|
|
|
|
|
|
parse_cfgdict(opt, &map);
|
|
|
|
mptr = map;
|
|
|
|
|
|
|
|
while (mptr && mptr->key)
|
|
|
|
{
|
2004-05-07 08:24:59 +08:00
|
|
|
if (pg_strcasecmp(mptr->key, "MaxWords") == 0)
|
2003-08-04 08:43:34 +08:00
|
|
|
max_words = pg_atoi(mptr->value, 4, 1);
|
2004-05-07 08:24:59 +08:00
|
|
|
else if (pg_strcasecmp(mptr->key, "MinWords") == 0)
|
2003-08-04 08:43:34 +08:00
|
|
|
min_words = pg_atoi(mptr->value, 4, 1);
|
2004-05-07 08:24:59 +08:00
|
|
|
else if (pg_strcasecmp(mptr->key, "ShortWord") == 0)
|
2003-08-04 08:43:34 +08:00
|
|
|
shortword = pg_atoi(mptr->value, 4, 1);
|
2004-05-07 08:24:59 +08:00
|
|
|
else if (pg_strcasecmp(mptr->key, "StartSel") == 0)
|
2003-08-04 08:43:34 +08:00
|
|
|
prs->startsel = pstrdup(mptr->value);
|
2004-05-07 08:24:59 +08:00
|
|
|
else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
|
2003-08-04 08:43:34 +08:00
|
|
|
prs->stopsel = pstrdup(mptr->value);
|
2004-06-29 00:19:09 +08:00
|
|
|
else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0)
|
|
|
|
highlight = (
|
2004-08-29 13:07:03 +08:00
|
|
|
pg_strcasecmp(mptr->value, "1") == 0 ||
|
|
|
|
pg_strcasecmp(mptr->value, "on") == 0 ||
|
|
|
|
pg_strcasecmp(mptr->value, "true") == 0 ||
|
|
|
|
pg_strcasecmp(mptr->value, "t") == 0 ||
|
|
|
|
pg_strcasecmp(mptr->value, "y") == 0 ||
|
|
|
|
pg_strcasecmp(mptr->value, "yes") == 0) ?
|
|
|
|
1 : 0;
|
2003-08-04 08:43:34 +08:00
|
|
|
|
2003-07-21 18:27:44 +08:00
|
|
|
pfree(mptr->key);
|
|
|
|
pfree(mptr->value);
|
|
|
|
|
|
|
|
mptr++;
|
|
|
|
}
|
|
|
|
pfree(map);
|
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (highlight == 0)
|
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
if (min_words >= max_words)
|
|
|
|
ereport(ERROR,
|
2004-08-29 13:07:03 +08:00
|
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
2005-10-15 10:49:52 +08:00
|
|
|
errmsg("MinWords should be less than MaxWords")));
|
2004-06-29 00:19:09 +08:00
|
|
|
if (min_words <= 0)
|
|
|
|
ereport(ERROR,
|
2004-08-29 13:07:03 +08:00
|
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
|
|
errmsg("MinWords should be positive")));
|
2004-06-29 00:19:09 +08:00
|
|
|
if (shortword < 0)
|
|
|
|
ereport(ERROR,
|
2004-08-29 13:07:03 +08:00
|
|
|
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
|
|
|
errmsg("ShortWord should be >= 0")));
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2004-06-29 00:19:09 +08:00
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2004-08-29 13:07:03 +08:00
|
|
|
if (highlight == 0)
|
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
while (hlCover(prs, query, &p, &q))
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
/* find cover len in words */
|
|
|
|
curlen = 0;
|
|
|
|
poslen = 0;
|
|
|
|
for (i = p; i <= q && curlen < max_words; i++)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
|
|
|
curlen++;
|
|
|
|
if (prs->words[i].item && !prs->words[i].repeated)
|
|
|
|
poslen++;
|
|
|
|
pose = i;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2004-06-29 00:19:09 +08:00
|
|
|
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
|
|
|
|
{
|
|
|
|
/* best already finded, so try one more cover */
|
|
|
|
p++;
|
|
|
|
continue;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
|
|
|
posb = p;
|
2004-06-29 00:19:09 +08:00
|
|
|
if (curlen < max_words)
|
2004-08-29 13:07:03 +08:00
|
|
|
{ /* find good end */
|
2004-06-29 00:19:09 +08:00
|
|
|
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
if (i != q)
|
|
|
|
{
|
|
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
|
|
|
curlen++;
|
|
|
|
if (prs->words[i].item && !prs->words[i].repeated)
|
|
|
|
poslen++;
|
|
|
|
}
|
|
|
|
pose = i;
|
|
|
|
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
|
|
|
continue;
|
|
|
|
if (curlen >= min_words)
|
|
|
|
break;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
if (curlen < min_words && i >= prs->curwords)
|
|
|
|
{ /* got end of text and our cover is shoter
|
|
|
|
* than min_words */
|
|
|
|
for (i = p; i >= 0; i--)
|
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
|
|
|
curlen++;
|
|
|
|
if (prs->words[i].item && !prs->words[i].repeated)
|
|
|
|
poslen++;
|
|
|
|
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
|
|
|
continue;
|
|
|
|
if (curlen >= min_words)
|
|
|
|
break;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
posb = (i >= 0) ? i : 0;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
}
|
2004-06-29 00:19:09 +08:00
|
|
|
else
|
2004-08-29 13:07:03 +08:00
|
|
|
{ /* shorter cover :((( */
|
2004-06-29 00:19:09 +08:00
|
|
|
for (; curlen > min_words; i--)
|
|
|
|
{
|
2003-08-13 22:37:55 +08:00
|
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
2004-06-29 00:19:09 +08:00
|
|
|
curlen--;
|
2003-08-13 22:37:55 +08:00
|
|
|
if (prs->words[i].item && !prs->words[i].repeated)
|
2004-06-29 00:19:09 +08:00
|
|
|
poslen--;
|
|
|
|
pose = i;
|
2003-08-13 22:37:55 +08:00
|
|
|
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
|
|
|
|
continue;
|
2004-06-29 00:19:09 +08:00
|
|
|
break;
|
2003-08-13 22:37:55 +08:00
|
|
|
}
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2004-06-29 00:19:09 +08:00
|
|
|
if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
|
|
|
|
(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
|
|
|
|
(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
bestb = posb;
|
|
|
|
beste = pose;
|
|
|
|
bestlen = poslen;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
|
2004-06-29 00:19:09 +08:00
|
|
|
p++;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2004-06-29 00:19:09 +08:00
|
|
|
if (bestlen < 0)
|
2003-08-04 08:43:34 +08:00
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
curlen = 0;
|
|
|
|
for (i = 0; i < prs->curwords && curlen < min_words; i++)
|
|
|
|
{
|
|
|
|
if (!NONWORDTOKEN(prs->words[i].type))
|
|
|
|
curlen++;
|
|
|
|
pose = i;
|
|
|
|
}
|
|
|
|
bestb = 0;
|
2003-08-04 08:43:34 +08:00
|
|
|
beste = pose;
|
|
|
|
}
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
bestb = 0;
|
|
|
|
beste = prs->curwords - 1;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
2003-08-04 08:43:34 +08:00
|
|
|
for (i = bestb; i <= beste; i++)
|
|
|
|
{
|
|
|
|
if (prs->words[i].item)
|
|
|
|
prs->words[i].selected = 1;
|
2004-08-29 13:07:03 +08:00
|
|
|
if (highlight == 0)
|
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
if (HLIDIGNORE(prs->words[i].type))
|
|
|
|
prs->words[i].replace = 1;
|
2004-08-29 13:07:03 +08:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2004-06-29 00:19:09 +08:00
|
|
|
if (HTMLHLIDIGNORE(prs->words[i].type))
|
|
|
|
prs->words[i].replace = 1;
|
|
|
|
}
|
2003-07-21 18:27:44 +08:00
|
|
|
|
2004-06-29 00:19:09 +08:00
|
|
|
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
|
2003-07-21 18:27:44 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!prs->startsel)
|
2003-08-04 08:43:34 +08:00
|
|
|
prs->startsel = pstrdup("<b>");
|
2003-07-21 18:27:44 +08:00
|
|
|
if (!prs->stopsel)
|
2003-08-04 08:43:34 +08:00
|
|
|
prs->stopsel = pstrdup("</b>");
|
|
|
|
prs->startsellen = strlen(prs->startsel);
|
|
|
|
prs->stopsellen = strlen(prs->stopsel);
|
2003-07-21 18:27:44 +08:00
|
|
|
|
|
|
|
PG_RETURN_POINTER(prs);
|
|
|
|
}
|