postgresql/contrib/tsearch2/wparser_def.c

395 lines
8.4 KiB
C
Raw Normal View History

2003-08-04 08:43:34 +08:00
/*
 * default word parser
 * Teodor Sigaev <teodor@sigaev.ru>
 */
#include "postgres.h"
2003-07-21 18:27:44 +08:00
#include "utils/builtins.h"
#include "dict.h"
#include "wparser.h"
#include "common.h"
#include "ts_cfg.h"
#include "wordparser/parser.h"
#include "wordparser/deflex.h"
PG_FUNCTION_INFO_V1(prsd_lextype);
Datum		prsd_lextype(PG_FUNCTION_ARGS);

/*
 * prsd_lextype
 *
 * Build the table describing the lexeme types of the default parser.
 *
 * Returns a palloc'd array of LASTNUM + 1 LexDescr entries.  Entry i - 1
 * describes token id i (1 .. LASTNUM): its id plus pstrdup'd copies of
 * tok_alias[i] and lex_descr[i].  The last entry only has lexid set to 0;
 * presumably callers use it as the end-of-list marker (confirm against the
 * caller).
 */
Datum
prsd_lextype(PG_FUNCTION_ARGS)
{
	LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
	int			i;

	/* token ids are 1-based, array slots are 0-based */
	for (i = 1; i <= LASTNUM; i++)
	{
		descr[i - 1].lexid = i;
		descr[i - 1].alias = pstrdup(tok_alias[i]);
		descr[i - 1].descr = pstrdup(lex_descr[i]);
	}

	/* trailing entry: alias/descr are intentionally left unset */
	descr[LASTNUM].lexid = 0;

	PG_RETURN_POINTER(descr);
}
PG_FUNCTION_INFO_V1(prsd_start);
Datum		prsd_start(PG_FUNCTION_ARGS);

/*
 * prsd_start
 *
 * Create a parser state by calling TParserInit.
 *
 * Argument 0 is forwarded as a char pointer and argument 1 as an int32
 * (presumably the text to parse and its length -- confirm against
 * TParserInit).  Returns whatever TParserInit returns, as a pointer Datum.
 */
Datum
prsd_start(PG_FUNCTION_ARGS)
{
	PG_RETURN_POINTER(TParserInit((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1)));
}
PG_FUNCTION_INFO_V1(prsd_getlexeme);
Datum		prsd_getlexeme(PG_FUNCTION_ARGS);

/*
 * prsd_getlexeme
 *
 * Advance the parser (argument 0, a TParser *) by one lexeme.
 *
 * Arguments 1 and 2 are output pointers (char ** and int *).  When
 * TParserGet reports a lexeme, they receive p->lexeme and p->lenbytelexeme
 * and the function returns p->type.  When TParserGet returns false the
 * outputs are left untouched and 0 is returned.
 */
Datum
prsd_getlexeme(PG_FUNCTION_ARGS)
{
	TParser    *p = (TParser *) PG_GETARG_POINTER(0);
	char	  **t = (char **) PG_GETARG_POINTER(1);
	int		   *tlen = (int *) PG_GETARG_POINTER(2);

	if (!TParserGet(p))
		PG_RETURN_INT32(0);

	/* hand out the parser's own buffer; no copy is made here */
	*t = p->lexeme;
	*tlen = p->lenbytelexeme;

	PG_RETURN_INT32(p->type);
}
PG_FUNCTION_INFO_V1(prsd_end);
Datum		prsd_end(PG_FUNCTION_ARGS);

/*
 * prsd_end
 *
 * Dispose of the parser state passed as argument 0 (a TParser *) by
 * handing it to TParserClose.  Returns void.
 */
Datum
prsd_end(PG_FUNCTION_ARGS)
{
	TParser    *p = (TParser *) PG_GETARG_POINTER(0);

	TParserClose(p);
	PG_RETURN_VOID();
}
/*
 * Token-class predicates over numeric lexeme type ids.
 *
 * NOTE(review): the ids are bare numbers; they presumably correspond to the
 * token types declared in wordparser/deflex.h -- confirm before changing.
 *
 * HLIDREPLACE / HLIDSKIP select the words that prsd_headline flags as
 * "replace" / "skip"; HTMLHLIDIGNORE is used instead of them when
 * HighlightAll is on.  NONWORDTOKEN marks tokens that do not count towards
 * the headline length, and NOENDTOKEN marks tokens that are not accepted as
 * the last word of a headline fragment.
 */
#define LEAVETOKEN(x) ( (x)==12 )
#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define ENDPUNCTOKEN(x) ( (x)==12 )

#define TS_IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
#define HLIDREPLACE(x) ( (x)==13 )
#define HLIDSKIP(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
#define NONWORDTOKEN(x) ( (x)==12 || HLIDREPLACE(x) || HLIDSKIP(x) )
#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || TS_IDIGNORE(x) )
/*
 * Window over an array of headline words, handed to checkcondition_HL
 * through the opaque "checkval" pointer.
 */
typedef struct
{
	HLWORD	   *words;			/* first word of the window */
	int			len;			/* number of words in the window */
} hlCheck;
static bool
2003-08-04 08:43:34 +08:00
checkcondition_HL(void *checkval, ITEM * val)
{
int i;
for (i = 0; i < ((hlCheck *) checkval)->len; i++)
{
if (((hlCheck *) checkval)->words[i].item == val)
2003-07-21 18:27:44 +08:00
return true;
}
return false;
}
static bool
2003-08-04 08:43:34 +08:00
hlCover(HLPRSTEXT * prs, QUERYTYPE * query, int *p, int *q)
{
int i,
j;
ITEM *item = GETQUERY(query);
int pos = *p;
*q = -1;
2003-08-04 08:43:34 +08:00
*p = 0x7fffffff;
for (j = 0; j < query->size; j++)
{
if (item->type != VAL)
{
2003-07-21 18:27:44 +08:00
item++;
continue;
}
2003-08-04 08:43:34 +08:00
for (i = pos; i < prs->curwords; i++)
{
if (prs->words[i].item == item)
{
if (i > *q)
2003-07-21 18:27:44 +08:00
*q = i;
break;
}
}
item++;
}
if (*q < 0)
2003-07-21 18:27:44 +08:00
return false;
2003-08-04 08:43:34 +08:00
item = GETQUERY(query);
for (j = 0; j < query->size; j++)
{
if (item->type != VAL)
{
2003-07-21 18:27:44 +08:00
item++;
continue;
}
2003-08-04 08:43:34 +08:00
for (i = *q; i >= pos; i--)
{
if (prs->words[i].item == item)
{
if (i < *p)
*p = i;
2003-07-21 18:27:44 +08:00
break;
}
}
item++;
2003-08-04 08:43:34 +08:00
}
if (*p <= *q)
{
hlCheck ch;
2003-07-21 18:27:44 +08:00
ch.words = &(prs->words[*p]);
ch.len = *q - *p + 1;
2003-08-04 08:43:34 +08:00
if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
2003-07-21 18:27:44 +08:00
return true;
2003-08-04 08:43:34 +08:00
else
{
2003-07-21 18:27:44 +08:00
(*p)++;
2003-08-04 08:43:34 +08:00
return hlCover(prs, query, p, q);
2003-07-21 18:27:44 +08:00
}
}
return false;
}
PG_FUNCTION_INFO_V1(prsd_headline);
2003-08-04 08:43:34 +08:00
Datum prsd_headline(PG_FUNCTION_ARGS);
Datum
prsd_headline(PG_FUNCTION_ARGS)
{
HLPRSTEXT *prs = (HLPRSTEXT *) PG_GETARG_POINTER(0);
text *opt = (text *) PG_GETARG_POINTER(1); /* can't be toasted */
QUERYTYPE *query = (QUERYTYPE *) PG_GETARG_POINTER(2); /* can't be toasted */
2003-07-21 18:27:44 +08:00
/* from opt + start and and tag */
2003-08-04 08:43:34 +08:00
int min_words = 15;
int max_words = 35;
int shortword = 3;
int p = 0,
q = 0;
int bestb = -1,
beste = -1;
int bestlen = -1;
2004-08-29 13:07:03 +08:00
int pose = 0,
posb,
2003-08-04 08:43:34 +08:00
poslen,
curlen;
int i;
2004-08-29 13:07:03 +08:00
int highlight = 0;
2003-08-04 08:43:34 +08:00
/* config */
prs->startsel = NULL;
prs->stopsel = NULL;
if (opt)
{
Map *map,
*mptr;
parse_cfgdict(opt, &map);
mptr = map;
while (mptr && mptr->key)
{
if (pg_strcasecmp(mptr->key, "MaxWords") == 0)
2003-08-04 08:43:34 +08:00
max_words = pg_atoi(mptr->value, 4, 1);
else if (pg_strcasecmp(mptr->key, "MinWords") == 0)
2003-08-04 08:43:34 +08:00
min_words = pg_atoi(mptr->value, 4, 1);
else if (pg_strcasecmp(mptr->key, "ShortWord") == 0)
2003-08-04 08:43:34 +08:00
shortword = pg_atoi(mptr->value, 4, 1);
else if (pg_strcasecmp(mptr->key, "StartSel") == 0)
2003-08-04 08:43:34 +08:00
prs->startsel = pstrdup(mptr->value);
else if (pg_strcasecmp(mptr->key, "StopSel") == 0)
2003-08-04 08:43:34 +08:00
prs->stopsel = pstrdup(mptr->value);
else if (pg_strcasecmp(mptr->key, "HighlightAll") == 0)
highlight = (
2004-08-29 13:07:03 +08:00
pg_strcasecmp(mptr->value, "1") == 0 ||
pg_strcasecmp(mptr->value, "on") == 0 ||
pg_strcasecmp(mptr->value, "true") == 0 ||
pg_strcasecmp(mptr->value, "t") == 0 ||
pg_strcasecmp(mptr->value, "y") == 0 ||
pg_strcasecmp(mptr->value, "yes") == 0) ?
1 : 0;
2003-08-04 08:43:34 +08:00
2003-07-21 18:27:44 +08:00
pfree(mptr->key);
pfree(mptr->value);
mptr++;
}
pfree(map);
2004-08-29 13:07:03 +08:00
if (highlight == 0)
{
if (min_words >= max_words)
ereport(ERROR,
2004-08-29 13:07:03 +08:00
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
2005-10-15 10:49:52 +08:00
errmsg("MinWords should be less than MaxWords")));
if (min_words <= 0)
ereport(ERROR,
2004-08-29 13:07:03 +08:00
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("MinWords should be positive")));
if (shortword < 0)
ereport(ERROR,
2004-08-29 13:07:03 +08:00
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
errmsg("ShortWord should be >= 0")));
2003-07-21 18:27:44 +08:00
}
}
2003-07-21 18:27:44 +08:00
2004-08-29 13:07:03 +08:00
if (highlight == 0)
{
while (hlCover(prs, query, &p, &q))
2003-08-04 08:43:34 +08:00
{
/* find cover len in words */
curlen = 0;
poslen = 0;
for (i = p; i <= q && curlen < max_words; i++)
2003-08-04 08:43:34 +08:00
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
pose = i;
}
2004-08-29 13:07:03 +08:00
if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
{
/* best already finded, so try one more cover */
p++;
continue;
}
2004-08-29 13:07:03 +08:00
posb = p;
if (curlen < max_words)
2004-08-29 13:07:03 +08:00
{ /* find good end */
for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
2003-08-04 08:43:34 +08:00
{
if (i != q)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
}
pose = i;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
if (curlen >= min_words)
break;
}
2004-08-29 13:07:03 +08:00
if (curlen < min_words && i >= prs->curwords)
{ /* got end of text and our cover is shoter
* than min_words */
for (i = p - 1 ; i >= 0; i--)
2004-08-29 13:07:03 +08:00
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
if (prs->words[i].item && !prs->words[i].repeated)
poslen++;
if ( curlen >= max_words )
break;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
if (curlen >= min_words)
break;
}
2004-08-29 13:07:03 +08:00
posb = (i >= 0) ? i : 0;
2003-07-21 18:27:44 +08:00
}
}
else
2004-08-29 13:07:03 +08:00
{ /* shorter cover :((( */
for (; curlen > min_words; i--)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen--;
if (prs->words[i].item && !prs->words[i].repeated)
poslen--;
pose = i;
if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
continue;
break;
}
}
2004-08-29 13:07:03 +08:00
if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
(bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
2003-08-04 08:43:34 +08:00
{
bestb = posb;
beste = pose;
bestlen = poslen;
2003-07-21 18:27:44 +08:00
}
2004-08-29 13:07:03 +08:00
p++;
2003-07-21 18:27:44 +08:00
}
if (bestlen < 0)
2003-08-04 08:43:34 +08:00
{
curlen = 0;
for (i = 0; i < prs->curwords && curlen < min_words; i++)
{
if (!NONWORDTOKEN(prs->words[i].type))
curlen++;
pose = i;
}
bestb = 0;
2003-08-04 08:43:34 +08:00
beste = pose;
}
2004-08-29 13:07:03 +08:00
}
else
{
bestb = 0;
beste = prs->curwords - 1;
2003-07-21 18:27:44 +08:00
}
2003-08-04 08:43:34 +08:00
for (i = bestb; i <= beste; i++)
{
if (prs->words[i].item)
prs->words[i].selected = 1;
2004-08-29 13:07:03 +08:00
if (highlight == 0)
{
if (HLIDREPLACE(prs->words[i].type))
prs->words[i].replace = 1;
else if (HLIDSKIP(prs->words[i].type))
prs->words[i].skip = 1;
2004-08-29 13:07:03 +08:00
}
else
{
if (HTMLHLIDIGNORE(prs->words[i].type))
prs->words[i].skip = 1;
}
2003-07-21 18:27:44 +08:00
prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
2003-07-21 18:27:44 +08:00
}
if (!prs->startsel)
2003-08-04 08:43:34 +08:00
prs->startsel = pstrdup("<b>");
2003-07-21 18:27:44 +08:00
if (!prs->stopsel)
2003-08-04 08:43:34 +08:00
prs->stopsel = pstrdup("</b>");
prs->startsellen = strlen(prs->startsel);
prs->stopsellen = strlen(prs->stopsel);
2003-07-21 18:27:44 +08:00
PG_RETURN_POINTER(prs);
}