postgresql/contrib/tsearch2/ts_stat.c

560 lines
12 KiB
C
Raw Normal View History

2003-07-21 18:27:44 +08:00
/*
* stat functions
*/
#include "tsvector.h"
#include "ts_stat.h"
#include "funcapi.h"
#include "catalog/pg_type.h"
#include "executor/spi.h"
#include "common.h"
PG_FUNCTION_INFO_V1(tsstat_in);
2003-08-04 08:43:34 +08:00
Datum tsstat_in(PG_FUNCTION_ARGS);
Datum
tsstat_in(PG_FUNCTION_ARGS)
{
tsstat *stat = palloc(STATHDRSIZE);
2004-08-29 13:07:03 +08:00
2003-08-04 08:43:34 +08:00
stat->len = STATHDRSIZE;
stat->size = 0;
stat->weight = 0;
2003-07-21 18:27:44 +08:00
PG_RETURN_POINTER(stat);
}
PG_FUNCTION_INFO_V1(tsstat_out);
Datum		tsstat_out(PG_FUNCTION_ARGS);

/*
 * tsstat_out
 *		Placeholder that refuses to run.
 *
 * Every call reports ERRCODE_FEATURE_NOT_SUPPORTED at ERROR level.  The
 * PG_RETURN_NULL() after the report only gives the function a return
 * statement on every path.
 */
Datum
tsstat_out(PG_FUNCTION_ARGS)
{
	ereport(ERROR,
			(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
			 errmsg("tsstat_out not implemented")));

	PG_RETURN_NULL();
}
static int
2004-08-29 13:07:03 +08:00
check_weight(tsvector * txt, WordEntry * wptr, int8 weight)
{
int len = POSDATALEN(txt, wptr);
int num = 0;
WordEntryPos *ptr = POSDATAPTR(txt, wptr);
2004-08-29 13:07:03 +08:00
while (len--)
{
if (weight & (1 << WEP_GETWEIGHT(*ptr)))
num++;
ptr++;
}
return num;
}
2003-08-04 08:43:34 +08:00
/*
 * SEI_realloc
 *		Grow (or create) an array of WordEntry pointers.
 *
 * in:	current array, or NULL if none has been allocated yet
 * len:	in/out capacity in elements
 *
 * With no array, or a recorded capacity of 0, a new 8-element array is
 * palloc'd.  Otherwise the capacity is doubled and the array repalloc'd.
 * *len is updated to the new capacity and the (possibly moved) array is
 * returned.
 */
static WordEntry **
SEI_realloc(WordEntry ** in, uint32 *len)
{
	if (in != NULL && *len != 0)
	{
		*len *= 2;
		return repalloc(in, sizeof(WordEntry *) * (*len));
	}

	*len = 8;
	return palloc(sizeof(WordEntry *) * (*len));
}
static int
2003-08-04 08:43:34 +08:00
compareStatWord(StatEntry * a, WordEntry * b, tsstat * stat, tsvector * txt)
{
if (a->len == b->len)
2003-07-21 18:27:44 +08:00
return strncmp(
2003-08-04 08:43:34 +08:00
STATSTRPTR(stat) + a->pos,
STRPTR(txt) + b->pos,
a->len
);
2003-08-04 08:43:34 +08:00
return (a->len > b->len) ? 1 : -1;
2003-07-21 18:27:44 +08:00
}
2003-08-04 08:43:34 +08:00
static tsstat *
formstat(tsstat * stat, tsvector * txt, WordEntry ** entry, uint32 len)
{
tsstat *newstat;
uint32 totallen,
nentry;
uint32 slen = 0;
WordEntry **ptr = entry;
char *curptr;
StatEntry *sptr,
*nptr;
while (ptr - entry < len)
{
2003-07-21 18:27:44 +08:00
slen += (*ptr)->len;
ptr++;
}
2003-08-04 08:43:34 +08:00
nentry = stat->size + len;
slen += STATSTRSIZE(stat);
totallen = CALCSTATSIZE(nentry, slen);
newstat = palloc(totallen);
newstat->len = totallen;
newstat->weight = stat->weight;
2003-08-04 08:43:34 +08:00
newstat->size = nentry;
2003-07-21 18:27:44 +08:00
memcpy(STATSTRPTR(newstat), STATSTRPTR(stat), STATSTRSIZE(stat));
2003-08-04 08:43:34 +08:00
curptr = STATSTRPTR(newstat) + STATSTRSIZE(stat);
2003-07-21 18:27:44 +08:00
2003-08-04 08:43:34 +08:00
ptr = entry;
sptr = STATPTR(stat);
nptr = STATPTR(newstat);
2003-07-21 18:27:44 +08:00
2003-08-04 08:43:34 +08:00
if (len == 1)
{
StatEntry *StopLow = STATPTR(stat);
StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat);
2003-07-21 18:27:44 +08:00
2003-08-04 08:43:34 +08:00
while (StopLow < StopHigh)
{
sptr = StopLow + (StopHigh - StopLow) / 2;
if (compareStatWord(sptr, *ptr, stat, txt) < 0)
2003-07-21 18:27:44 +08:00
StopLow = sptr + 1;
else
2003-08-04 08:43:34 +08:00
StopHigh = sptr;
2003-07-21 18:27:44 +08:00
}
2003-08-04 08:43:34 +08:00
nptr = STATPTR(newstat) + (StopLow - STATPTR(stat));
memcpy(STATPTR(newstat), STATPTR(stat), sizeof(StatEntry) * (StopLow - STATPTR(stat)));
2004-08-29 13:07:03 +08:00
if ((*ptr)->haspos)
nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
else
2003-08-04 08:43:34 +08:00
nptr->nentry = 1;
nptr->ndoc = 1;
nptr->len = (*ptr)->len;
2003-07-21 18:27:44 +08:00
memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
nptr->pos = curptr - STATSTRPTR(newstat);
2003-08-04 08:43:34 +08:00
memcpy(nptr + 1, StopLow, sizeof(StatEntry) * (((StatEntry *) STATSTRPTR(stat)) - StopLow));
}
else
{
while (sptr - STATPTR(stat) < stat->size && ptr - entry < len)
{
if (compareStatWord(sptr, *ptr, stat, txt) < 0)
{
2003-07-21 18:27:44 +08:00
memcpy(nptr, sptr, sizeof(StatEntry));
sptr++;
2003-08-04 08:43:34 +08:00
}
else
{
2004-08-29 13:07:03 +08:00
if ((*ptr)->haspos)
nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
else
2003-08-04 08:43:34 +08:00
nptr->nentry = 1;
nptr->ndoc = 1;
nptr->len = (*ptr)->len;
2003-07-21 18:27:44 +08:00
memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
nptr->pos = curptr - STATSTRPTR(newstat);
curptr += nptr->len;
ptr++;
}
nptr++;
}
2003-08-04 08:43:34 +08:00
memcpy(nptr, sptr, sizeof(StatEntry) * (stat->size - (sptr - STATPTR(stat))));
while (ptr - entry < len)
{
2004-08-29 13:07:03 +08:00
if ((*ptr)->haspos)
nptr->nentry = (stat->weight) ? check_weight(txt, *ptr, stat->weight) : POSDATALEN(txt, *ptr);
else
2003-08-04 08:43:34 +08:00
nptr->nentry = 1;
nptr->ndoc = 1;
nptr->len = (*ptr)->len;
2003-07-21 18:27:44 +08:00
memcpy(curptr, STRPTR(txt) + (*ptr)->pos, nptr->len);
nptr->pos = curptr - STATSTRPTR(newstat);
curptr += nptr->len;
2003-08-04 08:43:34 +08:00
ptr++;
nptr++;
2003-07-21 18:27:44 +08:00
}
}
return newstat;
2003-08-04 08:43:34 +08:00
}
2003-07-21 18:27:44 +08:00
PG_FUNCTION_INFO_V1(ts_accum);
2003-08-04 08:43:34 +08:00
Datum ts_accum(PG_FUNCTION_ARGS);
Datum
ts_accum(PG_FUNCTION_ARGS)
{
tsstat *newstat,
*stat = (tsstat *) PG_GETARG_POINTER(0);
tsvector *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
WordEntry **newentry = NULL;
uint32 len = 0,
cur = 0;
StatEntry *sptr;
WordEntry *wptr;
2004-08-29 13:07:03 +08:00
int n = 0;
2003-08-04 08:43:34 +08:00
if (stat == NULL || PG_ARGISNULL(0))
{ /* Init in first */
stat = palloc(STATHDRSIZE);
stat->len = STATHDRSIZE;
stat->size = 0;
stat->weight = 0;
2003-07-21 18:27:44 +08:00
}
/* simple check of correctness */
2003-08-04 08:43:34 +08:00
if (txt == NULL || PG_ARGISNULL(1) || txt->size == 0)
{
PG_FREE_IF_COPY(txt, 1);
2003-07-21 18:27:44 +08:00
PG_RETURN_POINTER(stat);
}
2003-08-04 08:43:34 +08:00
sptr = STATPTR(stat);
wptr = ARRPTR(txt);
2003-07-21 18:27:44 +08:00
2003-08-04 08:43:34 +08:00
if (stat->size < 100 * txt->size)
{ /* merge */
while (sptr - STATPTR(stat) < stat->size && wptr - ARRPTR(txt) < txt->size)
{
int cmp = compareStatWord(sptr, wptr, stat, txt);
if (cmp < 0)
2003-07-21 18:27:44 +08:00
sptr++;
2003-08-04 08:43:34 +08:00
else if (cmp == 0)
{
2004-08-29 13:07:03 +08:00
if (stat->weight == 0)
{
sptr->ndoc++;
sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
2004-08-29 13:07:03 +08:00
}
else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0)
{
sptr->ndoc++;
sptr->nentry += n;
}
2003-08-04 08:43:34 +08:00
sptr++;
wptr++;
}
else
{
2004-08-29 13:07:03 +08:00
if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
{
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
}
2003-08-04 08:43:34 +08:00
wptr++;
2003-07-21 18:27:44 +08:00
}
}
2003-08-04 08:43:34 +08:00
while (wptr - ARRPTR(txt) < txt->size)
{
2004-08-29 13:07:03 +08:00
if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
{
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
}
2003-08-04 08:43:34 +08:00
wptr++;
2003-07-21 18:27:44 +08:00
}
2003-08-04 08:43:34 +08:00
}
else
{ /* search */
while (wptr - ARRPTR(txt) < txt->size)
{
StatEntry *StopLow = STATPTR(stat);
StatEntry *StopHigh = (StatEntry *) STATSTRPTR(stat);
int cmp;
while (StopLow < StopHigh)
{
sptr = StopLow + (StopHigh - StopLow) / 2;
cmp = compareStatWord(sptr, wptr, stat, txt);
if (cmp == 0)
{
2004-08-29 13:07:03 +08:00
if (stat->weight == 0)
{
sptr->ndoc++;
sptr->nentry += (wptr->haspos) ? POSDATALEN(txt, wptr) : 1;
2004-08-29 13:07:03 +08:00
}
else if (wptr->haspos && (n = check_weight(txt, wptr, stat->weight)) != 0)
{
sptr->ndoc++;
sptr->nentry += n;
}
2003-07-21 18:27:44 +08:00
break;
2003-08-04 08:43:34 +08:00
}
else if (cmp < 0)
2003-07-21 18:27:44 +08:00
StopLow = sptr + 1;
else
2003-08-04 08:43:34 +08:00
StopHigh = sptr;
2003-07-21 18:27:44 +08:00
}
2003-08-04 08:43:34 +08:00
if (StopLow >= StopHigh)
{ /* not found */
2004-08-29 13:07:03 +08:00
if (stat->weight == 0 || check_weight(txt, wptr, stat->weight) != 0)
{
if (cur == len)
newentry = SEI_realloc(newentry, &len);
newentry[cur] = wptr;
cur++;
}
2003-07-21 18:27:44 +08:00
}
wptr++;
2003-08-04 08:43:34 +08:00
}
2003-07-21 18:27:44 +08:00
}
2003-08-04 08:43:34 +08:00
if (cur == 0)
{ /* no new words */
PG_FREE_IF_COPY(txt, 1);
2003-07-21 18:27:44 +08:00
PG_RETURN_POINTER(stat);
}
newstat = formstat(stat, txt, newentry, cur);
pfree(newentry);
2003-08-04 08:43:34 +08:00
PG_FREE_IF_COPY(txt, 1);
2003-07-21 18:27:44 +08:00
/* pfree(stat); */
PG_RETURN_POINTER(newstat);
}
2003-08-04 08:43:34 +08:00
typedef struct
{
uint32 cur;
tsvector *stat;
} StatStorage;
2003-07-21 18:27:44 +08:00
/*
 * ts_setup_firstcall
 *		Prepare a FuncCallContext for returning the entries of "stat".
 *
 * Working inside funcctx->multi_call_memory_ctx, this copies "stat"
 * (stat->len bytes) into a new StatStorage with its cursor at 0, stores that
 * in funcctx->user_fctx, and sets funcctx->attinmeta from a copy of the
 * function's result tuple descriptor.  Raises an error if the result type is
 * not composite.  The caller's memory context is restored before returning.
 */
static void
ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx,
				   tsstat * stat)
{
	MemoryContext callerctx;
	StatStorage *storage;
	TupleDesc	rowdesc;

	callerctx = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

	storage = palloc(sizeof(StatStorage));
	storage->cur = 0;
	storage->stat = palloc(stat->len);
	memcpy(storage->stat, stat, stat->len);
	funcctx->user_fctx = (void *) storage;

	if (get_call_result_type(fcinfo, NULL, &rowdesc) != TYPEFUNC_COMPOSITE)
		elog(ERROR, "return type must be a row type");

	rowdesc = CreateTupleDescCopy(rowdesc);
	funcctx->attinmeta = TupleDescGetAttInMetadata(rowdesc);

	MemoryContextSwitchTo(callerctx);
}
static Datum
2003-08-04 08:43:34 +08:00
ts_process_call(FuncCallContext *funcctx)
{
StatStorage *st;
st = (StatStorage *) funcctx->user_fctx;
if (st->cur < st->stat->size)
{
Datum result;
char *values[3];
char ndoc[16];
char nentry[16];
StatEntry *entry = STATPTR(st->stat) + st->cur;
HeapTuple tuple;
values[1] = ndoc;
sprintf(ndoc, "%d", entry->ndoc);
values[2] = nentry;
sprintf(nentry, "%d", entry->nentry);
values[0] = palloc(entry->len + 1);
memcpy(values[0], STATSTRPTR(st->stat) + entry->pos, entry->len);
(values[0])[entry->len] = '\0';
2003-07-21 18:27:44 +08:00
tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
result = HeapTupleGetDatum(tuple);
2003-07-21 18:27:44 +08:00
pfree(values[0]);
st->cur++;
2003-08-04 08:43:34 +08:00
return result;
}
else
{
2003-07-21 18:27:44 +08:00
pfree(st->stat);
pfree(st);
}
2003-08-04 08:43:34 +08:00
return (Datum) 0;
2003-07-21 18:27:44 +08:00
}
PG_FUNCTION_INFO_V1(ts_accum_finish);
2003-08-04 08:43:34 +08:00
Datum ts_accum_finish(PG_FUNCTION_ARGS);
Datum
ts_accum_finish(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
if (SRF_IS_FIRSTCALL())
{
2003-07-21 18:27:44 +08:00
funcctx = SRF_FIRSTCALL_INIT();
ts_setup_firstcall(fcinfo, funcctx, (tsstat *) PG_GETARG_POINTER(0));
2003-07-21 18:27:44 +08:00
}
funcctx = SRF_PERCALL_SETUP();
2003-08-04 08:43:34 +08:00
if ((result = ts_process_call(funcctx)) != (Datum) 0)
2003-07-21 18:27:44 +08:00
SRF_RETURN_NEXT(funcctx, result);
SRF_RETURN_DONE(funcctx);
}
2003-08-04 08:43:34 +08:00
static Oid tiOid = InvalidOid;
2005-10-16 04:28:59 +08:00
2003-08-04 08:43:34 +08:00
static void
get_ti_Oid(void)
{
int ret;
bool isnull;
2003-07-21 18:27:44 +08:00
2003-08-04 08:43:34 +08:00
if ((ret = SPI_exec("select oid from pg_type where typname='tsvector'", 1)) < 0)
/* internal error */
2003-07-21 18:27:44 +08:00
elog(ERROR, "SPI_exec to get tsvector oid returns %d", ret);
2005-10-16 04:28:59 +08:00
if (SPI_processed < 1)
/* internal error */
2003-07-21 18:27:44 +08:00
elog(ERROR, "There is no tsvector type");
2003-08-04 08:43:34 +08:00
tiOid = DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0], SPI_tuptable->tupdesc, 1, &isnull));
if (tiOid == InvalidOid)
/* internal error */
2003-07-21 18:27:44 +08:00
elog(ERROR, "tsvector type has InvalidOid");
}
2003-08-04 08:43:34 +08:00
/*
 * ts_stat_sql
 *		Run a query and accumulate statistics over the tsvectors it returns.
 *
 * txt:	text of the query to run; it must produce exactly one column, of
 *		the tsvector type (checked against tiOid)
 * ws:	optional weight letters, or NULL.  Each 'a'/'b'/'c'/'d' (in either
 *		case) sets bit 3/2/1/0 of the weight mask; other bytes are ignored.
 *		A mask of 0 means no weight filtering.
 *
 * The query is executed through an SPI cursor and fetched in batches; every
 * non-NULL value is folded in with ts_accum().  The caller must already be
 * connected to SPI.  Returns the accumulated tsstat; errors are raised with
 * elog(ERROR).
 */
static tsstat *
ts_stat_sql(text *txt, text *ws)
{
	const int	batch = 100;	/* rows per SPI_cursor_fetch() */
	char	   *query = text2char(txt);
	int			i;
	tsstat	   *newstat,
			   *stat;
	bool		isnull;
	Portal		portal;
	void	   *plan;

	if (tiOid == InvalidOid)
		get_ti_Oid();

	if ((plan = SPI_prepare(query, 0, NULL)) == NULL)
		/* internal error */
		elog(ERROR, "SPI_prepare('%s') returns NULL", query);

	if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, false)) == NULL)
		/* internal error */
		elog(ERROR, "SPI_cursor_open('%s') returns NULL", query);

	SPI_cursor_fetch(portal, true, batch);

	if (SPI_tuptable->tupdesc->natts != 1)
		/* internal error */
		elog(ERROR, "number of fields doesn't equal to 1");

	if (SPI_gettypeid(SPI_tuptable->tupdesc, 1) != tiOid)
		/* internal error */
		elog(ERROR, "column isn't of tsvector type");

	stat = palloc(STATHDRSIZE);
	stat->len = STATHDRSIZE;
	stat->size = 0;
	stat->weight = 0;

	if (ws)
	{
		char	   *buf = VARDATA(ws);
		char	   *end = buf + (VARSIZE(ws) - VARHDRSZ);

		for (; buf < end; buf++)
		{
			/* <ctype.h> functions require an unsigned char value */
			switch (tolower((unsigned char) *buf))
			{
				case 'a':
					stat->weight |= 1 << 3;
					break;
				case 'b':
					stat->weight |= 1 << 2;
					break;
				case 'c':
					stat->weight |= 1 << 1;
					break;
				case 'd':
					stat->weight |= 1;
					break;
				default:
					/* not a weight letter: ignore */
					break;
			}
		}
	}

	while (SPI_processed > 0)
	{
		for (i = 0; i < SPI_processed; i++)
		{
			Datum		data = SPI_getbinval(SPI_tuptable->vals[i],
											 SPI_tuptable->tupdesc,
											 1, &isnull);

			if (isnull)
				continue;

			newstat = (tsstat *) DatumGetPointer(DirectFunctionCall2(
															   ts_accum,
													  PointerGetDatum(stat),
																   data
																   ));

			/* ts_accum() may hand back the same object, updated in place */
			if (stat != newstat && stat)
				pfree(stat);
			stat = newstat;
		}

		SPI_freetuptable(SPI_tuptable);
		SPI_cursor_fetch(portal, true, batch);
	}

	SPI_freetuptable(SPI_tuptable);
	SPI_cursor_close(portal);
	SPI_freeplan(plan);
	pfree(query);

	return stat;
}
PG_FUNCTION_INFO_V1(ts_stat);
2003-08-04 08:43:34 +08:00
Datum ts_stat(PG_FUNCTION_ARGS);
Datum
ts_stat(PG_FUNCTION_ARGS)
{
FuncCallContext *funcctx;
Datum result;
if (SRF_IS_FIRSTCALL())
{
tsstat *stat;
text *txt = PG_GETARG_TEXT_P(0);
2004-08-29 13:07:03 +08:00
text *ws = (PG_NARGS() > 1) ? PG_GETARG_TEXT_P(1) : NULL;
2003-08-04 08:43:34 +08:00
2003-07-21 18:27:44 +08:00
funcctx = SRF_FIRSTCALL_INIT();
SPI_connect();
2004-08-29 13:07:03 +08:00
stat = ts_stat_sql(txt, ws);
2003-08-04 08:43:34 +08:00
PG_FREE_IF_COPY(txt, 0);
2004-08-29 13:07:03 +08:00
if (PG_NARGS() > 1)
PG_FREE_IF_COPY(ws, 1);
ts_setup_firstcall(fcinfo, funcctx, stat);
2003-07-21 18:27:44 +08:00
SPI_finish();
}
funcctx = SRF_PERCALL_SETUP();
2003-08-04 08:43:34 +08:00
if ((result = ts_process_call(funcctx)) != (Datum) 0)
2003-07-21 18:27:44 +08:00
SRF_RETURN_NEXT(funcctx, result);
SRF_RETURN_DONE(funcctx);
}