postgresql/contrib/tsearch2/tsvector.c

1109 lines
25 KiB
C
Raw Normal View History

2003-07-21 18:27:44 +08:00
/*
* In/Out definitions for tsvector type
* Internal structure:
* string of values, array of lexeme positions in the string and their lengths
2003-07-21 18:27:44 +08:00
* Teodor Sigaev <teodor@sigaev.ru>
*/
#include "postgres.h"
2003-07-21 18:27:44 +08:00
#include "access/gist.h"
#include "access/itup.h"
#include "catalog/namespace.h"
2003-07-21 18:27:44 +08:00
#include "commands/trigger.h"
#include "executor/spi.h"
2003-07-21 18:27:44 +08:00
#include "nodes/pg_list.h"
#include "storage/bufpage.h"
#include "utils/builtins.h"
2003-07-21 18:27:44 +08:00
#include "utils/pg_locale.h"
#include "mb/pg_wchar.h"
2003-07-21 18:27:44 +08:00
#include <ctype.h>
2003-07-21 18:27:44 +08:00
#include "tsvector.h"
#include "query.h"
#include "ts_cfg.h"
#include "common.h"
/*
 * fmgr registration records for the SQL-callable functions defined in
 * this file.
 */
PG_FUNCTION_INFO_V1(tsvector_in);
PG_FUNCTION_INFO_V1(tsvector_out);
PG_FUNCTION_INFO_V1(to_tsvector);
PG_FUNCTION_INFO_V1(to_tsvector_current);
PG_FUNCTION_INFO_V1(to_tsvector_name);
PG_FUNCTION_INFO_V1(tsearch2);
PG_FUNCTION_INFO_V1(tsvector_length);

/* Prototypes for the functions registered above. */
Datum		tsvector_in(PG_FUNCTION_ARGS);
Datum		tsvector_out(PG_FUNCTION_ARGS);
Datum		to_tsvector(PG_FUNCTION_ARGS);
Datum		to_tsvector_current(PG_FUNCTION_ARGS);
Datum		to_tsvector_name(PG_FUNCTION_ARGS);
Datum		tsearch2(PG_FUNCTION_ARGS);
Datum		tsvector_length(PG_FUNCTION_ARGS);
/*
* in/out text index type
*/
2003-08-04 08:43:34 +08:00
/*
 * qsort() comparator for WordEntryPos items.
 *
 * Only the WEP_GETPOS() value of each item takes part in the comparison.
 * Returns -1, 0 or 1.
 */
static int
comparePos(const void *a, const void *b)
{
	int			lpos = WEP_GETPOS(*(WordEntryPos *) a);
	int			rpos = WEP_GETPOS(*(WordEntryPos *) b);

	if (lpos > rpos)
		return 1;
	if (lpos < rpos)
		return -1;
	return 0;
}
static int
2003-08-04 08:43:34 +08:00
uniquePos(WordEntryPos * a, int4 l)
{
WordEntryPos *ptr,
*res;
2003-07-21 18:27:44 +08:00
2003-08-04 08:43:34 +08:00
res = a;
if (l == 1)
2003-07-21 18:27:44 +08:00
return l;
qsort((void *) a, l, sizeof(WordEntryPos), comparePos);
ptr = a + 1;
2003-08-04 08:43:34 +08:00
while (ptr - a < l)
{
if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
2003-08-04 08:43:34 +08:00
{
2003-07-21 18:27:44 +08:00
res++;
*res = *ptr;
if (res - a >= MAXNUMPOS - 1 || WEP_GETPOS(*res) == MAXENTRYPOS - 1)
2003-07-21 18:27:44 +08:00
break;
2003-08-04 08:43:34 +08:00
}
else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
2003-07-21 18:27:44 +08:00
ptr++;
}
return res + 1 - a;
}
static int
compareentry(const void *a, const void *b, void *arg)
2003-07-21 18:27:44 +08:00
{
char *BufferStr = (char *) arg;
2003-08-04 08:43:34 +08:00
if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len)
2003-07-21 18:27:44 +08:00
{
return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos],
2003-07-21 18:27:44 +08:00
&BufferStr[((WordEntryIN *) b)->entry.pos],
((WordEntryIN *) a)->entry.len);
}
2003-08-04 08:43:34 +08:00
return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1;
2003-07-21 18:27:44 +08:00
}
static int
uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
{
2003-08-04 08:43:34 +08:00
WordEntryIN *ptr,
2003-07-21 18:27:44 +08:00
*res;
res = a;
2003-08-04 08:43:34 +08:00
if (l == 1)
{
if (a->entry.haspos)
{
*(uint16 *) (a->pos) = uniquePos(&(a->pos[1]), *(uint16 *) (a->pos));
*outbuflen = SHORTALIGN(res->entry.len) + (*(uint16 *) (a->pos) + 1) * sizeof(WordEntryPos);
2003-07-21 18:27:44 +08:00
}
return l;
}
ptr = a + 1;
qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry, (void *) buf);
2003-07-21 18:27:44 +08:00
while (ptr - a < l)
{
if (!(ptr->entry.len == res->entry.len &&
strncmp(&buf[ptr->entry.pos], &buf[res->entry.pos], res->entry.len) == 0))
{
2003-08-04 08:43:34 +08:00
if (res->entry.haspos)
{
*(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos));
*outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos);
2003-07-21 18:27:44 +08:00
}
*outbuflen += SHORTALIGN(res->entry.len);
res++;
2003-08-04 08:43:34 +08:00
memcpy(res, ptr, sizeof(WordEntryIN));
}
else if (ptr->entry.haspos)
{
if (res->entry.haspos)
{
int4 len = *(uint16 *) (ptr->pos) + 1 + *(uint16 *) (res->pos);
res->pos = (WordEntryPos *) repalloc(res->pos, len * sizeof(WordEntryPos));
memcpy(&(res->pos[*(uint16 *) (res->pos) + 1]),
&(ptr->pos[1]), *(uint16 *) (ptr->pos) * sizeof(WordEntryPos));
*(uint16 *) (res->pos) += *(uint16 *) (ptr->pos);
pfree(ptr->pos);
}
else
{
res->entry.haspos = 1;
2003-07-21 18:27:44 +08:00
res->pos = ptr->pos;
}
}
ptr++;
}
2003-08-04 08:43:34 +08:00
if (res->entry.haspos)
{
*(uint16 *) (res->pos) = uniquePos(&(res->pos[1]), *(uint16 *) (res->pos));
*outbuflen += *(uint16 *) (res->pos) * sizeof(WordEntryPos);
2003-07-21 18:27:44 +08:00
}
*outbuflen += SHORTALIGN(res->entry.len);
return res + 1 - a;
}
/* States of the gettoken_tsvector() scanner */
#define WAITWORD		1
#define WAITENDWORD		2
#define WAITNEXTCHAR	3
#define WAITENDCMPLX	4
#define WAITPOSINFO		5
#define INPOSINFO		6
#define WAITPOSDELIM	7
#define WAITCHARCMPLX	8

/*
 * Double the word buffer of the in-scope "state" when fewer than one
 * maximum-length character of room is left, keeping curpos at the same
 * offset in the (possibly moved) buffer.
 */
#define RESIZEPRSBUF \
do { \
	int4 clen = state->curpos - state->word; \
	if ( clen + pg_database_encoding_max_length() >= state->len ) \
	{ \
		state->len *= 2; \
		state->word = (char*)repalloc( (void*)state->word, state->len ); \
		state->curpos = state->word + clen; \
	} \
} while (0)
2003-07-21 18:27:44 +08:00
/*
 * Scan the next lexeme from state->prsbuf.
 *
 * The lexeme text is accumulated in state->word (state->curpos points just
 * past it) and is NUL-terminated before returning.  When position info
 * follows the lexeme and state->oprisdelim is false, it is collected into
 * state->pos, whose first slot is used as the item count; state->alen is
 * nonzero only if such an array was allocated during this call.
 *
 * Returns 1 when a lexeme was read and 0 when the input ends before a
 * lexeme starts.  Malformed input is reported with ereport(ERROR).
 */
int4
gettoken_tsvector(TI_IN_STATE * state)
{
	int4		oldstate = 0;

	state->curpos = state->word;
	state->state = WAITWORD;
	state->alen = 0;

	for (;;)
	{
		switch (state->state)
		{
			case WAITWORD:
				/* skipping whitespace, looking for the start of a lexeme */
				if (*(state->prsbuf) == '\0')
					return 0;
				else if (t_iseq(state->prsbuf, '\''))
					state->state = WAITENDCMPLX;
				else if (t_iseq(state->prsbuf, '\\'))
				{
					state->state = WAITNEXTCHAR;
					oldstate = WAITENDWORD;
				}
				else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				else if (!t_isspace(state->prsbuf))
				{
					COPYCHAR(state->curpos, state->prsbuf);
					state->curpos += pg_mblen(state->prsbuf);
					state->state = WAITENDWORD;
				}
				break;

			case WAITNEXTCHAR:
				/* character after a backslash is taken literally */
				if (*(state->prsbuf) == '\0')
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("there is no escaped character")));
				else
				{
					RESIZEPRSBUF;
					COPYCHAR(state->curpos, state->prsbuf);
					state->curpos += pg_mblen(state->prsbuf);
					state->state = oldstate;
				}
				break;

			case WAITENDWORD:
				/* inside an unquoted lexeme */
				if (t_iseq(state->prsbuf, '\\'))
				{
					state->state = WAITNEXTCHAR;
					oldstate = WAITENDWORD;
				}
				else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
						 (state->oprisdelim && ISOPERATOR(state->prsbuf)))
				{
					RESIZEPRSBUF;
					if (state->curpos == state->word)
						ereport(ERROR,
								(errcode(ERRCODE_SYNTAX_ERROR),
								 errmsg("syntax error")));
					*(state->curpos) = '\0';
					return 1;
				}
				else if (t_iseq(state->prsbuf, ':'))
				{
					if (state->curpos == state->word)
						ereport(ERROR,
								(errcode(ERRCODE_SYNTAX_ERROR),
								 errmsg("syntax error")));
					*(state->curpos) = '\0';
					if (state->oprisdelim)
						return 1;
					else
						state->state = INPOSINFO;
				}
				else
				{
					RESIZEPRSBUF;
					COPYCHAR(state->curpos, state->prsbuf);
					state->curpos += pg_mblen(state->prsbuf);
				}
				break;

			case WAITENDCMPLX:
				/* inside a quoted lexeme */
				if (t_iseq(state->prsbuf, '\''))
					state->state = WAITCHARCMPLX;
				else if (t_iseq(state->prsbuf, '\\'))
				{
					state->state = WAITNEXTCHAR;
					oldstate = WAITENDCMPLX;
				}
				else if (*(state->prsbuf) == '\0')
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				else
				{
					RESIZEPRSBUF;
					COPYCHAR(state->curpos, state->prsbuf);
					state->curpos += pg_mblen(state->prsbuf);
				}
				break;

			case WAITCHARCMPLX:
				/* just saw a quote inside a quoted lexeme */
				if (t_iseq(state->prsbuf, '\''))
				{
					/* doubled quote: a literal quote character */
					RESIZEPRSBUF;
					COPYCHAR(state->curpos, state->prsbuf);
					state->curpos += pg_mblen(state->prsbuf);
					state->state = WAITENDCMPLX;
					break;
				}

				/* the previous quote closed the lexeme */
				RESIZEPRSBUF;
				*(state->curpos) = '\0';
				if (state->curpos == state->word)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));
				if (state->oprisdelim)
				{
					/* state->prsbuf+=pg_mblen(state->prsbuf); */
					return 1;
				}
				state->state = WAITPOSINFO;
				continue;		/* recheck current character */

			case WAITPOSINFO:
				if (t_iseq(state->prsbuf, ':'))
					state->state = INPOSINFO;
				else
					return 1;
				break;

			case INPOSINFO:
				/* expecting the first digit of a position */
				if (!t_isdigit(state->prsbuf))
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("syntax error")));

				if (state->alen == 0)
				{
					state->alen = 4;
					state->pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * state->alen);
					*(uint16 *) (state->pos) = 0;
				}
				else if (*(uint16 *) (state->pos) + 1 >= state->alen)
				{
					state->alen *= 2;
					state->pos = (WordEntryPos *) repalloc(state->pos, sizeof(WordEntryPos) * state->alen);
				}

				/* slot 0 holds the count; the new item goes in the next free slot */
				(*(uint16 *) (state->pos))++;
				WEP_SETPOS(state->pos[*(uint16 *) (state->pos)], LIMITPOS(atoi(state->prsbuf)));
				if (WEP_GETPOS(state->pos[*(uint16 *) (state->pos)]) == 0)
					ereport(ERROR,
							(errcode(ERRCODE_SYNTAX_ERROR),
							 errmsg("wrong position info")));
				WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
				state->state = WAITPOSDELIM;
				break;

			case WAITPOSDELIM:
				{
					int			newweight = -1;

					if (t_iseq(state->prsbuf, ','))
					{
						state->state = INPOSINFO;
						break;
					}

					if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
						newweight = 3;
					else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
						newweight = 2;
					else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
						newweight = 1;
					else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
						newweight = 0;

					if (newweight >= 0)
					{
						/* a weight label is rejected if a nonzero weight is already set */
						if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
							ereport(ERROR,
									(errcode(ERRCODE_SYNTAX_ERROR),
									 errmsg("syntax error")));
						WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], newweight);
					}
					else if (t_isspace(state->prsbuf) ||
							 *(state->prsbuf) == '\0')
						return 1;
					else if (!t_isdigit(state->prsbuf))
						ereport(ERROR,
								(errcode(ERRCODE_SYNTAX_ERROR),
								 errmsg("syntax error")));
					/* remaining digits of the current number are simply skipped */
					break;
				}

			default:
				/* internal error */
				elog(ERROR, "internal error");
				break;
		}

		/* get next char */
		state->prsbuf += pg_mblen(state->prsbuf);
	}

	return 0;
}
Datum
tsvector_in(PG_FUNCTION_ARGS)
{
char *buf = PG_GETARG_CSTRING(0);
TI_IN_STATE state;
2003-08-04 08:43:34 +08:00
WordEntryIN *arr;
2003-07-21 18:27:44 +08:00
WordEntry *inarr;
int4 len = 0,
totallen = 64;
2003-08-04 08:43:34 +08:00
tsvector *in;
2003-07-21 18:27:44 +08:00
char *tmpbuf,
*cur;
int4 i,
buflen = 256;
2004-08-29 13:07:03 +08:00
SET_FUNCOID();
2006-10-04 08:30:14 +08:00
pg_verifymbstr(buf, strlen(buf), false);
2003-07-21 18:27:44 +08:00
state.prsbuf = buf;
state.len = 32;
state.word = (char *) palloc(state.len);
state.oprisdelim = false;
arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * totallen);
cur = tmpbuf = (char *) palloc(buflen);
while (gettoken_tsvector(&state))
{
if (len >= totallen)
{
totallen *= 2;
arr = (WordEntryIN *) repalloc((void *) arr, sizeof(WordEntryIN) * totallen);
}
while ((cur - tmpbuf) + (state.curpos - state.word) >= buflen)
{
int4 dist = cur - tmpbuf;
buflen *= 2;
tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
cur = tmpbuf + dist;
}
if (state.curpos - state.word >= MAXSTRLEN)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("word is too long")));
2003-08-04 08:43:34 +08:00
arr[len].entry.len = state.curpos - state.word;
2003-07-21 18:27:44 +08:00
if (cur - tmpbuf > MAXSTRPOS)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("too long value")));
2003-08-04 08:43:34 +08:00
arr[len].entry.pos = cur - tmpbuf;
2003-07-21 18:27:44 +08:00
memcpy((void *) cur, (void *) state.word, arr[len].entry.len);
cur += arr[len].entry.len;
2003-08-04 08:43:34 +08:00
if (state.alen)
{
arr[len].entry.haspos = 1;
2003-07-21 18:27:44 +08:00
arr[len].pos = state.pos;
2003-08-04 08:43:34 +08:00
}
else
arr[len].entry.haspos = 0;
2003-07-21 18:27:44 +08:00
len++;
}
pfree(state.word);
2003-08-04 08:43:34 +08:00
if (len > 0)
2003-07-21 18:27:44 +08:00
len = uniqueentry(arr, len, tmpbuf, &buflen);
else
2004-08-29 13:07:03 +08:00
buflen = 0;
2003-07-21 18:27:44 +08:00
totallen = CALCDATASIZE(len, buflen);
in = (tsvector *) palloc(totallen);
2003-08-04 08:43:34 +08:00
memset(in, 0, totallen);
2003-07-21 18:27:44 +08:00
in->len = totallen;
in->size = len;
cur = STRPTR(in);
inarr = ARRPTR(in);
for (i = 0; i < len; i++)
{
memcpy((void *) cur, (void *) &tmpbuf[arr[i].entry.pos], arr[i].entry.len);
2003-08-04 08:43:34 +08:00
arr[i].entry.pos = cur - STRPTR(in);
2003-07-21 18:27:44 +08:00
cur += SHORTALIGN(arr[i].entry.len);
2003-08-04 08:43:34 +08:00
if (arr[i].entry.haspos)
{
memcpy(cur, arr[i].pos, (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos));
cur += (*(uint16 *) arr[i].pos + 1) * sizeof(WordEntryPos);
pfree(arr[i].pos);
2003-07-21 18:27:44 +08:00
}
2003-08-04 08:43:34 +08:00
memcpy(&(inarr[i]), &(arr[i].entry), sizeof(WordEntry));
2003-07-21 18:27:44 +08:00
}
pfree(tmpbuf);
pfree(arr);
PG_RETURN_POINTER(in);
}
Datum
tsvector_length(PG_FUNCTION_ARGS)
{
2003-08-04 08:43:34 +08:00
tsvector *in = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
2003-07-21 18:27:44 +08:00
int4 ret = in->size;
PG_FREE_IF_COPY(in, 0);
PG_RETURN_INT32(ret);
}
Datum
tsvector_out(PG_FUNCTION_ARGS)
{
2003-08-04 08:43:34 +08:00
tsvector *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
2003-07-21 18:27:44 +08:00
char *outbuf;
int4 i,
2003-08-04 08:43:34 +08:00
lenbuf = 0,
pp;
2003-07-21 18:27:44 +08:00
WordEntry *ptr = ARRPTR(out);
2006-10-04 08:30:14 +08:00
char *curbegin,
*curin,
2003-07-21 18:27:44 +08:00
*curout;
2003-08-04 08:43:34 +08:00
lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
for (i = 0; i < out->size; i++)
{
2006-10-04 08:30:14 +08:00
lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length() /* for escape */ ;
2003-08-04 08:43:34 +08:00
if (ptr[i].haspos)
lenbuf += 7 * POSDATALEN(out, &(ptr[i]));
}
2003-07-21 18:27:44 +08:00
curout = outbuf = (char *) palloc(lenbuf);
for (i = 0; i < out->size; i++)
{
curbegin = curin = STRPTR(out) + ptr->pos;
2003-07-21 18:27:44 +08:00
if (i != 0)
*curout++ = ' ';
*curout++ = '\'';
2006-10-04 08:30:14 +08:00
while (curin - curbegin < ptr->len)
2003-07-21 18:27:44 +08:00
{
2006-10-04 08:30:14 +08:00
int len = pg_mblen(curin);
if (t_iseq(curin, '\''))
2003-07-21 18:27:44 +08:00
{
int4 pos = curout - outbuf;
outbuf = (char *) repalloc((void *) outbuf, ++lenbuf);
curout = outbuf + pos;
*curout++ = '\'';
2003-07-21 18:27:44 +08:00
}
2006-10-04 08:30:14 +08:00
while (len--)
*curout++ = *curin++;
2003-07-21 18:27:44 +08:00
}
*curout++ = '\'';
2003-08-04 08:43:34 +08:00
if ((pp = POSDATALEN(out, ptr)) != 0)
{
2003-07-21 18:27:44 +08:00
WordEntryPos *wptr;
2003-08-04 08:43:34 +08:00
2003-07-21 18:27:44 +08:00
*curout++ = ':';
2003-08-04 08:43:34 +08:00
wptr = POSDATAPTR(out, ptr);
while (pp)
{
sprintf(curout, "%d", WEP_GETPOS(*wptr));
2003-08-04 08:43:34 +08:00
curout = strchr(curout, '\0');
switch (WEP_GETWEIGHT(*wptr))
2003-08-04 08:43:34 +08:00
{
case 3:
*curout++ = 'A';
break;
case 2:
*curout++ = 'B';
break;
case 1:
*curout++ = 'C';
break;
case 0:
default:
break;
2003-07-21 18:27:44 +08:00
}
2003-08-04 08:43:34 +08:00
if (pp > 1)
*curout++ = ',';
pp--;
wptr++;
2003-07-21 18:27:44 +08:00
}
}
ptr++;
}
2003-08-04 08:43:34 +08:00
*curout = '\0';
2003-07-21 18:27:44 +08:00
outbuf[lenbuf - 1] = '\0';
PG_FREE_IF_COPY(out, 0);
PG_RETURN_POINTER(outbuf);
}
static int
compareWORD(const void *a, const void *b)
{
if (((TSWORD *) a)->len == ((TSWORD *) b)->len)
2003-08-04 08:43:34 +08:00
{
int res = strncmp(
((TSWORD *) a)->word,
((TSWORD *) b)->word,
((TSWORD *) b)->len);
2003-08-04 08:43:34 +08:00
if (res == 0)
return (((TSWORD *) a)->pos.pos > ((TSWORD *) b)->pos.pos) ? 1 : -1;
2003-07-21 18:27:44 +08:00
return res;
}
return (((TSWORD *) a)->len > ((TSWORD *) b)->len) ? 1 : -1;
2003-07-21 18:27:44 +08:00
}
/*
 * Give a word a freshly allocated two-slot position array: slot 0 is the
 * number of stored positions (1) and slot 1 is firstpos.
 *
 * NOTE(review): callers compute firstpos from w->pos.pos before calling;
 * pos.pos and pos.apos presumably share storage, so the scalar must be
 * read before apos is assigned -- confirm against the TSWORD definition.
 */
static void
startPosList(TSWORD * w, int firstpos)
{
	w->alen = 2;
	w->pos.apos = (uint16 *) palloc(sizeof(uint16) * w->alen);
	w->pos.apos[0] = 1;
	w->pos.apos[1] = firstpos;
}

/*
 * Sort the words a[0..l-1] and merge equal words in place, turning each
 * surviving word's single position into a position array (see
 * startPosList) that collects the LIMITPOS()-clamped positions of all its
 * occurrences.  The word strings of merged duplicates are pfree'd.
 *
 * Returns the number of distinct words left at the front of the array.
 */
static int
uniqueWORD(TSWORD * a, int4 l)
{
	TSWORD	   *ptr,
			   *res;

	if (l == 1)
	{
		startPosList(a, LIMITPOS(a->pos.pos));
		return l;
	}

	qsort((void *) a, l, sizeof(TSWORD), compareWORD);
	startPosList(a, LIMITPOS(a->pos.pos));

	res = a;
	for (ptr = a + 1; ptr - a < l; ptr++)
	{
		bool		sameword = (ptr->len == res->len &&
								strncmp(ptr->word, res->word, res->len) == 0);

		if (!sameword)
		{
			/* first occurrence of a new word */
			res++;
			res->len = ptr->len;
			res->word = ptr->word;
			startPosList(res, LIMITPOS(ptr->pos.pos));
			continue;
		}

		/* repeated word: drop the string, maybe record the position */
		pfree(ptr->word);
		if (res->pos.apos[0] < MAXNUMPOS - 1 &&
			res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1)
		{
			if (res->pos.apos[0] + 1 >= res->alen)
			{
				res->alen *= 2;
				res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen);
			}
			/* skip a position equal to the last one stored */
			if (res->pos.apos[0] == 0 ||
				res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos))
			{
				res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos);
				res->pos.apos[0]++;
			}
		}
	}

	return res + 1 - a;
}
/*
 * Build a tsvector from the words collected in prs.
 *
 * The words are first de-duplicated with uniqueWORD().  Each word's
 * string and position array, and finally prs->words itself, are pfree'd
 * as they are consumed.  All positions are stored with weight 0.
 */
static tsvector *
makevalue(PRSTEXT * prs)
{
	int4		i,
				j,
				datalen = 0,
				totallen;
	tsvector   *in;
	WordEntry  *entry;
	char	   *str,
			   *cur;

	prs->curwords = uniqueWORD(prs->words, prs->curwords);

	/* first pass: how many data bytes are needed? */
	for (i = 0; i < prs->curwords; i++)
	{
		TSWORD	   *w = &prs->words[i];

		datalen += SHORTALIGN(w->len);
		if (w->alen)
			datalen += sizeof(uint16) + w->pos.apos[0] * sizeof(WordEntryPos);
	}

	totallen = CALCDATASIZE(prs->curwords, datalen);
	in = (tsvector *) palloc(totallen);
	memset(in, 0, totallen);
	in->len = totallen;
	in->size = prs->curwords;

	/* second pass: fill the entry array and the data area */
	entry = ARRPTR(in);
	cur = str = STRPTR(in);
	for (i = 0; i < prs->curwords; i++, entry++)
	{
		TSWORD	   *w = &prs->words[i];

		entry->len = w->len;
		if (cur - str > MAXSTRPOS)
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("value is too big")));
		entry->pos = cur - str;

		memcpy((void *) cur, (void *) w->word, w->len);
		pfree(w->word);
		cur += SHORTALIGN(w->len);

		if (!w->alen)
		{
			entry->haspos = 0;
			continue;
		}

		{
			WordEntryPos *wptr;
			uint16		npos = w->pos.apos[0];

			entry->haspos = 1;
			/* position count goes first, the positions follow it */
			*(uint16 *) cur = npos;
			wptr = POSDATAPTR(in, entry);
			for (j = 0; j < npos; j++)
			{
				WEP_SETWEIGHT(wptr[j], 0);
				WEP_SETPOS(wptr[j], w->pos.apos[j + 1]);
			}
			cur += sizeof(uint16) + w->pos.apos[0] * sizeof(WordEntryPos);
			pfree(w->pos.apos);
		}
	}

	pfree(prs->words);
	return in;
}
Datum
to_tsvector(PG_FUNCTION_ARGS)
{
text *in = PG_GETARG_TEXT_P(1);
PRSTEXT prs;
2003-08-04 08:43:34 +08:00
tsvector *out = NULL;
TSCfgInfo *cfg;
2004-08-29 13:07:03 +08:00
SET_FUNCOID();
cfg = findcfg(PG_GETARG_INT32(0));
2003-07-21 18:27:44 +08:00
prs.lenwords = 32;
prs.curwords = 0;
prs.pos = 0;
prs.words = (TSWORD *) palloc(sizeof(TSWORD) * prs.lenwords);
2003-08-04 08:43:34 +08:00
2003-07-21 18:27:44 +08:00
parsetext_v2(cfg, &prs, VARDATA(in), VARSIZE(in) - VARHDRSZ);
PG_FREE_IF_COPY(in, 1);
if (prs.curwords)
out = makevalue(&prs);
2003-08-04 08:43:34 +08:00
else
{
2003-07-21 18:27:44 +08:00
pfree(prs.words);
2003-08-04 08:43:34 +08:00
out = palloc(CALCDATASIZE(0, 0));
out->len = CALCDATASIZE(0, 0);
2003-07-21 18:27:44 +08:00
out->size = 0;
2003-08-04 08:43:34 +08:00
}
2003-07-21 18:27:44 +08:00
PG_RETURN_POINTER(out);
}
Datum
2003-08-04 08:43:34 +08:00
to_tsvector_name(PG_FUNCTION_ARGS)
{
text *cfg = PG_GETARG_TEXT_P(0);
Datum res;
2004-08-29 13:07:03 +08:00
SET_FUNCOID();
res = DirectFunctionCall3(
2004-08-29 13:07:03 +08:00
to_tsvector,
Int32GetDatum(name2id_cfg(cfg)),
PG_GETARG_DATUM(1),
(Datum) 0
);
2003-08-04 08:43:34 +08:00
PG_FREE_IF_COPY(cfg, 0);
PG_RETURN_DATUM(res);
2003-07-21 18:27:44 +08:00
}
Datum
2003-08-04 08:43:34 +08:00
to_tsvector_current(PG_FUNCTION_ARGS)
{
Datum res;
2004-08-29 13:07:03 +08:00
SET_FUNCOID();
res = DirectFunctionCall3(
2004-08-29 13:07:03 +08:00
to_tsvector,
Int32GetDatum(get_currcfg()),
PG_GETARG_DATUM(0),
(Datum) 0
);
2003-08-04 08:43:34 +08:00
PG_RETURN_DATUM(res);
2003-07-21 18:27:44 +08:00
}
static Oid
2003-08-04 08:43:34 +08:00
findFunc(char *fname)
{
FuncCandidateList clist,
ptr;
Oid funcid = InvalidOid;
List *names = list_make1(makeString(fname));
2003-07-21 18:27:44 +08:00
ptr = clist = FuncnameGetCandidates(names, 1);
list_free(names);
2003-07-21 18:27:44 +08:00
2003-08-04 08:43:34 +08:00
if (!ptr)
2003-07-21 18:27:44 +08:00
return funcid;
2003-08-04 08:43:34 +08:00
while (ptr)
{
if (ptr->args[0] == TEXTOID && funcid == InvalidOid)
funcid = ptr->oid;
clist = ptr->next;
2003-07-21 18:27:44 +08:00
pfree(ptr);
2003-08-04 08:43:34 +08:00
ptr = clist;
2003-07-21 18:27:44 +08:00
}
return funcid;
}
/*
* Trigger
*/
Datum
tsearch2(PG_FUNCTION_ARGS)
{
TriggerData *trigdata;
Trigger *trigger;
Relation rel;
HeapTuple rettuple = NULL;
int numidxattr,
i;
PRSTEXT prs;
Datum datum = (Datum) 0;
2003-08-04 08:43:34 +08:00
Oid funcoid = InvalidOid;
TSCfgInfo *cfg;
2004-08-29 13:07:03 +08:00
SET_FUNCOID();
cfg = findcfg(get_currcfg());
2003-07-21 18:27:44 +08:00
if (!CALLED_AS_TRIGGER(fcinfo))
/* internal error */
2003-07-21 18:27:44 +08:00
elog(ERROR, "TSearch: Not fired by trigger manager");
trigdata = (TriggerData *) fcinfo->context;
if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
/* internal error */
2003-07-21 18:27:44 +08:00
elog(ERROR, "TSearch: Can't process STATEMENT events");
if (TRIGGER_FIRED_AFTER(trigdata->tg_event))
/* internal error */
2003-07-21 18:27:44 +08:00
elog(ERROR, "TSearch: Must be fired BEFORE event");
if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
rettuple = trigdata->tg_trigtuple;
else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
rettuple = trigdata->tg_newtuple;
else
/* internal error */
2003-07-21 18:27:44 +08:00
elog(ERROR, "TSearch: Unknown event");
trigger = trigdata->tg_trigger;
rel = trigdata->tg_relation;
if (trigger->tgnargs < 2)
/* internal error */
2003-07-21 18:27:44 +08:00
elog(ERROR, "TSearch: format tsearch2(tsvector_field, text_field1,...)");
numidxattr = SPI_fnumber(rel->rd_att, trigger->tgargs[0]);
if (numidxattr == SPI_ERROR_NOATTRIBUTE)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("tsvector column \"%s\" does not exist",
trigger->tgargs[0])));
2003-07-21 18:27:44 +08:00
prs.lenwords = 32;
prs.curwords = 0;
prs.pos = 0;
prs.words = (TSWORD *) palloc(sizeof(TSWORD) * prs.lenwords);
2003-07-21 18:27:44 +08:00
/* find all words in indexable column */
for (i = 1; i < trigger->tgnargs; i++)
{
int numattr;
Oid oidtype;
Datum txt_toasted;
bool isnull;
text *txt;
numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]);
if (numattr == SPI_ERROR_NOATTRIBUTE)
{
2003-08-04 08:43:34 +08:00
funcoid = findFunc(trigger->tgargs[i]);
if (funcoid == InvalidOid)
ereport(ERROR,
(errcode(ERRCODE_UNDEFINED_COLUMN),
errmsg("could not find function or field \"%s\"",
trigger->tgargs[i])));
2003-07-21 18:27:44 +08:00
continue;
}
oidtype = SPI_gettypeid(rel->rd_att, numattr);
/* We assume char() and varchar() are binary-equivalent to text */
if (!(oidtype == TEXTOID ||
oidtype == VARCHAROID ||
oidtype == BPCHAROID))
{
elog(WARNING, "TSearch: '%s' is not of character type",
trigger->tgargs[i]);
continue;
}
txt_toasted = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull);
if (isnull)
continue;
2003-08-04 08:43:34 +08:00
if (funcoid != InvalidOid)
{
text *txttmp = (text *) DatumGetPointer(OidFunctionCall1(
2005-10-15 10:49:52 +08:00
funcoid,
PointerGetDatum(txt_toasted)
));
2003-08-04 08:43:34 +08:00
2003-07-21 18:27:44 +08:00
txt = (text *) DatumGetPointer(PG_DETOAST_DATUM(PointerGetDatum(txttmp)));
2003-08-04 08:43:34 +08:00
if (txt == txttmp)
2003-07-21 18:27:44 +08:00
txt_toasted = PointerGetDatum(txt);
2003-08-04 08:43:34 +08:00
}
else
txt = (text *) DatumGetPointer(PG_DETOAST_DATUM(PointerGetDatum(txt_toasted)));
2003-07-21 18:27:44 +08:00
parsetext_v2(cfg, &prs, VARDATA(txt), VARSIZE(txt) - VARHDRSZ);
2003-08-04 08:43:34 +08:00
if (txt != (text *) DatumGetPointer(txt_toasted))
2003-07-21 18:27:44 +08:00
pfree(txt);
}
/* make tsvector value */
if (prs.curwords)
{
datum = PointerGetDatum(makevalue(&prs));
rettuple = SPI_modifytuple(rel, rettuple, 1, &numidxattr,
&datum, NULL);
pfree(DatumGetPointer(datum));
}
else
{
2003-08-04 08:43:34 +08:00
tsvector *out = palloc(CALCDATASIZE(0, 0));
out->len = CALCDATASIZE(0, 0);
2003-07-21 18:27:44 +08:00
out->size = 0;
datum = PointerGetDatum(out);
pfree(prs.words);
rettuple = SPI_modifytuple(rel, rettuple, 1, &numidxattr,
&datum, NULL);
}
if (rettuple == NULL)
/* internal error */
2003-07-21 18:27:44 +08:00
elog(ERROR, "TSearch: %d returned by SPI_modifytuple", SPI_result);
return PointerGetDatum(rettuple);
}
static int
2004-08-29 13:07:03 +08:00
silly_cmp_tsvector(const tsvector * a, const tsvector * b)
{
if (a->len < b->len)
return -1;
2004-08-29 13:07:03 +08:00
else if (a->len > b->len)
return 1;
2004-08-29 13:07:03 +08:00
else if (a->size < b->size)
return -1;
2004-08-29 13:07:03 +08:00
else if (a->size > b->size)
return 1;
2004-08-29 13:07:03 +08:00
else
{
WordEntry *aptr = ARRPTR(a);
WordEntry *bptr = ARRPTR(b);
2006-10-04 08:30:14 +08:00
int i = 0;
int res;
2006-10-04 08:30:14 +08:00
for (i = 0; i < a->size; i++)
{
if (aptr->haspos != bptr->haspos)
{
return (aptr->haspos > bptr->haspos) ? -1 : 1;
}
else if (aptr->len != bptr->len)
{
return (aptr->len > bptr->len) ? -1 : 1;
}
else if ((res = strncmp(STRPTR(a) + aptr->pos, STRPTR(b) + bptr->pos, bptr->len)) != 0)
{
return res;
2006-10-04 08:30:14 +08:00
}
else if (aptr->haspos)
{
WordEntryPos *ap = POSDATAPTR(a, aptr);
WordEntryPos *bp = POSDATAPTR(b, bptr);
int j;
if (POSDATALEN(a, aptr) != POSDATALEN(b, bptr))
return (POSDATALEN(a, aptr) > POSDATALEN(b, bptr)) ? -1 : 1;
for (j = 0; j < POSDATALEN(a, aptr); j++)
{
if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp))
{
return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1;
}
else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp))
{
return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1;
}
ap++, bp++;
}
}
2004-08-29 13:07:03 +08:00
2006-10-04 08:30:14 +08:00
aptr++;
bptr++;
2004-08-29 13:07:03 +08:00
}
}
2004-08-29 13:07:03 +08:00
return 0;
}
/*
 * fmgr registration records and prototypes for the tsvector comparison
 * functions.
 */
PG_FUNCTION_INFO_V1(tsvector_cmp);
PG_FUNCTION_INFO_V1(tsvector_lt);
PG_FUNCTION_INFO_V1(tsvector_le);
PG_FUNCTION_INFO_V1(tsvector_eq);
PG_FUNCTION_INFO_V1(tsvector_ne);
PG_FUNCTION_INFO_V1(tsvector_ge);
PG_FUNCTION_INFO_V1(tsvector_gt);

Datum		tsvector_cmp(PG_FUNCTION_ARGS);
Datum		tsvector_lt(PG_FUNCTION_ARGS);
Datum		tsvector_le(PG_FUNCTION_ARGS);
Datum		tsvector_eq(PG_FUNCTION_ARGS);
Datum		tsvector_ne(PG_FUNCTION_ARGS);
Datum		tsvector_ge(PG_FUNCTION_ARGS);
Datum		tsvector_gt(PG_FUNCTION_ARGS);

/*
 * Common prologue of the comparison functions: detoast both arguments,
 * compare them with silly_cmp_tsvector() into a local "res", and release
 * any detoasted copies.  It declares variables, so it must be the first
 * thing in the function body.
 *
 * The last line of the macro deliberately has no line continuation, so
 * that whatever follows the definition is not spliced into it.
 */
#define RUNCMP \
	tsvector *a = (tsvector *) DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_DATUM(0))); \
	tsvector *b = (tsvector *) DatumGetPointer(PG_DETOAST_DATUM(PG_GETARG_DATUM(1))); \
	int res = silly_cmp_tsvector(a,b); \
	PG_FREE_IF_COPY(a,0); \
	PG_FREE_IF_COPY(b,1);

Datum
2004-08-29 13:07:03 +08:00
tsvector_cmp(PG_FUNCTION_ARGS)
{
RUNCMP
PG_RETURN_INT32(res);
}
Datum
2004-08-29 13:07:03 +08:00
tsvector_lt(PG_FUNCTION_ARGS)
{
RUNCMP
PG_RETURN_BOOL((res < 0) ? true : false);
}
Datum
2004-08-29 13:07:03 +08:00
tsvector_le(PG_FUNCTION_ARGS)
{
RUNCMP
PG_RETURN_BOOL((res <= 0) ? true : false);
}
Datum
2004-08-29 13:07:03 +08:00
tsvector_eq(PG_FUNCTION_ARGS)
{
RUNCMP
PG_RETURN_BOOL((res == 0) ? true : false);
}
Datum
2004-08-29 13:07:03 +08:00
tsvector_ge(PG_FUNCTION_ARGS)
{
RUNCMP
PG_RETURN_BOOL((res >= 0) ? true : false);
}
2004-08-29 13:07:03 +08:00
Datum
2004-08-29 13:07:03 +08:00
tsvector_gt(PG_FUNCTION_ARGS)
{
RUNCMP
PG_RETURN_BOOL((res > 0) ? true : false);
2004-08-29 13:07:03 +08:00
}
Datum
2004-08-29 13:07:03 +08:00
tsvector_ne(PG_FUNCTION_ARGS)
{
RUNCMP
PG_RETURN_BOOL((res != 0) ? true : false);
}