postgresql/contrib/fulltextindex/fti.c

444 lines
12 KiB
C
Raw Normal View History

#include "postgres.h"
#include <ctype.h>
#include "executor/spi.h"
#include "commands/trigger.h"
/*
* Trigger function accepts variable number of arguments:
*
* 1. relation in which to store the substrings
* 2. fields to extract substrings from
*
* The relation in which to insert *must* have the following layout:
*
* string varchar(#)
* id oid
*
* where # is the largest size of the varchar columns being indexed
*
* Example:
*
* -- Create the SQL function based on the compiled shared object
* create function fti() returns trigger as
* '/usr/local/pgsql/lib/contrib/fti.so' language 'C';
*
* -- Create the FTI table
* create table product_fti (string varchar(255), id oid) without oids;
*
* -- Create an index to assist string matches
* create index product_fti_string_idx on product_fti (string);
*
* -- Create an index to assist trigger'd deletes
* create index product_fti_id_idx on product_fti (id);
*
* -- Create an index on the product oid column to assist joins
* -- between the fti table and the product table
* create index product_oid_idx on product (oid);
*
* -- Create the trigger to perform incremental changes to the full text index.
* create trigger product_fti_trig after update or insert or delete on product
* for each row execute procedure fti(product_fti, title, artist);
* ^^^^^^^^^^^
* table where full text index is stored
* ^^^^^^^^^^^^^
* columns to index in the base table
*
* After populating 'product', try something like:
*
* SELECT DISTINCT(p.*) FROM product p, product_fti f1, product_fti f2 WHERE
* f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND p.oid=f2.id;
*
* To check that your indicies are being used correctly, make sure you
* EXPLAIN SELECT ... your test query above.
*
* CHANGELOG
* ---------
*
* august 3 2001
* Extended fti function to accept more than one column as a
* parameter and all specified columns are indexed. Changed
* all uses of sprintf to snprintf. Made error messages more
* consistent.
*
* march 4 1998 Changed breakup() to return less substrings. Only breakup
* in word parts which are in turn shortened from the start
* of the word (ie. word, ord, rd)
* Did allocation of substring buffer outside of breakup()
*
* oct. 5 1997, fixed a bug in string breakup (where there are more nonalpha
* characters between words then 1).
*
* oct 4-5 1997 implemented the thing, at least the basic functionallity
* of it all....
*
* TODO
* ----
*
* prevent generating duplicate words for an oid in the fti table
* save a plan for deletes
* create a function that will make the index *after* we have populated
* the main table (probably first delete all contents to be sure there's
* nothing in it, then re-populate the fti-table)
*
* can we do something with operator overloading or a seperate function
* that can build the final query automagically?
*/
#define MAX_FTI_QUERY_LENGTH 8192
2001-03-22 12:01:46 +08:00
extern Datum fti(PG_FUNCTION_ARGS);
static char *breakup(char *, char *);
static bool is_stopword(char *);
2001-03-22 12:01:46 +08:00
static bool new_tuple = false;
#ifdef USE_STOP_WORDS
/* THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!! */
1999-05-26 00:15:34 +08:00
char *StopWords[] = { /* list of words to skip in indexing */
"no",
"the",
"yes"
};
#endif /* USE_STOP_WORDS */
/* stuff for caching query-plans, stolen from contrib/spi/\*.c */
typedef struct
{
char *ident;
int nplans;
void **splan;
} EPlan;
static EPlan *InsertPlans = NULL;
static EPlan *DeletePlans = NULL;
static int nInsertPlans = 0;
1999-05-26 00:15:34 +08:00
static int nDeletePlans = 0;
static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans);
/***********************************************************************/
PG_FUNCTION_INFO_V1(fti);
Datum
fti(PG_FUNCTION_ARGS)
{
TriggerData *trigdata;
1999-05-26 00:15:34 +08:00
Trigger *trigger; /* to get trigger name */
int nargs; /* # of arguments */
char **args; /* arguments */
char *relname; /* triggered relation name */
Relation rel; /* triggered relation */
char *indexname; /* name of table for substrings */
HeapTuple rettuple = NULL;
1999-05-26 00:15:34 +08:00
TupleDesc tupdesc; /* tuple description */
bool isinsert = false;
bool isdelete = false;
int ret;
char query[MAX_FTI_QUERY_LENGTH];
Oid oid;
1999-05-26 00:15:34 +08:00
/*
1999-05-26 00:15:34 +08:00
* FILE *debug;
*/
/*
1999-05-26 00:15:34 +08:00
* debug = fopen("/dev/xconsole", "w"); fprintf(debug, "FTI: entered
* function\n"); fflush(debug);
*/
if (!CALLED_AS_TRIGGER(fcinfo))
elog(ERROR, "Full Text Indexing: Not fired by trigger manager");
/* It's safe to cast now that we've checked */
trigdata = (TriggerData *) fcinfo->context;
if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
elog(ERROR, "Full Text Indexing: Can't process STATEMENT events");
if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
elog(ERROR, "Full Text Indexing: Must be fired AFTER event");
if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
1999-05-26 00:15:34 +08:00
isinsert = true;
if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
1999-05-26 00:15:34 +08:00
{
isdelete = true;
isinsert = true;
}
if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event))
1999-05-26 00:15:34 +08:00
isdelete = true;
trigger = trigdata->tg_trigger;
rel = trigdata->tg_relation;
relname = SPI_getrelname(rel);
rettuple = trigdata->tg_trigtuple;
1999-05-26 00:15:34 +08:00
if (isdelete && isinsert) /* is an UPDATE */
rettuple = trigdata->tg_newtuple;
1999-05-26 00:15:34 +08:00
if ((ret = SPI_connect()) < 0)
elog(ERROR, "Full Text Indexing: SPI_connect: Failed, returned %d\n", ret);
nargs = trigger->tgnargs;
if (nargs < 2)
elog(ERROR, "Full Text Indexing: Trigger must have at least 2 arguments\n");
1999-05-26 00:15:34 +08:00
args = trigger->tgargs;
indexname = args[0];
1999-05-26 00:15:34 +08:00
tupdesc = rel->rd_att; /* what the tuple looks like (?) */
/* get oid of current tuple, needed by all, so place here */
oid = HeapTupleGetOid(rettuple);
if (!OidIsValid(oid))
elog(ERROR, "Full Text Indexing: Oid of current tuple is invalid");
1999-05-26 00:15:34 +08:00
if (isdelete)
{
void *pplan;
Oid *argtypes;
Datum values[1];
EPlan *plan;
int i;
snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname);
for (i = 1; i < nargs; i++)
snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
plan = find_plan(query, &DeletePlans, &nDeletePlans);
1999-05-26 00:15:34 +08:00
if (plan->nplans <= 0)
{
argtypes = (Oid *) palloc(sizeof(Oid));
argtypes[0] = OIDOID;
snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1", indexname);
pplan = SPI_prepare(query, 1, argtypes);
if (!pplan)
elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in delete");
pplan = SPI_saveplan(pplan);
if (pplan == NULL)
elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in delete");
1999-05-26 00:15:34 +08:00
plan->splan = (void **) malloc(sizeof(void *));
*(plan->splan) = pplan;
plan->nplans = 1;
}
values[0] = oid;
ret = SPI_execp(*(plan->splan), values, NULL, 0);
if (ret != SPI_OK_DELETE)
elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in delete");
}
1999-05-26 00:15:34 +08:00
if (isinsert)
{
char *substring;
char *column;
void *pplan;
Oid *argtypes;
1999-05-26 00:15:34 +08:00
Datum values[2];
int colnum;
struct varlena *data;
EPlan *plan;
int i;
char *buff;
char *string;
snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", indexname);
for (i = 1; i < nargs; i++)
snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
plan = find_plan(query, &InsertPlans, &nInsertPlans);
/* no plan yet, so allocate mem for argtypes */
1999-05-26 00:15:34 +08:00
if (plan->nplans <= 0)
{
argtypes = (Oid *) palloc(2 * sizeof(Oid));
1999-05-26 00:15:34 +08:00
argtypes[0] = VARCHAROID; /* create table t_name (string
* varchar, */
argtypes[1] = OIDOID; /* id oid); */
/* prepare plan to gain speed */
snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string, id) VALUES ($1, $2)",
indexname);
pplan = SPI_prepare(query, 2, argtypes);
if (!pplan)
elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in insert");
pplan = SPI_saveplan(pplan);
if (pplan == NULL)
elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in insert");
1999-05-26 00:15:34 +08:00
plan->splan = (void **) malloc(sizeof(void *));
*(plan->splan) = pplan;
plan->nplans = 1;
}
1999-05-26 00:15:34 +08:00
/* prepare plan for query */
for (i = 0; i < nargs - 1; i++)
{
colnum = SPI_fnumber(tupdesc, args[i + 1]);
if (colnum == SPI_ERROR_NOATTRIBUTE)
elog(ERROR, "Full Text Indexing: SPI_fnumber: Column '%s' of '%s' not found", args[i + 1], indexname);
/* Get the char* representation of the column */
column = SPI_getvalue(rettuple, tupdesc, colnum);
/* make sure we don't try to index NULL's */
if (column)
1999-05-26 00:15:34 +08:00
{
string = column;
while (*string != '\0')
{
*string = tolower((unsigned char) *string);
string++;
}
data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1);
buff = palloc(strlen(column) + 1);
/* saves lots of calls in while-loop and in breakup() */
new_tuple = true;
while ((substring = breakup(column, buff)))
{
int l;
l = strlen(substring);
data->vl_len = l + sizeof(int32);
memcpy(VARDATA(data), substring, l);
values[0] = PointerGetDatum(data);
values[1] = oid;
ret = SPI_execp(*(plan->splan), values, NULL, 0);
if (ret != SPI_OK_INSERT)
elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in insert");
}
pfree(buff);
pfree(data);
}
}
}
SPI_finish();
return PointerGetDatum(rettuple);
}
static char *
1999-05-26 00:15:34 +08:00
breakup(char *string, char *substring)
{
static char *last_start;
static char *cur_pos;
if (new_tuple)
{
1999-05-26 00:15:34 +08:00
cur_pos = last_start = &string[strlen(string) - 1];
new_tuple = false; /* don't initialize this next time */
}
1999-05-26 00:15:34 +08:00
while (cur_pos > string) /* don't read before start of 'string' */
{
1999-05-26 00:15:34 +08:00
/*
* skip pieces at the end of a string that are not alfa-numeric
* (ie. 'string$%^&', last_start first points to '&', and after
* this to 'g'
*/
if (!isalnum((unsigned char) *last_start))
1999-05-26 00:15:34 +08:00
{
while (!isalnum((unsigned char) *last_start) &&
last_start > string)
last_start--;
1999-05-26 00:15:34 +08:00
cur_pos = last_start;
}
1999-05-26 00:15:34 +08:00
cur_pos--; /* substrings are at minimum 2 characters
* long */
if (isalnum((unsigned char) *cur_pos))
{
/* Houston, we have a substring! :) */
memcpy(substring, cur_pos, last_start - cur_pos + 1);
1999-05-26 00:15:34 +08:00
substring[last_start - cur_pos + 1] = '\0';
if (!is_stopword(substring))
return substring;
}
else
{
1999-05-26 00:15:34 +08:00
last_start = cur_pos - 1;
cur_pos = last_start;
}
}
1999-05-26 00:15:34 +08:00
return NULL; /* we've processed all of 'string' */
}
/* copied from src/backend/parser/keywords.c and adjusted for our situation*/
static bool
is_stopword(char *text)
{
#ifdef USE_STOP_WORDS
1999-05-26 00:15:34 +08:00
char **StopLow; /* for list of stop-words */
char **StopHigh;
char **StopMiddle;
int difference;
1999-05-26 00:15:34 +08:00
StopLow = &StopWords[0]; /* initialize stuff for binary search */
StopHigh = endof(StopWords);
/* Loop invariant: *StopLow <= text < *StopHigh */
1999-05-26 00:15:34 +08:00
while (StopLow < StopHigh)
1999-05-26 00:15:34 +08:00
{
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
difference = strcmp(*StopMiddle, text);
if (difference == 0)
return (true);
else if (difference < 0)
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
1999-05-26 00:15:34 +08:00
}
#endif /* USE_STOP_WORDS */
1999-05-26 00:15:34 +08:00
return (false);
}
/* for caching of query plans, stolen from contrib/spi/\*.c */
static EPlan *
find_plan(char *ident, EPlan ** eplan, int *nplans)
{
EPlan *newp;
int i;
if (*nplans > 0)
{
for (i = 0; i < *nplans; i++)
{
if (strcmp((*eplan)[i].ident, ident) == 0)
break;
}
if (i != *nplans)
return (*eplan + i);
*eplan = (EPlan *) realloc(*eplan, (i + 1) * sizeof(EPlan));
newp = *eplan + i;
}
else
{
newp = *eplan = (EPlan *) malloc(sizeof(EPlan));
(*nplans) = i = 0;
}
newp->ident = (char *) malloc(strlen(ident) + 1);
strcpy(newp->ident, ident);
newp->nplans = 0;
newp->splan = NULL;
(*nplans)++;
return (newp);
}