postgresql/contrib/fulltextindex/fti.c

382 lines
10 KiB
C
Raw Normal View History

#include "executor/spi.h"
#include "commands/trigger.h"
#include "c.h" /* endof() macro */
#include <ctype.h> /* tolower */
#include <stdio.h> /* debugging */
/*
* Trigger function takes 2 arguments:
1. relation in which to store the substrings
2. field to extract substrings from
The relation in which to insert *must* have the following layout:
string varchar(#)
id oid
Example:
create function fti() returns opaque as
'/home/boekhold/src/postgresql-6.2/contrib/fti/fti.so' language 'c';
create table title_fti (string varchar(25), id oid);
create index title_fti_idx on title_fti (string);
create trigger title_fti_trigger after update or insert or delete on product
for each row execute procedure fti(title_fti, title);
^^^^^^^^^
where to store index in
^^^^^
which column to index
Of course, don't forget to create an index on title_fti, column string, else
you won't notice much speedup :)
After populating 'product', try something like:
select p.* from product p, title_fti f1, title_fti f2 where
f1.string='slippery' and f2.string='wet' and f1.id=f2.id and p.oid=f1.id;
*/
/*
march 4 1998 Changed breakup() to return fewer substrings. Only break up
into word parts, which are in turn shortened from the start
of the word (ie. word, ord, rd)
Did allocation of substring buffer outside of breakup()
oct. 5 1997, fixed a bug in string breakup (where there is more than one
nonalpha character between words).
oct 4-5 1997 implemented the thing, at least the basic functionality
of it all....
*/
/* IMPROVEMENTS:
save a plan for deletes
create a function that will make the index *after* we have populated
the main table (probably first delete all contents to be sure there's
nothing in it, then re-populate the fti-table)
can we do something with operator overloading or a separate function
that can build the final query automatically?
*/
/* Trigger entry point; takes its context from CurrentTriggerData. */
HeapTuple fti(void);
/* Returns the next substring of the first argument, built in the buffer
   passed as second argument, or NULL when the string is exhausted. */
char *breakup(char*, char*);
/* Returns true when the given word is listed in StopWords. */
bool is_stopword(char*);
/* Set to true by fti() before the first breakup() call on a new string;
   breakup() resets it to false once it has initialized its scan position. */
bool new_tuple = false;
/*
 * List of words to skip in indexing.
 *
 * THIS LIST MUST BE IN SORTED (strcmp) ORDER, A BINARY SEARCH IS USED!!!!
 *
 * Every entry needs its own trailing comma: adjacent string literals are
 * concatenated by the compiler, which silently merges two words into one
 * bogus entry (e.g. "no" "the" becomes "nothe").
 */
char *StopWords[] = {
    "no",
    "the",
    "yes",
};
/* stuff for caching query-plans, stolen from contrib/spi/\*.c */
/* One cache entry: a set of saved plans looked up by an identifying string. */
typedef struct
{
char *ident; /* lookup key; fti() builds it as "D<table>$<column>" or "I<table>$<column>" */
int nplans; /* number of plans stored in splan; 0 until a plan has been saved */
void **splan; /* array of saved plans (results of SPI_saveplan) */
} EPlan;
/* Plan caches for the INSERT and DELETE statements, grown by find_plan(). */
static EPlan *InsertPlans = NULL;
static EPlan *DeletePlans = NULL;
/* Number of entries currently held in InsertPlans / DeletePlans. */
static int nInsertPlans = 0;
static int nDeletePlans = 0;
/* Finds the entry named ident in *eplan, appending a new empty one if absent. */
static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans);
/***********************************************************************/
/*
 * fti -- trigger function that keeps the substring index table up to date.
 *
 * Trigger arguments:
 *   args[0]  name of the table that receives the (string, id) rows
 *   args[1]  name of the column of the triggered relation to index
 *
 * Must be fired AFTER the event and FOR EACH ROW; anything else raises an
 * error.  DELETE removes every index row whose id equals the tuple's oid,
 * INSERT adds one index row per substring produced by breakup(), and UPDATE
 * does the delete followed by the insert.
 *
 * Returns tg_newtuple for an UPDATE and tg_trigtuple otherwise.
 */
HeapTuple
fti(void)
{
    Trigger    *trigger;        /* to get trigger name */
    int         nargs;          /* # of arguments */
    char      **args;           /* arguments */
    char       *relname;        /* triggered relation name */
    Relation    rel;            /* triggered relation */
    char       *indexname;      /* name of table for substrings */
    HeapTuple   rettuple = NULL;
    TupleDesc   tupdesc;        /* tuple description */
    bool        isinsert = false;
    bool        isdelete = false;
    int         ret;
    int         len;            /* snprintf result, for truncation checks */
    char        query[8192];
    Oid         oid;

    if (!CurrentTriggerData)
        elog(ERROR, "Full Text Indexing: triggers are not initialized");
    if (TRIGGER_FIRED_FOR_STATEMENT(CurrentTriggerData->tg_event))
        elog(ERROR, "Full Text Indexing: can't process STATEMENT events");
    if (TRIGGER_FIRED_BEFORE(CurrentTriggerData->tg_event))
        elog(ERROR, "Full Text Indexing: must be fired AFTER event");

    if (TRIGGER_FIRED_BY_INSERT(CurrentTriggerData->tg_event))
        isinsert = true;
    if (TRIGGER_FIRED_BY_UPDATE(CurrentTriggerData->tg_event))
    {
        /* an UPDATE is handled as delete-then-insert */
        isdelete = true;
        isinsert = true;
    }
    if (TRIGGER_FIRED_BY_DELETE(CurrentTriggerData->tg_event))
        isdelete = true;

    trigger = CurrentTriggerData->tg_trigger;
    rel = CurrentTriggerData->tg_relation;
    relname = SPI_getrelname(rel);
    rettuple = CurrentTriggerData->tg_trigtuple;
    if (isdelete && isinsert)   /* is an UPDATE */
        rettuple = CurrentTriggerData->tg_newtuple;

    CurrentTriggerData = NULL;  /* invalidate 'normal' calls to this function */

    if ((ret = SPI_connect()) < 0)
        elog(ERROR, "Full Text Indexing: SPI_connect failed, returned %d\n", ret);

    nargs = trigger->tgnargs;
    if (nargs != 2)
        elog(ERROR, "Full Text Indexing: trigger can only have 2 arguments");

    args = trigger->tgargs;
    indexname = args[0];
    tupdesc = rel->rd_att;      /* layout of the triggered relation's tuples */

    /* get oid of current tuple, needed by all, so place here */
    oid = rettuple->t_oid;
    if (!OidIsValid(oid))
        elog(ERROR, "Full Text Indexing: oid of current tuple is NULL");

    if (isdelete)
    {
        void       *pplan;
        Oid        *argtypes;
        Datum       values[1];
        EPlan      *plan;

        /* cache key for the delete plan; reject instead of overflowing query */
        len = snprintf(query, sizeof(query), "D%s$%s", args[0], args[1]);
        if (len < 0 || (size_t) len >= sizeof(query))
            elog(ERROR, "Full Text Indexing: trigger arguments are too long");

        plan = find_plan(query, &DeletePlans, &nDeletePlans);
        if (plan == NULL)
            elog(ERROR, "Full Text Indexing: out of memory caching delete plan");

        /* no plan yet, so prepare and save one */
        if (plan->nplans <= 0)
        {
            argtypes = (Oid *) palloc(sizeof(Oid));
            argtypes[0] = OIDOID;

            len = snprintf(query, sizeof(query),
                           "DELETE FROM %s WHERE id = $1", indexname);
            if (len < 0 || (size_t) len >= sizeof(query))
                elog(ERROR, "Full Text Indexing: trigger arguments are too long");

            pplan = SPI_prepare(query, 1, argtypes);
            if (!pplan)
                elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL "
                     "in delete");
            pplan = SPI_saveplan(pplan);
            if (pplan == NULL)
                elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL "
                     "in delete");

            plan->splan = (void **) malloc(sizeof(void *));
            if (plan->splan == NULL)
                elog(ERROR, "Full Text Indexing: out of memory caching delete plan");
            *(plan->splan) = pplan;
            plan->nplans = 1;
        }

        values[0] = oid;
        ret = SPI_execp(*(plan->splan), values, NULL, 0);
        if (ret != SPI_OK_DELETE)
            elog(ERROR, "Full Text Indexing: error executing plan in delete");
    }

    if (isinsert)
    {
        char       *substring,
                   *column;
        void       *pplan;
        Oid        *argtypes;
        Datum       values[2];
        int         colnum;
        struct varlena *data;
        EPlan      *plan;

        /* cache key for the insert plan; reject instead of overflowing query */
        len = snprintf(query, sizeof(query), "I%s$%s", args[0], args[1]);
        if (len < 0 || (size_t) len >= sizeof(query))
            elog(ERROR, "Full Text Indexing: trigger arguments are too long");

        plan = find_plan(query, &InsertPlans, &nInsertPlans);
        if (plan == NULL)
            elog(ERROR, "Full Text Indexing: out of memory caching insert plan");

        /* no plan yet, so allocate mem for argtypes */
        if (plan->nplans <= 0)
        {
            argtypes = (Oid *) palloc(2 * sizeof(Oid));
            argtypes[0] = VARCHAROID;   /* create table t_name (string varchar, */
            argtypes[1] = OIDOID;       /* id oid); */

            /* prepare plan to gain speed */
            len = snprintf(query, sizeof(query),
                           "INSERT INTO %s (string, id) VALUES ($1, $2)",
                           indexname);
            if (len < 0 || (size_t) len >= sizeof(query))
                elog(ERROR, "Full Text Indexing: trigger arguments are too long");

            pplan = SPI_prepare(query, 2, argtypes);
            if (!pplan)
                elog(ERROR, "Full Text Indexing: SPI_prepare returned NULL "
                     "in insert");
            pplan = SPI_saveplan(pplan);
            if (pplan == NULL)
                elog(ERROR, "Full Text Indexing: SPI_saveplan returned NULL"
                     " in insert");

            plan->splan = (void **) malloc(sizeof(void *));
            if (plan->splan == NULL)
                elog(ERROR, "Full Text Indexing: out of memory caching insert plan");
            *(plan->splan) = pplan;
            plan->nplans = 1;
        }

        /*
         * The column is looked up in the triggered relation, so that is the
         * relation to name when it cannot be found.
         */
        colnum = SPI_fnumber(tupdesc, args[1]);
        if (colnum == SPI_ERROR_NOATTRIBUTE)
            elog(ERROR, "Full Text Indexing: column '%s' of '%s' not found",
                 args[1], relname);

        /* Get the char* representation of the column with name args[1] */
        column = SPI_getvalue(rettuple, tupdesc, colnum);
        if (column)
        {                       /* make sure we don't try to index NULL's */
            char       *buff;
            char       *string = column;

            /* lower-case in place; <ctype.h> needs unsigned char values */
            while (*string != '\0')
            {
                *string = tolower((unsigned char) *string);
                string++;
            }

            /* one varlena and one scratch buffer, reused for every substring */
            data = (struct varlena *) palloc(sizeof(int32) + strlen(column) + 1);
            buff = palloc(strlen(column) + 1);

            /* tell breakup() to start scanning a fresh string */
            new_tuple = true;
            while ((substring = breakup(column, buff)))
            {
                int         l;

                l = strlen(substring);
                data->vl_len = l + sizeof(int32);
                memcpy(VARDATA(data), substring, l);
                values[0] = PointerGetDatum(data);
                values[1] = oid;

                ret = SPI_execp(*(plan->splan), values, NULL, 0);
                if (ret != SPI_OK_INSERT)
                    elog(ERROR, "Full Text Indexing: error executing plan "
                         "in insert");
            }
            pfree(buff);
            pfree(data);
        }
    }

    SPI_finish();
    return (rettuple);
}
/*
 * breakup -- hand out the substrings of 'string', one per call.
 *
 * string     NUL-terminated text to split; the same pointer must be passed
 *            on every call belonging to one scan
 * substring  caller-supplied buffer of at least strlen(string) + 1 bytes
 *
 * The scan runs from the end of the string towards its start.  Within each
 * run of alphanumeric characters it yields the suffixes of that run that are
 * at least 2 characters long, shortest first (for "word": "rd", "ord",
 * "word"); suffixes found in the stop-word list are skipped.
 *
 * The scan position is kept in static variables.  The caller sets the global
 * new_tuple to true before the first call for a new string; this function
 * resets it to false.
 *
 * Returns 'substring' filled with the next piece, or NULL when the string
 * has been fully processed.
 */
char *breakup(char *string, char *substring)
{
    static char *last_start;    /* last character of the current word */
    static char *cur_pos;       /* first character of the last piece handed out */

    if (new_tuple)
    {
        size_t      len = strlen(string);

        /* for an empty string stay on string[0] rather than point before it */
        cur_pos = last_start = (len > 0) ? string + len - 1 : string;
        new_tuple = false;      /* don't initialize this next time */
    }

    while (cur_pos > string)    /* don't read before start of 'string' */
    {
        /*
         * skip pieces at the end of a string that are not alfa-numeric
         * (ie. 'string$%^&', last_start first points to '&', and after
         * this to 'g'
         */
        if (!isalnum((unsigned char) *last_start))
        {
            while (!isalnum((unsigned char) *last_start) &&
                   last_start > string)
                last_start--;
            cur_pos = last_start;

            /*
             * Skipping may have landed on string[0]; then there is no room
             * left for a 2-character substring, and stepping back once more
             * would read string[-1].
             */
            if (cur_pos <= string)
                break;
        }

        cur_pos--;              /* substrings are at minimum 2 characters long */
        if (isalnum((unsigned char) *cur_pos))
        {
            /* Houston, we have a substring! :) */
            size_t      n = (size_t) (last_start - cur_pos) + 1;

            memcpy(substring, cur_pos, n);
            substring[n] = '\0';
            if (!is_stopword(substring))
                return substring;
        }
        else if (cur_pos > string)
        {
            /* cur_pos sits on a separator: the next word ends just before it */
            last_start = cur_pos - 1;
            cur_pos = last_start;
        }
        /* else: separator at string[0]; the loop condition ends the scan */
    }
    return NULL;                /* we've processed all of 'string' */
}
/* copied from src/backend/parser/keywords.c and adjusted for our situation*/
/*
 * is_stopword -- binary search for 'text' in StopWords.
 *
 * text  NUL-terminated word to look up (compared with strcmp, so the
 *       comparison is case-sensitive)
 *
 * Relies on StopWords being sorted in strcmp order.
 * Returns true when 'text' equals one of the entries, false otherwise.
 */
bool
is_stopword(char *text)
{
    /* search the half-open index range [lo, hi) */
    size_t      lo = 0;
    size_t      hi = sizeof(StopWords) / sizeof(StopWords[0]);

    while (lo < hi)
    {
        size_t      mid = lo + (hi - lo) / 2;
        /* must be signed: strcmp reports "less than" as a negative value */
        int         difference = strcmp(StopWords[mid], text);

        if (difference == 0)
            return (true);
        else if (difference < 0)
            lo = mid + 1;       /* entry sorts before text: look right */
        else
            hi = mid;           /* entry sorts after text: look left */
    }
    return (false);
}
/* for caching of query plans, stolen from contrib/spi/\*.c */
/*
 * find_plan -- look up, or create, the cache entry named 'ident'.
 *
 * ident   NUL-terminated key of the entry
 * eplan   address of the cache array; updated when the array is (re)allocated
 * nplans  address of the entry count; incremented when an entry is added
 *
 * An existing entry is returned as is.  Otherwise the array grows by one and
 * the new entry gets a private copy of 'ident', nplans = 0 and splan = NULL,
 * so the caller can tell that no plan has been stored in it yet.
 *
 * Returns the entry, or NULL when memory could not be allocated; in that
 * case *nplans is unchanged and every existing entry stays valid.
 */
static EPlan *
find_plan(char *ident, EPlan ** eplan, int *nplans)
{
    EPlan      *grown;
    EPlan      *newp;
    char       *copy;
    size_t      idlen;
    int         count = (*nplans > 0) ? *nplans : 0;
    int         i;

    for (i = 0; i < count; i++)
    {
        if (strcmp((*eplan)[i].ident, ident) == 0)
            return (*eplan + i);
    }

    /*
     * Grow through a temporary so the existing array is not lost when
     * realloc fails.  With an empty cache the old pointer is ignored and a
     * fresh array is allocated.
     */
    grown = (EPlan *) realloc(count > 0 ? *eplan : NULL,
                              ((size_t) count + 1) * sizeof(EPlan));
    if (grown == NULL)
        return NULL;
    *eplan = grown;

    idlen = strlen(ident) + 1;
    copy = (char *) malloc(idlen);
    if (copy == NULL)
        return NULL;            /* spare slot stays unused; count not bumped */
    memcpy(copy, ident, idlen);

    newp = grown + count;
    newp->ident = copy;
    newp->nplans = 0;
    newp->splan = NULL;
    *nplans = count + 1;
    return (newp);
}