postgresql/contrib/tsearch2/rank.c

/*
 * Relevation
 * Teodor Sigaev <teodor@sigaev.ru>
 */
#include "postgres.h"

#include <math.h>

#include "access/gist.h"
#include "access/itup.h"
#include "catalog/namespace.h"
#include "commands/trigger.h"
#include "executor/spi.h"
#include "fmgr.h"
#include "funcapi.h"
#include "nodes/pg_list.h"
#include "storage/bufpage.h"
#include "utils/array.h"
#include "utils/builtins.h"

#include "tsvector.h"
#include "query.h"
#include "common.h"

PG_FUNCTION_INFO_V1(rank);
Datum		rank(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(rank_def);
Datum		rank_def(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(rank_cd);
Datum		rank_cd(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(rank_cd_def);
Datum		rank_cd_def(PG_FUNCTION_ARGS);

PG_FUNCTION_INFO_V1(get_covers);
Datum		get_covers(PG_FUNCTION_ARGS);

static float weights[] = {0.1, 0.2, 0.4, 1.0};

#define wpos(wep)	( w[ WEP_GETWEIGHT(wep) ] )

#define DEF_NORM_METHOD 0

static float calc_rank_or(float *w, tsvector * t, QUERYTYPE * q);
static float calc_rank_and(float *w, tsvector * t, QUERYTYPE * q);

/*
 * Returns a weight of a word collocation
 */
static float4
word_distance(int4 w)
{
	if (w > 100)
		return 1e-30;

	return 1.0 / (1.005 + 0.05 * exp(((float4) w) / 1.5 - 2));
}

static int
cnt_length(tsvector * t)
{
	WordEntry  *ptr = ARRPTR(t),
			   *end = (WordEntry *) STRPTR(t);
	int			len = 0,
				clen;

	while (ptr < end)
	{
		if ((clen = POSDATALEN(t, ptr)) == 0)
			len += 1;
		else
			len += clen;
		ptr++;
	}

	return len;
}

static int4
WordECompareITEM(char *eval, char *qval, WordEntry * ptr, ITEM * item)
{
	if (ptr->len == item->length)
		return strncmp(
					   eval + ptr->pos,
					   qval + item->distance,
					   item->length);

	return (ptr->len > item->length) ? 1 : -1;
}

static WordEntry *
find_wordentry(tsvector * t, QUERYTYPE * q, ITEM * item)
{
	WordEntry  *StopLow = ARRPTR(t);
	WordEntry  *StopHigh = (WordEntry *) STRPTR(t);
	WordEntry  *StopMiddle;
	int			difference;

	/* Loop invariant: StopLow <= item < StopHigh */

	while (StopLow < StopHigh)
	{
		StopMiddle = StopLow + (StopHigh - StopLow) / 2;
		difference = WordECompareITEM(STRPTR(t), GETOPERAND(q), StopMiddle, item);
		if (difference == 0)
			return StopMiddle;
		else if (difference < 0)
			StopLow = StopMiddle + 1;
		else
			StopHigh = StopMiddle;
	}

	return NULL;
}


static char *SortAndUniqOperand = NULL;

static int
compareITEM(const void *a, const void *b)
{
	if ((*(ITEM **) a)->length == (*(ITEM **) b)->length)
		return strncmp(SortAndUniqOperand + (*(ITEM **) a)->distance,
					   SortAndUniqOperand + (*(ITEM **) b)->distance,
					   (*(ITEM **) b)->length);

	return ((*(ITEM **) a)->length > (*(ITEM **) b)->length) ? 1 : -1;
}

static ITEM **
SortAndUniqItems(char *operand, ITEM * item, int *size)
{
	ITEM	  **res,
			  **ptr,
			  **prevptr;

	ptr = res = (ITEM **) palloc(sizeof(ITEM *) * *size);

	while ((*size)--)
	{
		if (item->type == VAL)
		{
			*ptr = item;
			ptr++;
		}
		item++;
	}

	*size = ptr - res;
	if (*size < 2)
		return res;

	SortAndUniqOperand = operand;
	qsort(res, *size, sizeof(ITEM **), compareITEM);

	ptr = res + 1;
	prevptr = res;

	while (ptr - res < *size)
	{
		if (compareITEM((void *) ptr, (void *) prevptr) != 0)
		{
			prevptr++;
			*prevptr = *ptr;
		}
		ptr++;
	}

	*size = prevptr + 1 - res;
	return res;
}

static WordEntryPos POSNULL[] = {
	0,
	0
};

static float
calc_rank_and(float *w, tsvector * t, QUERYTYPE * q)
{
	uint16	  **pos;
	int			i,
				k,
				l,
				p;
	WordEntry  *entry;
	WordEntryPos *post,
			   *ct;
	int4		dimt,
				lenct,
				dist;
	float		res = -1.0;
	ITEM	  **item;
	int			size = q->size;

	item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);
	if (size < 2)
	{
		pfree(item);
		return calc_rank_or(w, t, q);
	}
	pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
	memset(pos, 0, sizeof(uint16 *) * q->size);
	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
	WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1);

	for (i = 0; i < size; i++)
	{
		entry = find_wordentry(t, q, item[i]);
		if (!entry)
			continue;

		if (entry->haspos)
			pos[i] = (uint16 *) _POSDATAPTR(t, entry);
		else
			pos[i] = (uint16 *) POSNULL;


		dimt = *(uint16 *) (pos[i]);
		post = (WordEntryPos *) (pos[i] + 1);
		for (k = 0; k < i; k++)
		{
			if (!pos[k])
				continue;
			lenct = *(uint16 *) (pos[k]);
			ct = (WordEntryPos *) (pos[k] + 1);
			for (l = 0; l < dimt; l++)
			{
				for (p = 0; p < lenct; p++)
				{
					dist = Abs((int) WEP_GETPOS(post[l]) - (int) WEP_GETPOS(ct[p]));
					if (dist || (dist == 0 && (pos[i] == (uint16 *) POSNULL || pos[k] == (uint16 *) POSNULL)))
					{
						float		curw;

						if (!dist)
							dist = MAXENTRYPOS;
						curw = sqrt(wpos(post[l]) * wpos(ct[p]) * word_distance(dist));
						res = (res < 0) ? curw : 1.0 - (1.0 - res) * (1.0 - curw);
					}
				}
			}
		}
	}
	pfree(pos);
	pfree(item);
	return res;
}

static float
calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
{
	WordEntry  *entry;
	WordEntryPos *post;
	int4		dimt,
				j,
				i;
	float		res = 0.0;
	ITEM	  **item;
	int			size = q->size;

	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
	item = SortAndUniqItems(GETOPERAND(q), GETQUERY(q), &size);

	for (i = 0; i < size; i++)
	{
		float		resj,
					wjm;
		int4		jm;

		entry = find_wordentry(t, q, item[i]);
		if (!entry)
			continue;

		if (entry->haspos)
		{
			dimt = POSDATALEN(t, entry);
			post = POSDATAPTR(t, entry);
		}
		else
		{
			dimt = *(uint16 *) POSNULL;
			post = POSNULL + 1;
		}

		resj = 0.0;
		wjm = -1.0;
		jm = 0;
		for (j = 0; j < dimt; j++)
		{
			resj = resj + wpos(post[j]) / ((j + 1) * (j + 1));
			if (wpos(post[j]) > wjm)
			{
				wjm = wpos(post[j]);
				jm = j;
			}
		}
/*
		limit (sum(i/i^2),i->inf) = pi^2/6
		resj = sum(wi/i^2),i=1,noccurence,
		wi - should be sorted desc,
		don't sort for now, just choose maximum weight. This should be corrected
		Oleg Bartunov
*/
		res = res + (wjm + resj - wjm / ((jm + 1) * (jm + 1))) / 1.64493406685;
	}
	if (size > 0)
		res = res / size;
	pfree(item);
	return res;
}

static float
calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
{
	ITEM	   *item = GETQUERY(q);
	float		res = 0.0;
	int			len;

	if (!t->size || !q->size)
		return 0.0;

	res = (item->type != VAL && item->val == (int4) '&') ?
		calc_rank_and(w, t, q) : calc_rank_or(w, t, q);

	if (res < 0)
		res = 1e-20;

	switch (method)
	{
		case 0:
			break;
		case 1:
			res /= log((float) (cnt_length(t) + 1)) / log(2.0);
			break;
		case 2:
			len = cnt_length(t);
			if (len > 0)
				res /= (float) len;
			break;
		default:
			/* internal error */
			elog(ERROR, "unrecognized normalization method: %d", method);
	}

	return res;
}

Datum
rank(PG_FUNCTION_ARGS)
{
	ArrayType  *win = (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
	tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
	QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(2));
	int			method = DEF_NORM_METHOD;
	float		res = 0.0;
	float		ws[lengthof(weights)];
	float4	   *arrdata;
	int			i;

	if (ARR_NDIM(win) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
				 errmsg("array of weight must be one-dimensional")));

	if (ARRNELEMS(win) < lengthof(weights))
		ereport(ERROR,
				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
				 errmsg("array of weight is too short")));

	if (ARR_HASNULL(win))
		ereport(ERROR,
				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
				 errmsg("array of weight must not contain nulls")));

	arrdata = (float4 *) ARR_DATA_PTR(win);
	for (i = 0; i < lengthof(weights); i++)
	{
		ws[i] = (arrdata[i] >= 0) ? arrdata[i] : weights[i];
		if (ws[i] > 1.0)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("weight out of range")));
	}

	if (PG_NARGS() == 4)
		method = PG_GETARG_INT32(3);

	res = calc_rank(ws, txt, query, method);

	PG_FREE_IF_COPY(win, 0);
	PG_FREE_IF_COPY(txt, 1);
	PG_FREE_IF_COPY(query, 2);
	PG_RETURN_FLOAT4(res);
}

Datum
rank_def(PG_FUNCTION_ARGS)
{
	tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
	QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
	float		res = 0.0;
	int			method = DEF_NORM_METHOD;

	if (PG_NARGS() == 3)
		method = PG_GETARG_INT32(2);

	res = calc_rank(weights, txt, query, method);

	PG_FREE_IF_COPY(txt, 0);
	PG_FREE_IF_COPY(query, 1);
	PG_RETURN_FLOAT4(res);
}


typedef struct
{
	ITEM	  **item;
	int16		nitem;
	bool		needfree;
	int32		pos;
}	DocRepresentation;

static int
compareDocR(const void *a, const void *b)
{
	if (((DocRepresentation *) a)->pos == ((DocRepresentation *) b)->pos)
		return 0;
	return (((DocRepresentation *) a)->pos > ((DocRepresentation *) b)->pos) ? 1 : -1;
}

static bool
checkcondition_ITEM(void *checkval, ITEM * val)
{
	return (bool) (val->istrue);
}

static void
reset_istrue_flag(QUERYTYPE * query)
{
	ITEM	   *item = GETQUERY(query);
	int			i;

	/* reset istrue flag */
	for (i = 0; i < query->size; i++)
	{
		if (item->type == VAL)
			item->istrue = 0;
		item++;
	}
}

static bool
Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int *q)
{
	DocRepresentation *ptr;
	int			lastpos = *pos;
	int			i;
	bool		found = false;

	reset_istrue_flag(query);

	*p = 0x7fffffff;
	*q = 0;
	ptr = doc + *pos;

	/* find upper bound of cover from current position, move up */
	while (ptr - doc < len)
	{
		for (i = 0; i < ptr->nitem; i++)
			ptr->item[i]->istrue = 1;
		if (TS_execute(GETQUERY(query), NULL, false, checkcondition_ITEM))
		{
			if (ptr->pos > *q)
			{
				*q = ptr->pos;
				lastpos = ptr - doc;
				found = true;
			}
			break;
		}
		ptr++;
	}

	if (!found)
		return false;

	reset_istrue_flag(query);

	ptr = doc + lastpos;

	/* find lower bound of cover from founded upper bound, move down */
	while (ptr >= doc)
	{
		for (i = 0; i < ptr->nitem; i++)
			ptr->item[i]->istrue = 1;
		if (TS_execute(GETQUERY(query), NULL, true, checkcondition_ITEM))
		{
			if (ptr->pos < *p)
				*p = ptr->pos;
			break;
		}
		ptr--;
	}

	if (*p <= *q)
	{
		/*
		 * set position for next try to next lexeme after begining of founded
		 * cover
		 */
		*pos = (ptr - doc) + 1;
		return true;
	}

	(*pos)++;
	return Cover(doc, len, query, pos, p, q);
}

static DocRepresentation *
get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
{
	ITEM	   *item = GETQUERY(query);
	WordEntry  *entry;
	WordEntryPos *post;
	int4		dimt,
				j,
				i;
	int			len = query->size * 4,
				cur = 0;
	DocRepresentation *doc;

	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
	doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
	SortAndUniqOperand = GETOPERAND(query);
	reset_istrue_flag(query);

	for (i = 0; i < query->size; i++)
	{
		if (item[i].type != VAL || item[i].istrue)
			continue;

		entry = find_wordentry(txt, query, &(item[i]));
		if (!entry)
			continue;

		if (entry->haspos)
		{
			dimt = POSDATALEN(txt, entry);
			post = POSDATAPTR(txt, entry);
		}
		else
		{
			dimt = *(uint16 *) POSNULL;
			post = POSNULL + 1;
		}

		while (cur + dimt >= len)
		{
			len *= 2;
			doc = (DocRepresentation *) repalloc(doc, sizeof(DocRepresentation) * len);
		}

		for (j = 0; j < dimt; j++)
		{
			if (j == 0)
			{
				ITEM	   *kptr,
						   *iptr = item + i;
				int			k;

				doc[cur].needfree = false;
				doc[cur].nitem = 0;
				doc[cur].item = (ITEM **) palloc(sizeof(ITEM *) * query->size);

				for (k = 0; k < query->size; k++)
				{
					kptr = item + k;
					if (k == i || (item[k].type == VAL && compareITEM(&kptr, &iptr) == 0))
					{
						doc[cur].item[doc[cur].nitem] = item + k;
						doc[cur].nitem++;
						kptr->istrue = 1;
					}
				}
			}
			else
			{
				doc[cur].needfree = false;
				doc[cur].nitem = doc[cur - 1].nitem;
				doc[cur].item = doc[cur - 1].item;
			}
			doc[cur].pos = WEP_GETPOS(post[j]);
			cur++;
		}
	}

	*doclen = cur;

	if (cur > 0)
	{
		if (cur > 1)
			qsort((void *) doc, cur, sizeof(DocRepresentation), compareDocR);
		return doc;
	}

	pfree(doc);
	return NULL;
}


Datum
rank_cd(PG_FUNCTION_ARGS)
{
	int			K = PG_GETARG_INT32(0);
	tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
	QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
	int			method = DEF_NORM_METHOD;
	DocRepresentation *doc;
	float		res = 0.0;
	int			p = 0,
				q = 0,
				len,
				cur,
				i,
				doclen = 0;

	doc = get_docrep(txt, query, &doclen);
	if (!doc)
	{
		PG_FREE_IF_COPY(txt, 1);
		PG_FREE_IF_COPY(query, 2);
		PG_RETURN_FLOAT4(0.0);
	}

	cur = 0;
	if (K <= 0)
		K = 4;
	while (Cover(doc, doclen, query, &cur, &p, &q))
		res += (q - p + 1 > K) ? ((float) K) / ((float) (q - p + 1)) : 1.0;

	if (PG_NARGS() == 4)
		method = PG_GETARG_INT32(3);

	switch (method)
	{
		case 0:
			break;
		case 1:
			res /= log((float) (cnt_length(txt) + 1));
			break;
		case 2:
			len = cnt_length(txt);
			if (len > 0)
				res /= (float) len;
			break;
		default:
			/* internal error */
			elog(ERROR, "unrecognized normalization method: %d", method);
	}

	for (i = 0; i < doclen; i++)
		if (doc[i].needfree)
			pfree(doc[i].item);
	pfree(doc);
	PG_FREE_IF_COPY(txt, 1);
	PG_FREE_IF_COPY(query, 2);

	PG_RETURN_FLOAT4(res);
}


Datum
rank_cd_def(PG_FUNCTION_ARGS)
{
	PG_RETURN_DATUM(DirectFunctionCall4(
										rank_cd,
										Int32GetDatum(-1),
										PG_GETARG_DATUM(0),
										PG_GETARG_DATUM(1),
	  (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : Int32GetDatum(DEF_NORM_METHOD)
										));
}

/**************debug*************/

typedef struct
{
	char	   *w;
	int2		len;
	int2		pos;
	int2		start;
	int2		finish;
}	DocWord;

static int
compareDocWord(const void *a, const void *b)
{
	if (((DocWord *) a)->pos == ((DocWord *) b)->pos)
		return 0;
	return (((DocWord *) a)->pos > ((DocWord *) b)->pos) ? 1 : -1;
}


Datum
get_covers(PG_FUNCTION_ARGS)
{
	tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
	QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1));
	WordEntry  *pptr = ARRPTR(txt);
	int			i,
				dlen = 0,
				j,
				cur = 0,
				len = 0,
				rlen;
	DocWord    *dw,
			   *dwptr;
	text	   *out;
	char	   *cptr;
	DocRepresentation *doc;
	int			pos = 0,
				p,
				q,
				olddwpos = 0;
	int			ncover = 1;

	doc = get_docrep(txt, query, &rlen);

	if (!doc)
	{
		out = palloc(VARHDRSZ);
		VARATT_SIZEP(out) = VARHDRSZ;
		PG_FREE_IF_COPY(txt, 0);
		PG_FREE_IF_COPY(query, 1);
		PG_RETURN_POINTER(out);
	}

	for (i = 0; i < txt->size; i++)
	{
		if (!pptr[i].haspos)
			ereport(ERROR,
					(errcode(ERRCODE_SYNTAX_ERROR),
					 errmsg("no pos info")));
		dlen += POSDATALEN(txt, &(pptr[i]));
	}

	dwptr = dw = palloc(sizeof(DocWord) * dlen);
	memset(dw, 0, sizeof(DocWord) * dlen);

	for (i = 0; i < txt->size; i++)
	{
		WordEntryPos *posdata = POSDATAPTR(txt, &(pptr[i]));

		for (j = 0; j < POSDATALEN(txt, &(pptr[i])); j++)
		{
			dw[cur].w = STRPTR(txt) + pptr[i].pos;
			dw[cur].len = pptr[i].len;
			dw[cur].pos = WEP_GETPOS(posdata[j]);
			cur++;
		}
		len += (pptr[i].len + 1) * (int) POSDATALEN(txt, &(pptr[i]));
	}
	qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord);

	while (Cover(doc, rlen, query, &pos, &p, &q))
	{
		dwptr = dw + olddwpos;
		while (dwptr->pos < p && dwptr - dw < dlen)
			dwptr++;
		olddwpos = dwptr - dw;
		dwptr->start = ncover;
		while (dwptr->pos < q + 1 && dwptr - dw < dlen)
			dwptr++;
		(dwptr - 1)->finish = ncover;
		len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;
		ncover++;
	}

	out = palloc(VARHDRSZ + len);
	cptr = ((char *) out) + VARHDRSZ;
	dwptr = dw;

	while (dwptr - dw < dlen)
	{
		if (dwptr->start)
		{
			sprintf(cptr, "{%d ", dwptr->start);
			cptr = strchr(cptr, '\0');
		}
		memcpy(cptr, dwptr->w, dwptr->len);
		cptr += dwptr->len;
		*cptr = ' ';
		cptr++;
		if (dwptr->finish)
		{
			sprintf(cptr, "}%d ", dwptr->finish);
			cptr = strchr(cptr, '\0');
		}
		dwptr++;
	}

	VARATT_SIZEP(out) = cptr - ((char *) out);

	pfree(dw);
	for (i = 0; i < rlen; i++)
		if (doc[i].needfree)
			pfree(doc[i].item);
	pfree(doc);

	PG_FREE_IF_COPY(txt, 0);
	PG_FREE_IF_COPY(query, 1);
	PG_RETURN_POINTER(out);
}