Improvements from Heikki Linnakangas <heikki@enterprisedb.com>

- change the alignment requirement of lexemes in TSVector slightly. Lexeme strings were always padded to 2-byte aligned length to make sure that if there's position array (uint16[]) it has the right alignment. The patch changes that so that the padding is not done when there's no positions. That makes the storage of tsvectors without positions slightly more compact. - added some #include "miscadmin.h" lines I missed in the earlier when I added calls to check_stack_depth(). - Reimplement the send/recv functions, and added a comment above them describing the on-wire format. The CRC is now recalculated in tsquery as well per previous discussion.
2025-01-12 18:34:36 +08:00 · 2007-09-07 16:03:40 +00:00 · 2007-09-07 16:03:40 +00:00 · 978de9d06d
commit 978de9d06d
parent 8983852e34
9 changed files with 276 additions and 209 deletions
--- a/src/backend/utils/adt/tsginidx.c
+++ b/src/backend/utils/adt/tsginidx.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.2 2007/09/07 15:09:56 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsginidx.c,v 1.3 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -22,7 +22,7 @@ Datum
 gin_extract_tsvector(PG_FUNCTION_ARGS)
 {
 	TSVector	vector = PG_GETARG_TSVECTOR(0);
-	uint32	   *nentries = (uint32 *) PG_GETARG_POINTER(1);
+	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
 	Datum	   *entries = NULL;

 	*nentries = 0;
@ -55,7 +55,7 @@ Datum
 gin_extract_query(PG_FUNCTION_ARGS)
 {
 	TSQuery		query = PG_GETARG_TSQUERY(0);
-	uint32	   *nentries = (uint32 *) PG_GETARG_POINTER(1);
+	int32	   *nentries = (int32 *) PG_GETARG_POINTER(1);
 	StrategyNumber strategy = PG_GETARG_UINT16(2);
 	Datum	   *entries = NULL;

--- a/src/backend/utils/adt/tsquery.c
+++ b/src/backend/utils/adt/tsquery.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.4 2007/09/07 15:35:10 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery.c,v 1.5 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -21,7 +21,6 @@
 #include "tsearch/ts_utils.h"
 #include "utils/memutils.h"
 #include "utils/pg_crc.h"
-#include "nodes/bitmapset.h"


 struct TSQueryParserStateData
@ -384,16 +383,15 @@ makepol(TSQueryParserState state,
 	}
 }

-/*
- * Fills in the left-fields previously left unfilled. The input
- * QueryItems must be in polish (prefix) notation. 
- */
 static void
-findoprnd(QueryItem *ptr, uint32 *pos)
+findoprnd_recurse(QueryItem *ptr, uint32 *pos, int nnodes)
 {
 	/* since this function recurses, it could be driven to stack overflow. */
 	check_stack_depth();

+	if (*pos >= nnodes)
+		elog(ERROR, "malformed tsquery; operand not found");
+
 	if (ptr[*pos].type == QI_VAL ||
 		ptr[*pos].type == QI_VALSTOP) /* need to handle VALSTOP here,
 									   * they haven't been cleaned
@ -410,7 +408,7 @@ findoprnd(QueryItem *ptr, uint32 *pos)
 		{
 			ptr[*pos].operator.left = 1;
 			(*pos)++;
-			findoprnd(ptr, pos);
+			findoprnd_recurse(ptr, pos, nnodes);
 		}
 		else
 		{
@ -420,13 +418,31 @@ findoprnd(QueryItem *ptr, uint32 *pos)
 			Assert(curitem->oper == OP_AND || curitem->oper == OP_OR);

 			(*pos)++;
-			findoprnd(ptr, pos);
+			findoprnd_recurse(ptr, pos, nnodes);
 			curitem->left = *pos - tmp;
-			findoprnd(ptr, pos);
+			findoprnd_recurse(ptr, pos, nnodes);
 		}
 	}
 }

+
+/*
+ * Fills in the left-fields previously left unfilled. The input
+ * QueryItems must be in polish (prefix) notation. 
+ */
+static void
+findoprnd(QueryItem *ptr, int size)
+{
+	uint32 pos;
+
+	pos = 0;
+	findoprnd_recurse(ptr, &pos, size);
+
+	if (pos != size)
+		elog(ERROR, "malformed tsquery; extra nodes");
+}
+
+
 /*
 * Each value (operand) in the query is be passed to pushval. pushval can
 * transform the simple value to an arbitrarily complex expression using
@ -452,7 +468,6 @@ parse_tsquery(char *buf,
 	TSQuery		query;
 	int			commonlen;
 	QueryItem  *ptr;
-	uint32		pos = 0;
 	ListCell   *cell;

 	/* init state */
@ -522,8 +537,7 @@ parse_tsquery(char *buf,
 	pfree(state.op);

 	/* Set left operand pointers for every operator. */
-	pos = 0;
-	findoprnd(ptr, &pos);
+	findoprnd(ptr, query->size);

 	return query;
 }
@ -734,6 +748,22 @@ tsqueryout(PG_FUNCTION_ARGS)
 	PG_RETURN_CSTRING(nrm.buf);
 }

+/*
+ * Binary Input / Output functions. The binary format is as follows:
+ *
+ * uint32	 number of operators/operands in the query
+ * 
+ * Followed by the operators and operands, in prefix notation. For each
+ * operand:
+ *
+ * uint8	type, QI_VAL
+ * uint8	weight
+ * 			operand text in client encoding, null-terminated
+ *
+ * For each operator:
+ * uint8	type, QI_OPR
+ * uint8	operator, one of OP_AND, OP_OR, OP_NOT.
+ */
 Datum
 tsquerysend(PG_FUNCTION_ARGS)
 {
@ -744,7 +774,7 @@ tsquerysend(PG_FUNCTION_ARGS)

 	pq_begintypsend(&buf);

-	pq_sendint(&buf, query->size, sizeof(int32));
+	pq_sendint(&buf, query->size, sizeof(uint32));
 	for (i = 0; i < query->size; i++)
 	{
 		pq_sendint(&buf, item->type, sizeof(item->type));
@ -752,16 +782,13 @@ tsquerysend(PG_FUNCTION_ARGS)
 		switch(item->type)
 		{
 			case QI_VAL:
-				pq_sendint(&buf, item->operand.weight, sizeof(item->operand.weight));
-				pq_sendint(&buf, item->operand.valcrc, sizeof(item->operand.valcrc));
-				pq_sendint(&buf, item->operand.length, sizeof(int16));
+				pq_sendint(&buf, item->operand.weight, sizeof(uint8));
+				pq_sendstring(&buf, GETOPERAND(query) + item->operand.distance);
 				/* istrue flag is just for temporary use in tsrank.c/Cover,
 				 * so we don't need to transfer that */
 				break;
 			case QI_OPR:
 				pq_sendint(&buf, item->operator.oper, sizeof(item->operator.oper));
-				if (item->operator.oper != OP_NOT)
-					pq_sendint(&buf, item->operator.left, sizeof(item->operator.left));
 				break;
 			default:
 				elog(ERROR, "unknown tsquery node type %d", item->type);
@ -769,14 +796,6 @@ tsquerysend(PG_FUNCTION_ARGS)
 		item++;
 	}

-	item = GETQUERY(query);
-	for (i = 0; i < query->size; i++)
-	{
-		if (item->type == QI_VAL)
-			pq_sendbytes(&buf, GETOPERAND(query) + item->operand.distance, item->operand.length);
-		item++;
-	}
-
 	PG_FREE_IF_COPY(query, 0);

 	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
@ -788,141 +807,113 @@ tsqueryrecv(PG_FUNCTION_ARGS)
 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
 	TSQuery		query;
 	int			i,
-				size,
 				len;
 	QueryItem  *item;
-	int			datalen = 0;
+	int			datalen;
 	char	   *ptr;
-	Bitmapset  *parentset = NULL;
+	uint32		size;
+	const char **operands;

 	size = pq_getmsgint(buf, sizeof(uint32));
-	if (size < 0 || size > (MaxAllocSize / sizeof(QueryItem)))
+	if (size > (MaxAllocSize / sizeof(QueryItem)))
 		elog(ERROR, "invalid size of tsquery");

-	len = HDRSIZETQ + sizeof(QueryItem) * size;
+	/* Allocate space to temporarily hold operand strings */
+	operands = palloc(size * sizeof(char *));

+	/* Allocate space for all the QueryItems. */
+	len = HDRSIZETQ + sizeof(QueryItem) * size;
 	query = (TSQuery) palloc0(len);
 	query->size = size;
 	item = GETQUERY(query);

+	datalen = 0;
 	for (i = 0; i < size; i++)
 	{
 		item->type = (int8) pq_getmsgint(buf, sizeof(int8));

-		switch(item->type)
+		if (item->type == QI_VAL)
 		{
-			case QI_VAL:
-				item->operand.weight = (int8) pq_getmsgint(buf, sizeof(int8));
-				item->operand.valcrc = (int32) pq_getmsgint(buf, sizeof(int32));
-				item->operand.length = pq_getmsgint(buf, sizeof(int16));
+			size_t val_len; /* length after recoding to server encoding */
+			uint8 weight;
+			const char *val;
+			pg_crc32 valcrc;

-				/* Check that the weight bitmap is valid */
-				if (item->operand.weight < 0 || item->operand.weight > 0xF)
-					elog(ERROR, "invalid weight bitmap");
+			weight 	 = (uint8) pq_getmsgint(buf, sizeof(uint8));
+			val = pq_getmsgstring(buf);
+			val_len = strlen(val);

-				/* XXX: We don't check that the CRC is valid. Actually, if we
-				 * bothered to calculate it to verify, there would be no need
-				 * to transfer it.
-				 */
+			/* Sanity checks */

-				/*
-				 * Check that datalen doesn't grow too large. Without the
-				 * check, a malicious client could induce a buffer overflow
-				 * by sending a tsquery whose size exceeds 2GB. datalen
-				 * would overflow, we would allocate a too small buffer below,
-				 * and overflow the buffer. Because operand.length is a 20-bit
-				 * field, adding one such value to datalen must exceed
-				 * MaxAllocSize before wrapping over the 32-bit datalen field,
-				 * so this check will protect from it.
-				 */
-				if (datalen > MAXSTRLEN)
-					elog(ERROR, "invalid tsquery; total operand length exceeded");
+			if (weight > 0xF)
+				elog(ERROR, "invalid tsquery; invalid weight bitmap");

-				/* We can calculate distance from datalen, no need to send it
-				 * across the wire. If we did, we would have to check that
-				 * it's valid anyway.
-				 */
-				item->operand.distance = datalen;
+			if (val_len > MAXSTRLEN)
+				elog(ERROR, "invalid tsquery; operand too long");
+				
+			if (datalen > MAXSTRPOS)
+				elog(ERROR, "invalid tsquery; total operand length exceeded");

-				datalen += item->operand.length + 1;		/* \0 */
+			/* Looks valid. */

-				break;
-			case QI_OPR:
-				item->operator.oper = (int8) pq_getmsgint(buf, sizeof(int8));
-				if (item->operator.oper != OP_NOT &&
-					item->operator.oper != OP_OR &&
-					item->operator.oper != OP_AND)
-					elog(ERROR, "unknown operator type %d", (int) item->operator.oper);
+			INIT_CRC32(valcrc);
+			COMP_CRC32(valcrc, val, val_len);
+			FIN_CRC32(valcrc);

-				/*
-				 * Check that no previous operator node points to the right
-				 * operand. That would mean that the operand node
-				 * has two parents.
-				 */
-				if (bms_is_member(i + 1, parentset))
-					elog(ERROR, "malformed query tree");
+			item->operand.weight = weight;
+			item->operand.valcrc = (int32) valcrc;
+			item->operand.length = val_len;
+			item->operand.distance = datalen;

-				parentset = bms_add_member(parentset, i + 1);
+			/* 
+			 * Operand strings are copied to the final struct after this loop;
+			 * here we just collect them to an array
+			 */
+			operands[i] = val;

-				if(item->operator.oper != OP_NOT)
-				{
-					uint32 left = (uint32) pq_getmsgint(buf, sizeof(uint32));
+			datalen += val_len + 1;		/* + 1 for the '\0' terminator */
+		} 
+		else if (item->type == QI_OPR)
+		{
+			int8 oper;
+			oper = (int8) pq_getmsgint(buf, sizeof(int8));
+			if (oper != OP_NOT && oper != OP_OR && oper != OP_AND)
+				elog(ERROR, "invalid tsquery; unknown operator type %d", (int) oper);
+			if (i == size - 1)
+				elog(ERROR, "invalid pointer to right operand");

-					/*
-					 * Right operand is implicitly at "this+1". Don't allow
-					 * left to point to the right operand, or to self.
-					 */
-					if (left <= 1 || i + left >= size)
-						elog(ERROR, "invalid pointer to left operand");
-
-					/*
-					 * Check that no previous operator node points to the left
-					 * operand.
-					 */
-					if (bms_is_member(i + left, parentset))
-						elog(ERROR, "malformed query tree");
-
-					parentset = bms_add_member(parentset, i + left);
-
-					item->operator.left = left;
-				}
-				else
-					item->operator.left = 1; /* do not leave uninitialized fields */
-
-				if (i == size - 1)
-					elog(ERROR, "invalid pointer to right operand");
-				break;
-			default:
-				elog(ERROR, "unknown tsquery node type %d", item->type);
+			item->operator.oper = oper;
 		}
+		else 
+			elog(ERROR, "unknown tsquery node type %d", item->type);

 		item++;
 	}

-	/* Now check that each node, except the root, has a parent. We
-	 * already checked above that no node has more than one parent. */
-	if (bms_num_members(parentset) != size - 1 && size != 0)
-		elog(ERROR, "malformed query tree");
-
-	bms_free( parentset );
-
+	/* Enlarge buffer to make room for the operand values. */
 	query = (TSQuery) repalloc(query, len + datalen);
-
 	item = GETQUERY(query);
 	ptr = GETOPERAND(query);
+
+	/* 
+	 * Fill in the left-pointers. Checks that the tree is well-formed
+	 * as a side-effect.
+	 */
+	findoprnd(item, size);
+
+	/* Copy operands to output struct */
 	for (i = 0; i < size; i++)
 	{
 		if (item->type == QI_VAL)
 		{
-			memcpy(ptr,
-				   pq_getmsgbytes(buf, item->operand.length),
-				   item->operand.length);
-			ptr += item->operand.length;
-			*ptr++ = '\0';
+			memcpy(ptr, operands[i], item->operand.length + 1);
+			ptr += item->operand.length + 1;
 		}
 		item++;
 	}

+	pfree(operands);
+
 	Assert(ptr - GETOPERAND(query) == datalen);

 	SET_VARSIZE(query, len + datalen);
--- a/src/backend/utils/adt/tsquery_cleanup.c
+++ b/src/backend/utils/adt/tsquery_cleanup.c
@ -8,7 +8,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.3 2007/09/07 15:35:10 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_cleanup.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -17,6 +17,7 @@

 #include "tsearch/ts_type.h"
 #include "tsearch/ts_utils.h"
+#include "miscadmin.h"

 typedef struct NODE
 {
--- a/src/backend/utils/adt/tsquery_rewrite.c
+++ b/src/backend/utils/adt/tsquery_rewrite.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.3 2007/09/07 15:35:10 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_rewrite.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -17,6 +17,7 @@
 #include "executor/spi.h"
 #include "tsearch/ts_type.h"
 #include "tsearch/ts_utils.h"
+#include "miscadmin.h"


 static int
--- a/src/backend/utils/adt/tsquery_util.c
+++ b/src/backend/utils/adt/tsquery_util.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.3 2007/09/07 15:35:10 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsquery_util.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -16,6 +16,7 @@

 #include "tsearch/ts_type.h"
 #include "tsearch/ts_utils.h"
+#include "miscadmin.h"

 QTNode *
 QT2QTN(QueryItem * in, char *operand)
--- a/src/backend/utils/adt/tsrank.c
+++ b/src/backend/utils/adt/tsrank.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.3 2007/09/07 15:35:10 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsrank.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -18,6 +18,7 @@
 #include "tsearch/ts_type.h"
 #include "tsearch/ts_utils.h"
 #include "utils/array.h"
+#include "miscadmin.h"


 static float weights[] = {0.1, 0.2, 0.4, 1.0};
@ -176,8 +177,9 @@ SortAndUniqItems(TSQuery q, int *size)
 	return res;
 }

+/* A dummy WordEntryPos array to use when haspos is false */
 static WordEntryPos POSNULL[] = {
-	0,
+	1, /* Number of elements that follow */
 	0
 };

@ -207,7 +209,6 @@ calc_rank_and(float *w, TSVector t, TSQuery q)
 	}
 	pos = (uint16 **) palloc(sizeof(uint16 *) * q->size);
 	memset(pos, 0, sizeof(uint16 *) * q->size);
-	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
 	WEP_SETPOS(POSNULL[1], MAXENTRYPOS - 1);

 	for (i = 0; i < size; i++)
@ -265,7 +266,6 @@ calc_rank_or(float *w, TSVector t, TSQuery q)
 	QueryOperand **item;
 	int			size = q->size;

-	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
 	item = SortAndUniqItems(q, &size);

 	for (i = 0; i < size; i++)
@ -593,7 +593,6 @@ get_docrep(TSVector txt, TSQuery query, int *doclen)
 	DocRepresentation *doc;
 	char	   *operand;

-	*(uint16 *) POSNULL = lengthof(POSNULL) - 1;
 	doc = (DocRepresentation *) palloc(sizeof(DocRepresentation) * len);
 	operand = GETOPERAND(query);
 	reset_istrue_flag(query);
--- a/src/backend/utils/adt/tsvector.c
+++ b/src/backend/utils/adt/tsvector.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.3 2007/09/07 15:09:56 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -75,18 +75,20 @@ uniquePos(WordEntryPos * a, int l)
 }

 static int
-compareentry(const void *a, const void *b, void *arg)
+compareentry(const void *va, const void *vb, void *arg)
 {
 	char	   *BufferStr = (char *) arg;
+	WordEntryIN *a = (WordEntryIN *) va;
+	WordEntryIN *b = (WordEntryIN *) vb;

-	if (((WordEntryIN *) a)->entry.len == ((WordEntryIN *) b)->entry.len)
+	if (a->entry.len == b->entry.len)
 	{
-		return strncmp(&BufferStr[((WordEntryIN *) a)->entry.pos],
-					   &BufferStr[((WordEntryIN *) b)->entry.pos],
-					   ((WordEntryIN *) a)->entry.len);
+		return strncmp(&BufferStr[a->entry.pos],
+					   &BufferStr[b->entry.pos],
+					   a->entry.len);
 	}

-	return (((WordEntryIN *) a)->entry.len > ((WordEntryIN *) b)->entry.len) ? 1 : -1;
+	return (a->entry.len > b->entry.len) ? 1 : -1;
 }

 static int
@ -104,6 +106,9 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
 			a->poslen = uniquePos(a->pos, a->poslen);
 			*outbuflen = SHORTALIGN(a->entry.len) + (a->poslen + 1) * sizeof(WordEntryPos);
 		}
+		else
+			*outbuflen = a->entry.len;
+
 		return l;
 	}
 	res = a;
@ -118,10 +123,12 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
 		{
 			if (res->entry.haspos)
 			{
+				*outbuflen += SHORTALIGN(res->entry.len);
 				res->poslen = uniquePos(res->pos, res->poslen);
 				*outbuflen += res->poslen * sizeof(WordEntryPos);
 			}
-			*outbuflen += SHORTALIGN(res->entry.len);
+			else
+				*outbuflen += res->entry.len;
 			res++;
 			memcpy(res, ptr, sizeof(WordEntryIN));
 		}
@ -147,12 +154,18 @@ uniqueentry(WordEntryIN * a, int l, char *buf, int *outbuflen)
 		}
 		ptr++;
 	}
+
+	/* add last item */
+
 	if (res->entry.haspos)
 	{
+		*outbuflen += SHORTALIGN(res->entry.len);
+
 		res->poslen = uniquePos(res->pos, res->poslen);
 		*outbuflen += res->poslen * sizeof(WordEntryPos);
 	}
-	*outbuflen += SHORTALIGN(res->entry.len);
+	else
+		*outbuflen += res->entry.len;

 	return res + 1 - a;
 }
@ -367,6 +380,18 @@ tsvectorout(PG_FUNCTION_ARGS)
 	PG_RETURN_CSTRING(outbuf);
 }

+/*
+ * Binary Input / Output functions. The binary format is as follows:
+ *
+ * uint32	number of lexemes
+ * 
+ * for each lexeme:
+ *		lexeme text in client encoding, null-terminated
+ * 		uint16	number of positions
+ * 		for each position:
+ *			uint16 WordEntryPos
+ */
+
 Datum
 tsvectorsend(PG_FUNCTION_ARGS)
 {
@ -381,18 +406,22 @@ tsvectorsend(PG_FUNCTION_ARGS)
 	pq_sendint(&buf, vec->size, sizeof(int32));
 	for (i = 0; i < vec->size; i++)
 	{
-		/*
-		 * We are sure that sizeof(WordEntry) == sizeof(int32)
-		 */
-		pq_sendint(&buf, *(int32 *) weptr, sizeof(int32));
+		uint16 npos;

-		pq_sendbytes(&buf, STRPTR(vec) + weptr->pos, weptr->len);
-		if (weptr->haspos)
+		/* the strings in the TSVector array are not null-terminated, so 
+		 * we have to send the null-terminator separately
+		 */
+		pq_sendtext(&buf, STRPTR(vec) + weptr->pos, weptr->len);
+		pq_sendbyte(&buf, '\0');
+
+		npos = POSDATALEN(vec, weptr);
+		pq_sendint(&buf, npos, sizeof(uint16));
+
+		if(npos > 0)
 		{
 			WordEntryPos *wepptr = POSDATAPTR(vec, weptr);

-			pq_sendint(&buf, POSDATALEN(vec, weptr), sizeof(WordEntryPos));
-			for (j = 0; j < POSDATALEN(vec, weptr); j++)
+			for (j = 0; j < npos; j++)
 				pq_sendint(&buf, wepptr[j], sizeof(WordEntryPos));
 		}
 		weptr++;
@ -407,71 +436,92 @@ tsvectorrecv(PG_FUNCTION_ARGS)
 	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
 	TSVector	vec;
 	int			i;
-	uint32		size;
-	WordEntry  *weptr;
-	int			datalen = 0;
-	Size		len;
+	int32		nentries;
+	int			datalen; /* number of bytes used in the variable size area
+						  * after fixed size TSVector header and WordEntries
+						  */
+	Size		hdrlen;
+	Size		len;  /* allocated size of vec */

-	size = pq_getmsgint(buf, sizeof(uint32));
-	if (size < 0 || size > (MaxAllocSize / sizeof(WordEntry)))
+	nentries = pq_getmsgint(buf, sizeof(int32));
+	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry)))
 		elog(ERROR, "invalid size of tsvector");

-	len = DATAHDRSIZE + sizeof(WordEntry) * size;
+	hdrlen = DATAHDRSIZE + sizeof(WordEntry) * nentries;

-	len = len * 2; /* times two to make room for lexemes */
+	len = hdrlen * 2; /* times two to make room for lexemes */
 	vec = (TSVector) palloc0(len);
-	vec->size = size;
+	vec->size = nentries;

-	weptr = ARRPTR(vec);
-	for (i = 0; i < size; i++)
+	datalen = 0;
+	for (i = 0; i < nentries; i++)
 	{
-		int32 tmp;
+		const char *lexeme;
+		uint16 npos;
+		size_t lex_len;

-		weptr = ARRPTR(vec) + i;
+		lexeme = pq_getmsgstring(buf);
+		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
+
+		/* sanity checks */
+
+		lex_len = strlen(lexeme);
+		if (lex_len < 0 || lex_len > MAXSTRLEN)
+			elog(ERROR, "invalid tsvector; lexeme too long");
+
+		if (datalen > MAXSTRPOS)
+			elog(ERROR, "invalid tsvector; maximum total lexeme length exceeded"); 
+
+		if (npos > MAXNUMPOS)
+			elog(ERROR, "unexpected number of positions");

 		/*
-		 * We are sure that sizeof(WordEntry) == sizeof(int32)
+		 * Looks valid. Fill the WordEntry struct, and copy lexeme.
+		 *
+		 * But make sure the buffer is large enough first.
 		 */
-		tmp = pq_getmsgint(buf, sizeof(int32));
-		*weptr = *(WordEntry *) & tmp;
-
-		while (CALCDATASIZE(size, datalen + SHORTALIGN(weptr->len)) >= len)
+		while (hdrlen + SHORTALIGN(datalen + lex_len) +
+			   (npos + 1) * sizeof(WordEntryPos) >= len)
 		{
 			len *= 2;
 			vec = (TSVector) repalloc(vec, len);
-			weptr = ARRPTR(vec) + i;
 		}

-		memcpy(STRPTR(vec) + weptr->pos,
-			   pq_getmsgbytes(buf, weptr->len),
-			   weptr->len);
-		datalen += SHORTALIGN(weptr->len);
+		vec->entries[i].haspos = (npos > 0) ? 1 : 0;
+		vec->entries[i].len = lex_len;
+		vec->entries[i].pos = datalen;

-		if (i > 0 && WordEntryCMP(weptr, weptr - 1, STRPTR(vec)) <= 0)
+		memcpy(STRPTR(vec) + datalen, lexeme, lex_len);
+
+		datalen += lex_len;
+
+		if (i > 0 && WordEntryCMP(&vec->entries[i], &vec->entries[i - 1], STRPTR(vec)) <= 0)
 			elog(ERROR, "lexemes are unordered");

-		if (weptr->haspos)
+		/* Receive positions */
+
+		if (npos > 0)
 		{
-			uint16		j,
-						npos;
+			uint16		j;
 			WordEntryPos *wepptr;

-			npos = (uint16) pq_getmsgint(buf, sizeof(uint16));
-			if (npos > MAXNUMPOS)
-				elog(ERROR, "unexpected number of positions");
-
-			while (CALCDATASIZE(size, datalen + (npos + 1) * sizeof(WordEntryPos)) >= len)
+			/*
+			 * Pad to 2-byte alignment if necessary. Though we used palloc0
+			 * for the initial allocation, subsequent repalloc'd memory
+			 * areas are not initialized to zero.
+			 */
+			if (datalen != SHORTALIGN(datalen))
 			{
-				len *= 2;
-				vec = (TSVector) repalloc(vec, len);
-				weptr = ARRPTR(vec) + i;
+				*(STRPTR(vec) + datalen) = '\0';
+				datalen = SHORTALIGN(datalen);
 			}

-			memcpy(_POSDATAPTR(vec, weptr), &npos, sizeof(int16));
-			wepptr = POSDATAPTR(vec, weptr);
+			memcpy(STRPTR(vec) + datalen, &npos, sizeof(uint16));
+
+			wepptr = POSDATAPTR(vec, &vec->entries[i]);
 			for (j = 0; j < npos; j++)
 			{
-				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(int16));
+				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
 				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
 					elog(ERROR, "position information is unordered");
 			}
@ -480,7 +530,7 @@ tsvectorrecv(PG_FUNCTION_ARGS)
 		}
 	}

-	SET_VARSIZE(vec, CALCDATASIZE(vec->size, datalen));
+	SET_VARSIZE(vec, hdrlen + datalen);

 	PG_RETURN_TSVECTOR(vec);
 }
--- a/src/backend/utils/adt/tsvector_op.c
+++ b/src/backend/utils/adt/tsvector_op.c
@ -7,7 +7,7 @@
 *
 *
 * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.3 2007/09/07 15:09:56 teodor Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_op.c,v 1.4 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -165,7 +165,7 @@ tsvector_strip(PG_FUNCTION_ARGS)
 	char	   *cur;

 	for (i = 0; i < in->size; i++)
-		len += SHORTALIGN(arrin[i].len);
+		len += arrin[i].len;

 	len = CALCDATASIZE(in->size, len);
 	out = (TSVector) palloc0(len);
@ -179,7 +179,7 @@ tsvector_strip(PG_FUNCTION_ARGS)
 		arrout[i].haspos = 0;
 		arrout[i].len = arrin[i].len;
 		arrout[i].pos = cur - STRPTR(out);
-		cur += SHORTALIGN(arrout[i].len);
+		cur += arrout[i].len;
 	}

 	PG_FREE_IF_COPY(in, 0);
@ -351,12 +351,15 @@ tsvector_concat(PG_FUNCTION_ARGS)
 			ptr->len = ptr1->len;
 			memcpy(cur, data1 + ptr1->pos, ptr1->len);
 			ptr->pos = cur - data;
-			cur += SHORTALIGN(ptr1->len);
 			if (ptr->haspos)
 			{
+				cur += SHORTALIGN(ptr1->len);
 				memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
 				cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
 			}
+			else
+				cur += ptr1->len;
+				
 			ptr++;
 			ptr1++;
 			i1--;
@ -367,16 +370,20 @@ tsvector_concat(PG_FUNCTION_ARGS)
 			ptr->len = ptr2->len;
 			memcpy(cur, data2 + ptr2->pos, ptr2->len);
 			ptr->pos = cur - data;
-			cur += SHORTALIGN(ptr2->len);
 			if (ptr->haspos)
 			{
 				int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);

+				cur += SHORTALIGN(ptr2->len);
+
 				if (addlen == 0)
 					ptr->haspos = 0;
 				else
 					cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
 			}
+			else
+				cur += ptr2->len;
+
 			ptr++;
 			ptr2++;
 			i2--;
@ -387,9 +394,9 @@ tsvector_concat(PG_FUNCTION_ARGS)
 			ptr->len = ptr1->len;
 			memcpy(cur, data1 + ptr1->pos, ptr1->len);
 			ptr->pos = cur - data;
-			cur += SHORTALIGN(ptr1->len);
 			if (ptr->haspos)
 			{
+				cur += SHORTALIGN(ptr1->len);
 				if (ptr1->haspos)
 				{
 					memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
@ -407,6 +414,9 @@ tsvector_concat(PG_FUNCTION_ARGS)
 						cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
 				}
 			}
+			else
+				cur += ptr1->len;
+
 			ptr++;
 			ptr1++;
 			ptr2++;
@ -421,12 +431,15 @@ tsvector_concat(PG_FUNCTION_ARGS)
 		ptr->len = ptr1->len;
 		memcpy(cur, data1 + ptr1->pos, ptr1->len);
 		ptr->pos = cur - data;
-		cur += SHORTALIGN(ptr1->len);
 		if (ptr->haspos)
 		{
+			cur += SHORTALIGN(ptr1->len);
 			memcpy(cur, _POSDATAPTR(in1, ptr1), POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16));
 			cur += POSDATALEN(in1, ptr1) * sizeof(WordEntryPos) + sizeof(uint16);
 		}
+		else
+			cur += ptr1->len;
+
 		ptr++;
 		ptr1++;
 		i1--;
@ -438,16 +451,20 @@ tsvector_concat(PG_FUNCTION_ARGS)
 		ptr->len = ptr2->len;
 		memcpy(cur, data2 + ptr2->pos, ptr2->len);
 		ptr->pos = cur - data;
-		cur += SHORTALIGN(ptr2->len);
 		if (ptr->haspos)
 		{
 			int			addlen = add_pos(in2, ptr2, out, ptr, maxpos);

+			cur += SHORTALIGN(ptr2->len);
+
 			if (addlen == 0)
 				ptr->haspos = 0;
 			else
 				cur += addlen * sizeof(WordEntryPos) + sizeof(uint16);
 		}
+		else
+			cur += ptr2->len;
+
 		ptr++;
 		ptr2++;
 		i2--;
@ -484,8 +501,8 @@ ValCompare(CHKVAL * chkval, WordEntry * ptr, QueryOperand * item)
 static bool
 checkclass_str(CHKVAL * chkval, WordEntry * val, QueryOperand * item)
 {
-	WordEntryPos *ptr = (WordEntryPos *) (chkval->values + val->pos + SHORTALIGN(val->len) + sizeof(uint16));
-	uint16		len = *((uint16 *) (chkval->values + val->pos + SHORTALIGN(val->len)));
+	WordEntryPos *ptr = (WordEntryPos *) (chkval->values + SHORTALIGN(val->pos + val->len) + sizeof(uint16));
+	uint16		len = *((uint16 *) (chkval->values + SHORTALIGN(val->pos + val->len)));

 	while (len--)
 	{
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@ -5,7 +5,7 @@
 *
 * Copyright (c) 1998-2007, PostgreSQL Global Development Group
 *
- * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.3 2007/09/07 15:35:11 teodor Exp $
+ * $PostgreSQL: pgsql/src/include/tsearch/ts_type.h,v 1.4 2007/09/07 16:03:40 teodor Exp $
 *
 *-------------------------------------------------------------------------
 */
@ -62,26 +62,33 @@ typedef uint16 WordEntryPos;
 *							bytes from end of WordEntry array to start of
 *							corresponding lexeme.
 * 4) Lexeme's storage:
- *	  SHORTALIGNED(lexeme) and position information if it exists
- *	  Position information: first int2 - is a number of positions and it
- *	  follows array of WordEntryPos
+ *	  lexeme (without null-terminator)
+ *    if haspos is true:
+ *		padding byte if necessary to make the number of positions 2-byte aligned
+ *		uint16		number of positions that follow.
+ *		uint16[]	positions
+ *
+ * The positions must be sorted.
 */

 typedef struct
 {
 	int32		vl_len_;		/* varlena header (do not touch directly!) */
-	uint32		size;
-	char		data[1];
+	int32		size;
+	WordEntry	entries[1]; /* var size */
+	/* lexemes follow */
 } TSVectorData;

 typedef TSVectorData *TSVector;

-#define DATAHDRSIZE (VARHDRSZ + sizeof(int4))
-#define CALCDATASIZE(x, lenstr) ( (x) * sizeof(WordEntry) + DATAHDRSIZE + (lenstr) )
-#define ARRPTR(x)	( (WordEntry*) ( (char*)(x) + DATAHDRSIZE ) )
-#define STRPTR(x)	( (char*)(x) + DATAHDRSIZE + ( sizeof(WordEntry) * ((TSVector)(x))->size ) )
-#define STRSIZE(x)	( ((TSVector)(x))->len - DATAHDRSIZE - ( sizeof(WordEntry) * ((TSVector)(x))->size ) )
-#define _POSDATAPTR(x,e)	(STRPTR(x)+((WordEntry*)(e))->pos+SHORTALIGN(((WordEntry*)(e))->len))
+#define DATAHDRSIZE (offsetof(TSVectorData, entries))
+#define CALCDATASIZE(x, lenstr) (DATAHDRSIZE + (x) * sizeof(WordEntry) + (lenstr) )
+#define ARRPTR(x)	( (x)->entries )
+
+/* returns a pointer to the beginning of lexemes */
+#define STRPTR(x)	( (char *) &(x)->entries[x->size] )
+
+#define _POSDATAPTR(x,e)	(STRPTR(x) + SHORTALIGN((e)->pos + (e)->len))
 #define POSDATALEN(x,e) ( ( ((WordEntry*)(e))->haspos ) ? (*(uint16*)_POSDATAPTR(x,e)) : 0 )
 #define POSDATAPTR(x,e) ( (WordEntryPos*)( _POSDATAPTR(x,e)+sizeof(uint16) ) )

@ -159,7 +166,7 @@ typedef int8 QueryItemType;
 typedef struct
 {
 	QueryItemType		type;	/* operand or kind of operator (ts_tokentype) */
-	int8		weight;			/* weights of operand to search. It's a bitmask of allowed weights.
+	uint8		weight;			/* weights of operand to search. It's a bitmask of allowed weights.
 								 * if it =0 then any weight are allowed.
 								 * Weights and bit map:
 								 * A: 1<<3