The new LZ compression and an lztext data type based on it.

Jan
2025-01-06 15:24:56 +08:00 · 1999-11-17 21:21:51 +00:00 · 1999-11-17 21:21:51 +00:00 · 79c3b71c1b
commit 79c3b71c1b
parent ddc335290c
8 changed files with 1123 additions and 5 deletions
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@ -4,7 +4,7 @@
 #    Makefile for utils/adt
 #
 # IDENTIFICATION
-#    $Header: /cvsroot/pgsql/src/backend/utils/adt/Makefile,v 1.26 1999/09/30 14:54:22 wieck Exp $
+#    $Header: /cvsroot/pgsql/src/backend/utils/adt/Makefile,v 1.27 1999/11/17 21:21:50 wieck Exp $
 #
 #-------------------------------------------------------------------------

@ -35,7 +35,7 @@ OBJS = acl.o arrayfuncs.o arrayutils.o bool.o cash.o char.o chunk.o \
 	regexp.o regproc.o ruleutils.o selfuncs.o sets.o \
 	tid.o timestamp.o varchar.o varlena.o version.o \
 	network.o mac.o inet_net_ntop.o inet_net_pton.o \
-	ri_triggers.o
+	ri_triggers.o pg_lzcompress.o lztext.o

 all: SUBSYS.o

--- a/src/backend/utils/adt/lztext.c
+++ b/src/backend/utils/adt/lztext.c
@ -0,0 +1,266 @@
+/* ----------
+ * lztext.c -
+ *
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/Attic/lztext.c,v 1.1 1999/11/17 21:21:50 wieck Exp $
+ *
+ *	Text type with internal LZ compressed representation. Uses the
+ *	standard PostgreSQL compression method.
+ * ----------
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+
+#include "postgres.h"
+#include "utils/builtins.h"
+#include "utils/palloc.h"
+#include "utils/pg_lzcompress.h"
+
+
+/* ----------
+ * lztextin -
+ *
+ *		Input function for data type lztext
+ * ----------
+ */
+lztext *
+lztextin(char *str)
+{
+	lztext		   *result;
+	int32			rawsize;
+	lztext		   *tmp;
+	int				tmp_size;
+
+	/* ----------
+	 * A NULL input string yields a NULL result
+	 * ----------
+	 */
+	if (str == NULL)
+		return NULL;
+
+	/* ----------
+	 * Determine the input size and the worst-case output buffer size
+	 * ----------
+	 */
+	rawsize = strlen(str);
+	tmp_size = PGLZ_MAX_OUTPUT(rawsize);
+
+	/* ----------
+	 * Allocate a temporary result and compress into it (default strategy)
+	 * ----------
+	 */
+	tmp = (lztext *) palloc(tmp_size);
+	pglz_compress(str, rawsize, tmp, NULL);
+
+	/* ----------
+	 * If less than 256 bytes or less than a quarter of the temporary
+	 * buffer would be wasted, return it as is and save a memcpy().
+	 * ----------
+	 */
+	if (tmp_size - tmp->varsize < 256 || 
+					tmp_size - tmp->varsize < tmp_size / 4)
+	{
+		result = tmp;
+	} else {
+		result = (lztext *) palloc(tmp->varsize);
+		memcpy(result, tmp, tmp->varsize);
+		pfree(tmp);
+	}
+
+	return result;
+}
+
+
+/* ----------
+ * lztextout -
+ *
+ *		Output function for data type lztext
+ * ----------
+ */
+char *
+lztextout(lztext *lz)
+{
+	char			*result;
+
+	/* ----------
+	 * NULL is printed as the string "-"
+	 * ----------
+	 */
+	if (lz == NULL)
+	{
+		result = (char *) palloc(2);
+		result[0] = '-';
+		result[1] = '\0';
+		return result;
+	}
+
+	/* ----------
+	 * Allocate the result string - the required size is remembered
+	 * in the lztext header, so we neither need a temporary buffer nor
+	 * have to fiddle with reallocs.
+	 * ----------
+	 */
+	result = (char *) palloc(PGLZ_RAW_SIZE(lz) + 1);
+
+	/* ----------
+	 * Decompress and add the terminating zero byte
+	 * ----------
+	 */
+	pglz_decompress(lz, result);
+	result[lz->rawsize] = '\0';
+
+	/* ----------
+	 * Return the result
+	 * ----------
+	 */
+	return result;
+}
+
+
+/* ----------
+ * lztextlen -
+ *
+ *	Logical length of an lztext field (the uncompressed size
+ *	of the original data).
+ * ----------
+ */
+int32
+lztextlen(lztext *lz)
+{
+	/* ----------
+	 * NULL has length 0
+	 * ----------
+	 */
+	if (lz == NULL)
+		return 0;
+
+	/* ----------
+	 * Without multibyte support, it's the remembered rawsize
+	 * ----------
+	 */
+	return lz->rawsize;
+}
+
+
+/* ----------
+ * lztextoctetlen -
+ *
+ *	Physical length of an lztext field (the size of the stored
+ *	data plus the rawsize field).
+ * ----------
+ */
+int32
+lztextoctetlen(lztext *lz)
+{
+	/* ----------
+	 * NULL has length 0
+	 * ----------
+	 */
+	if (lz == NULL)
+		return 0;
+
+	/* ----------
+	 * Return the varsize minus the size of the varsize field itself.
+	 * ----------
+	 */
+	return lz->varsize - sizeof(int32);
+}
+
+
+/* ----------
+ * text_lztext -
+ *
+ *	Convert text to lztext
+ * ----------
+ */
+lztext *
+text_lztext(text *txt)
+{
+	lztext		   *result;
+	int32			rawsize;
+	lztext		   *tmp;
+	int				tmp_size;
+	char		   *str;
+
+	/* ----------
+	 * A NULL input yields a NULL result
+	 * ----------
+	 */
+	if (txt == NULL)
+		return NULL;
+
+	/* ----------
+	 * Determine the input size and the worst-case output buffer size
+	 * ----------
+	 */
+	rawsize  = VARSIZE(txt) - VARHDRSZ;
+	str      = VARDATA(txt);
+	tmp_size = PGLZ_MAX_OUTPUT(rawsize);
+
+	/* ----------
+	 * Allocate a temporary result and compress into it (default strategy)
+	 * ----------
+	 */
+	tmp = (lztext *) palloc(tmp_size);
+	pglz_compress(str, rawsize, tmp, NULL);
+
+	/* ----------
+	 * If less than 256 bytes or less than a quarter of the temporary
+	 * buffer would be wasted, return it as is and save a memcpy().
+	 * ----------
+	 */
+	if (tmp_size - tmp->varsize < 256 || 
+					tmp_size - tmp->varsize < tmp_size / 4)
+	{
+		result = tmp;
+	} else {
+		result = (lztext *) palloc(tmp->varsize);
+		memcpy(result, tmp, tmp->varsize);
+		pfree(tmp);
+	}
+
+	return result;
+
+	
+}
+
+
+/* ----------
+ * lztext_text -
+ *
+ *	Convert lztext to text
+ * ----------
+ */
+text *
+lztext_text(lztext *lz)
+{
+	text	   *result;
+
+	/* ----------
+	 * A NULL input yields a NULL result
+	 * ----------
+	 */
+	if (lz == NULL)
+		return NULL;
+
+	/* ----------
+	 * Allocate the text result with one spare byte behind the data
+	 * ----------
+	 */
+	result = (text *) palloc(lz->rawsize + VARHDRSZ + 1);
+	VARSIZE(result) = lz->rawsize + VARHDRSZ;
+
+	/* ----------
+	 * Decompress directly into the text data area, zero the spare byte.
+	 * ----------
+	 */
+	pglz_decompress(lz, VARDATA(result));
+	VARDATA(result)[lz->rawsize] = 0;
+
+	return result;
+}
+
+
--- a/src/backend/utils/adt/pg_lzcompress.c
+++ b/src/backend/utils/adt/pg_lzcompress.c
@ -0,0 +1,669 @@
+/* ----------
+ * pg_lzcompress.c -
+ *
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.1 1999/11/17 21:21:50 wieck Exp $
+ *
+ *		This is an implementation of LZ compression for PostgreSQL.
+ *		It uses a simple history table and generates 2-3 byte tags
+ *		capable of backward copy information for 3-273 bytes with
+ *		an offset of max. 4095.
+ *
+ *		Entry routines:
+ *
+ *			int
+ *			pglz_compress(char *source, int slen, PGLZ_Header *dest,
+ *										PGLZ_Strategy *strategy);
+ *
+ *				source is the input data to be compressed.
+ *
+ *				slen is the length of the input data.
+ *
+ *				dest is the output area for the compressed result.
+ *					It must be big enough to hold the worst case of
+ *					compression failure and can be computed by the
+ *					macro PGLZ_MAX_OUTPUT(slen). Don't be surprised,
+ *					it is larger than the input data size.
+ *
+ *				strategy is a pointer to some information controlling
+ *					the compression algorithm. If NULL, the compiled
+ *					in default strategy is used.
+ *
+ *				The return value is the size of the result in dest (with header).
+ *
+ *			int
+ *			pglz_decompress(PGLZ_Header *source, char *dest)
+ *
+ *				source is the compressed input.
+ *
+ *				dest is the area where the uncompressed data will be
+ *					written to. It is the callers responsibility to
+ *					provide enough space. The required amount can be
+ *					obtained with the macro PGLZ_RAW_SIZE(source).
+ *
+ *					The data is written to dest exactly as it was handed
+ *					to pglz_compress(). No terminating zero byte is added.
+ *
+ *				The return value is the number of bytes written to dest.
+ *					Obviously the same as PGLZ_RAW_SIZE() returns.
+ *
+ *		The compression algorithm and internal data format:
+ *
+ *			PGLZ_Header is defined as
+ *
+ *				typedef struct PGLZ_Header {
+ *					int32		varsize;
+ *					int32		rawsize;
+ *				} PGLZ_Header;
+ *
+ *			The header is followed by the compressed data itself.
+ *
+ *			The algorithm is easiest explained by describing the process
+ *			of decompression.
+ *
+ *			If varsize == rawsize + sizeof(PGLZ_Header), then the data
+ *			is stored uncompressed as plain bytes. Thus, the decompressor
+ *			simply copies rawsize bytes from the location after the
+ *			header to the destination.
+ *
+ *			Otherwise the first byte after the header tells what to do
+ *			the next 8 times. We call this the control byte.
+ *
+ *			An unset bit in the control byte means, that one uncompressed
+ *			byte follows, which is copied from input to output.
+ *
+ *			A set bit in the control byte means, that a tag of 2-3 bytes
+ *			follows. A tag contains information to copy some bytes, that
+ *			are already in the output buffer, to the current location in
+ *			the output. Let's call the three tag bytes T1, T2 and T3. The
+ *			position of the data to copy is coded as an offset from the
+ *			actual output position.
+ *
+ *			The offset is in the upper nibble of T1 and in T2.
+ *			The length is in the lower nibble of T1.
+ *
+ *			So the 16 bits of a 2 byte tag are coded as
+ *
+ *              7---T1--0  7---T2--0
+ *				OOOO LLLL  OOOO OOOO
+ *
+ *			This limits the offset to 1-4095 (12 bits) and the length
+ *			to 3-18 (4 bits) because 3 is always added to it. To emit
+ *			a tag of 2 bytes with a length of 2 only saves one control
+ *			bit. But we lose one byte in the possible length of a tag.
+ *
+ *			In the actual implementation, the 2 byte tag's length is
+ *			limited to 3-17, because the value 0xF in the length nibble
+ *			has special meaning. It means, that the next following
+ *			byte (T3) has to be added to the length value of 18. That
+ *			makes total limits of 1-4095 for offset and 3-273 for length.
+ *
+ *			Now that we have successfully decoded a tag, we simply copy
+ *			the output that occurred <offset> bytes back to the current
+ *			output location in the specified <length>. Thus, a
+ *			sequence of 200 spaces (think about bpchar fields) could be
+ *			coded in 4 bytes. One literal space and a three byte tag to
+ *			copy 199 bytes with a -1 offset. Wow - that's a compression
+ *			rate of 98%! Well, the implementation needs to save the
+ *			original data size too, so we need another 4 bytes for it
+ *			and end up with a total compression rate of 96%, which is
+ *			still worth a wow.
+ *
+ *		Acknowledgements:
+ *
+ *			Many thanks to Adisak Pochanayon, whose article about SLZ
+ *			inspired me to write the PostgreSQL compression this way.
+ *
+ *			Jan Wieck
+ * ----------
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+
+#include "postgres.h"
+#include "utils/palloc.h"
+#include "utils/pg_lzcompress.h"
+
+
+/* ----------
+ * Local definitions (history table sizing and maximum match length)
+ * ----------
+ */
+#define PGLZ_HISTORY_SIZE		8192
+#define PGLZ_HISTORY_MASK		0x1fff
+#define PGLZ_HISTORY_PREALLOC	8192
+#define PGLZ_MAX_MATCH			273
+
+
+/* ----------
+ * PGLZ_HistEntry -
+ *
+ *		Linked list entry for the backward history lookup
+ * ----------
+ */
+typedef struct PGLZ_HistEntry {
+	struct PGLZ_HistEntry	   *next;
+	char					   *pos;
+} PGLZ_HistEntry;
+
+
+/* ----------
+ * The provided standard strategies
+ * ----------
+ */
+static PGLZ_Strategy strategy_default_data = {
+	256,	/* Data chunks smaller than 256 bytes are not compressed		*/
+	6144,	/* Data chunks of 6K or more force compression					*/
+			/* unless the result is not smaller than the input				*/
+	20,		/* Compression rates below 20% mean fallback to uncompressed	*/
+			/* storage unless compression is forced by previous parameter	*/
+	128,	/* Stop history lookup if a match of 128 bytes is found			*/
+	10		/* Lower good match size by 10% at every lookup loop iteration.	*/
+};
+PGLZ_Strategy	*PGLZ_strategy_default = &strategy_default_data;
+
+
+static PGLZ_Strategy strategy_allways_data = {
+	0,		/* Chunks of any size are compressed							*/
+	0,		/* Compression is forced for every input size					*/
+	0,		/* We want to save at least one single byte						*/
+	128,	/* Stop history lookup if a match of 128 bytes is found			*/
+	6		/* Look harder for a good match.								*/
+};
+PGLZ_Strategy	*PGLZ_strategy_allways = &strategy_allways_data;
+
+
+static PGLZ_Strategy strategy_never_data = {
+	0,		/* Not looked at												*/
+	0,		/* Not looked at												*/
+	0,		/* Not looked at												*/
+	0,		/* Zero indicates "store uncompressed always"					*/
+	0		/* Not looked at												*/
+};
+PGLZ_Strategy	*PGLZ_strategy_never = &strategy_never_data;
+
+
+
+/* ----------
+ * pglz_hist_idx -
+ *
+ *		Computes the history table slot from the next 4 characters of
+ *		the input; slot 0 is used if fewer than 4 characters remain.
+ * ----------
+ */
+#if 1
+#define pglz_hist_idx(_s,_e) (												\
+			(((_e) - (_s)) < 4) ? 0 :										\
+			((((_s)[0] << 9) ^ ((_s)[1] << 6) ^ 							\
+			((_s)[2] << 3) ^ (_s)[3]) & (PGLZ_HISTORY_MASK))				\
+		)
+#else
+#define pglz_hist_idx(_s,_e) (												\
+			(((_e) - (_s)) < 2) ? 0 :										\
+			((((_s)[0] << 8) ^ (_s)[1]) & (PGLZ_HISTORY_MASK))				\
+		)
+#endif
+
+
+/* ----------
+ * pglz_hist_add -
+ *
+ *		Adds a new entry for position _s at the head of its slot's list.
+ * ----------
+ */
+#define pglz_hist_add(_hs,_hn,_s,_e) {										\
+			int __hindex = pglz_hist_idx((_s),(_e));						\
+			(_hn)->next = (_hs)[__hindex];									\
+			(_hn)->pos  = (_s);												\
+			(_hs)[__hindex] = (_hn)++;										\
+		}
+
+
+/* ----------
+ * pglz_out_ctrl -
+ *
+ *		Writes out the last control byte and allocates a new one if needed.
+ * ----------
+ */
+#define pglz_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) {						\
+	if ((__ctrl & 0xff) == 0)												\
+	{																		\
+		*__ctrlp = __ctrlb;													\
+		__ctrlp = __buf++;													\
+		__ctrlb = 0;														\
+		__ctrl = 1;															\
+	}																		\
+}
+
+
+/* ----------
+ * pglz_out_literal -
+ *
+ *		Outputs a literal byte to the destination buffer; its control
+ *		bit in the current control byte is left unset.
+ * ----------
+ */
+#define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) {					\
+	pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf);								\
+	*_buf++ = (unsigned char)(_byte);										\
+	_ctrl <<= 1;															\
+}
+
+
+/* ----------
+ * pglz_out_tag -
+ *
+ *		Outputs a backward reference tag including the appropriate
+ *		control bit to the destination buffer. The tag takes 2 bytes
+ *		for a length of up to 17 and 3 bytes for a longer match.
+ * ----------
+ */
+#define pglz_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off) {					\
+	pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf);								\
+	_ctrlb |= _ctrl;														\
+	_ctrl <<= 1;															\
+	if ((_len) > 17)														\
+	{																		\
+		_buf[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f);			\
+		_buf[1] = (unsigned char)((_off) & 0xff);							\
+		_buf[2] = (unsigned char)((_len) - 18);								\
+		_buf += 3;															\
+	} else {																\
+		_buf[0] = (unsigned char)((((_off) & 0xf00) >> 4) | ((_len) - 3));	\
+		_buf[1] = (unsigned char)((_off) & 0xff);							\
+		_buf += 2;															\
+	}																		\
+}
+
+
+/* ----------
+ * pglz_find_match -
+ *
+ *		Looks up the history table for an earlier sequence in the input
+ *		buffer that matches the data at input. Returns 1 and sets *lenp
+ *		and *offp if a match of at least 3 bytes was found, otherwise 0.
+ * ----------
+ */
+static inline int
+pglz_find_match (PGLZ_HistEntry **hstart, char *input, char *end, 
+						int *lenp, int *offp, int good_match, int good_drop)
+{
+	PGLZ_HistEntry	   *hent;
+	int32				len = 0;
+	int32				off = 0;
+	int32				thislen;
+	int32				thisoff;
+	char			   *ip;
+	char			   *hp;
+
+	/* ----------
+	 * Traverse the linked history list until a good enough
+	 * match is found.
+	 * ----------
+	 */
+	hent = hstart[pglz_hist_idx(input, end)];
+	while (hent && len < good_match)
+	{
+		/* ----------
+		 * Be happy with lesser good matches the more entries we visited.
+		 * ----------
+		 */
+		good_match -= (good_match * good_drop) /100;
+
+		/* ----------
+		 * Stop if the offset does not fit into our tag anymore.
+		 * ----------
+		 */
+		thisoff = (ip = input) - (hp = hent->pos);
+		if (thisoff >= 0x0fff)
+			break;
+
+		/* ----------
+		 * Determine length of match. A better match must be larger than
+		 * the best so far. And if we already have a match of 16 or more
+		 * bytes, it's worth the call overhead to use memcmp() to check
+		 * if this match is equal for the same size. After that we must
+		 * fall back to character by character comparison to know the
+		 * exact position where the difference occurred.
+		 * ----------
+		 */
+		if (len >= 16)
+		{
+			if (memcmp(ip, hp, len) != 0)
+			{
+				hent = hent->next;
+				continue;
+			}
+			thislen = len;
+			ip += len;
+			hp += len;
+		} else {
+			thislen = 0;
+		}
+		while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH)
+		{
+			thislen++;
+			ip++;
+			hp++;
+		}
+
+		/* ----------
+		 * Remember this match as the best (if it is)
+		 * ----------
+		 */
+		if (thislen > len)
+		{
+			len = thislen;
+			off = thisoff;
+		}
+
+		/* ----------
+		 * Advance to the next history entry
+		 * ----------
+		 */
+		hent = hent->next;
+	}
+
+	/* ----------
+	 * Return match information only if it results in a reduction of
+	 * at least one byte.
+	 * ----------
+	 */
+	if (len > 2)
+	{
+		*lenp = len;
+		*offp = off;
+		return 1;
+	}
+
+	return 0;
+}
+
+
+/* ----------
+ * pglz_compress - store slen bytes at source into dest, compressed if useful
+ * ----------
+ */
+int
+pglz_compress (char *source, int slen, PGLZ_Header *dest, PGLZ_Strategy *strategy)
+{
+	PGLZ_HistEntry	   *hist_start[PGLZ_HISTORY_SIZE];
+	PGLZ_HistEntry	   *hist_alloc;
+	PGLZ_HistEntry	   hist_prealloc[PGLZ_HISTORY_PREALLOC];
+	PGLZ_HistEntry	   *hist_next;
+
+	unsigned char	   *bp = ((unsigned char *)dest) + sizeof(PGLZ_Header);
+	unsigned char	   *bstart = bp;
+	char			   *dp = source;
+	char			   *dend = source + slen;
+	unsigned char		ctrl_dummy = 0;
+	unsigned char	   *ctrlp = &ctrl_dummy;
+	unsigned char		ctrlb = 0;
+	unsigned char		ctrl = 0;
+	int32				match_len;
+	int32				match_off;
+	int32				good_match;
+	int32				good_drop;
+	int32				do_compress = 1;
+	int32				result_size = -1;
+	int32				result_max;
+	int32				need_rate;
+
+	/* ----------
+	 * Our fallback strategy is the default.
+	 * ----------
+	 */
+	if (strategy == NULL)
+		strategy = PGLZ_strategy_default;
+
+	/* ----------
+	 * Save the original source size in the header.
+	 * ----------
+	 */
+	dest->rawsize = slen;
+
+	/* ----------
+	 * If the strategy forbids compression (at all or if source chunk too
+	 * small), copy input to output without compression.
+	 * ----------
+	 */
+	if (strategy->match_size_good == 0)
+	{
+		memcpy(bstart, source, slen);
+		return (dest->varsize = slen + sizeof(PGLZ_Header));
+	} else {
+		if (slen < strategy->min_input_size)
+		{
+			memcpy(bstart, source, slen);
+			return (dest->varsize = slen + sizeof(PGLZ_Header));
+		}
+	}
+
+	/* ----------
+	 * Clamp good_match to 17..PGLZ_MAX_MATCH and good_drop to 0..100
+	 * ----------
+	 */
+	if ((good_match = strategy->match_size_good) > PGLZ_MAX_MATCH)
+		good_match = PGLZ_MAX_MATCH;
+	if (good_match < 17)
+		good_match = 17;
+
+	if ((good_drop = strategy->match_size_drop) < 0)
+		good_drop = 0;
+	if (good_drop > 100)
+		good_drop = 100;
+
+	/* ----------
+	 * Initialize the history tables. For inputs smaller than
+	 * PGLZ_HISTORY_PREALLOC, we already have a big enough history
+	 * table on the stack frame.
+	 * ----------
+	 */
+	memset((void *)hist_start, 0, sizeof(hist_start));
+	if (slen + 1 <= PGLZ_HISTORY_PREALLOC)
+		hist_alloc = hist_prealloc;
+	else
+		hist_alloc = (PGLZ_HistEntry *)
+							palloc(sizeof(PGLZ_HistEntry) * (slen + 1));
+	hist_next = hist_alloc;
+
+	/* ----------
+	 * Compute the maximum result size allowed by the strategy.
+	 * If the input size reaches force_input_size, the max result size
+	 * is the input size itself.
+	 * Otherwise, it is the input size minus the minimum wanted
+	 * compression rate.
+	 * ----------
+	 */
+	if (slen >= strategy->force_input_size)
+	{
+		result_max = slen;
+	} else {
+		need_rate = strategy->min_comp_rate;
+		if (need_rate < 0)
+			need_rate = 0;
+		else if (need_rate > 99)
+			need_rate = 99;
+		result_max = slen - ((slen * need_rate) / 100);
+	}
+
+	/* ----------
+	 * Compress the source directly into the output buffer.
+	 * ----------
+	 */
+	while (dp < dend)
+	{
+		/* ----------
+		 * If we already reached the maximum result size, clear the
+		 * compression flag and stop. This is checked once per iteration.
+		 * ----------
+		 */
+		if (bp - bstart >= result_max)
+		{
+			do_compress = 0;
+			break;
+		}
+
+		/* ----------
+		 * Try to find a match in the history
+		 * ----------
+		 */
+		if (pglz_find_match(hist_start, dp, dend, &match_len, 
+										&match_off, good_match, good_drop))
+		{
+			/* ----------
+			 * Create the tag and add history entries for
+			 * all matched characters.
+			 * ----------
+			 */
+			pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off);
+			while(match_len--)
+			{
+				pglz_hist_add(hist_start, hist_next, dp, dend);
+				dp++;	/* Do not do this ++ in the line above!		*/
+						/* The macro would do it four times - Jan.	*/
+			}
+		} else {
+			/* ----------
+			 * No match found. Copy one literal byte.
+			 * ----------
+			 */
+			pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
+			pglz_hist_add(hist_start, hist_next, dp, dend);
+			dp++;	/* Do not do this ++ in the line above!		*/
+					/* The macro would do it four times - Jan.	*/
+		}
+	}
+
+	/* ----------
+	 * Get rid of the history (if allocated)
+	 * ----------
+	 */
+	if (hist_alloc != hist_prealloc)
+		pfree((void *)hist_alloc);
+
+	/* ----------
+	 * If we are still in compressing mode, write out the last
+	 * control byte and determine if the compression gained the
+	 * rate requested by the strategy.
+	 * ----------
+	 */
+	if (do_compress)
+	{
+		*ctrlp = ctrlb;
+
+		result_size = bp - bstart;
+		if (result_size >= result_max) {
+			do_compress = 0;
+		}
+	}
+
+	/* ----------
+	 * Done - if we successfully compressed and matched the
+	 * strategy's constraints, return the compressed result.
+	 * Otherwise copy the original source over it and return
+	 * the original length plus the header size.
+	 * ----------
+	 */
+	if (do_compress)
+	{
+		return (dest->varsize = result_size + sizeof(PGLZ_Header));
+	} else {
+		memcpy(((char *)dest) + sizeof(PGLZ_Header), source, slen);
+		return (dest->varsize = slen + sizeof(PGLZ_Header));
+	}
+}
+
+
+/* ----------
+ * pglz_decompress - expand source into dest, return the bytes written
+ * ----------
+ */
+int
+pglz_decompress (PGLZ_Header *source, char *dest)
+{
+	unsigned char	   *dp;
+	unsigned char	   *dend;
+	unsigned char	   *bp;
+	unsigned char		ctrl;
+	int32				ctrlc;
+	int32				len;
+	int32				off;
+
+	dp		= ((unsigned char *)source) + sizeof(PGLZ_Header);
+	dend	= ((unsigned char *)source) + source->varsize;
+	bp		= (unsigned char *)dest;
+
+	if (source->varsize == source->rawsize + sizeof(PGLZ_Header))
+	{
+		memcpy(dest, dp, source->rawsize);
+		return source->rawsize;
+	}
+
+	while (dp < dend)
+	{
+		/* ----------
+		 * Read one control byte and process the next 8 items.
+		 * ----------
+		 */
+		ctrl = *dp++;
+		for (ctrlc = 0; ctrlc < 8 && dp < dend; ctrlc++)
+		{
+			if (ctrl & 1)
+			{
+				/* ----------
+				 * A set control bit means TAG. Its first byte holds
+				 * the match length minus 3 and the upper 4 bits of the
+				 * offset, the next byte the lower 8 bits of the offset.
+				 * If the length is coded as 18, another extension tag
+				 * byte tells how much longer the match really was (0-255).
+				 * ----------
+				 */
+				len = (dp[0] & 0x0f) + 3;
+				off = ((dp[0] & 0xf0) << 4) | dp[1];
+				dp += 2;
+				if (len == 18)
+				{
+					len += *dp++;
+				}
+
+				/* ----------
+				 * Now we copy the bytes specified by the tag from
+				 * OUTPUT to OUTPUT. It is dangerous and platform
+				 * dependent to use memcpy() here, because the copied
+				 * areas could overlap extremely!
+				 * ----------
+				 */
+				while (len--)
+				{
+					*bp = bp[-off];
+					bp++;
+				}
+			} else {
+				/* ----------
+				 * An unset control bit means LITERAL BYTE. So we
+				 * just copy one from INPUT to OUTPUT.
+				 * ----------
+				 */
+				*bp++ = *dp++;
+			}
+
+			/* ----------
+			 * Advance the control bit
+			 * ----------
+			 */
+			ctrl >>= 1;
+		}
+	}
+
+	/* ----------
+	 * That's it.
+	 * ----------
+	 */
+	return (char *)bp - dest;
+}
+
+
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@ -6,7 +6,7 @@
 *
 * Copyright (c) 1994, Regents of the University of California
 *
- * $Id: pg_proc.h,v 1.105 1999/10/11 06:28:28 inoue Exp $
+ * $Id: pg_proc.h,v 1.106 1999/11/17 21:21:50 wieck Exp $
 *
 * NOTES
 *	  The script catalog/genbki.sh reads this file and generates .bki
@ -2338,6 +2338,28 @@ DESCR("larger of two numbers");
 DATA(insert OID = 1769 ( numeric_cmp			PGUID 11 f t t 2 f 23 "1700 1700" 100 0 0 100  numeric_cmp - ));
 DESCR("compare two numbers");

+/* OID's 1625 - 1639 LZTEXT data type */
+DATA(insert OID = 1626 ( lztextin				PGUID 11 f t t 1 f 1625 "0" 100 0 0 100  lztextin - ));
+DESCR("(internal)");
+DATA(insert OID = 1627 ( lztextout				PGUID 11 f t t 1 f 23 "0" 100 0 0 100  lztextout - ));
+DESCR("(internal)");
+DATA(insert OID = 1628 ( lztext_text	   		PGUID 11 f t t 1 f 25 "1625" 100 0 0 100  lztext_text -));
+DESCR("convert lztext to text");
+DATA(insert OID = 1629 ( text			   		PGUID 11 f t t 1 f 25 "1625" 100 0 0 100  lztext_text -));
+DESCR("convert lztext to text");
+DATA(insert OID = 1630 ( text_lztext	   		PGUID 11 f t t 1 f 1625 "25" 100 0 0 100  text_lztext -));
+DESCR("convert text to lztext");
+DATA(insert OID = 1631 ( lztext			   		PGUID 11 f t t 1 f 1625 "25" 100 0 0 100  text_lztext -));
+DESCR("convert text to lztext");
+DATA(insert OID = 1632 ( lztextlen		   		PGUID 11 f t t 1 f 23 "1625" 100 0 1 0  lztextlen - ));
+DESCR("length");
+DATA(insert OID = 1633 ( length			   		PGUID 11 f t t 1 f 23 "1625" 100 0 1 0  lztextlen - ));
+DESCR("length");
+DATA(insert OID = 1634 ( lztextoctetlen	   		PGUID 11 f t t 1 f 23 "1625" 100 0 1 0  lztextoctetlen - ));
+DESCR("octet length");
+DATA(insert OID = 1635 ( octet_length	   		PGUID 11 f t t 1 f 23 "1625" 100 0 1 0  lztextoctetlen - ));
+DESCR("octet length");
+

 /*
 * prototypes for functions pg_proc.c
--- a/src/include/catalog/pg_type.h
+++ b/src/include/catalog/pg_type.h
@ -7,7 +7,7 @@
 *
 * Copyright (c) 1994, Regents of the University of California
 *
- * $Id: pg_type.h,v 1.70 1999/10/18 14:14:04 momjian Exp $
+ * $Id: pg_type.h,v 1.71 1999/11/17 21:21:51 wieck Exp $
 *
 * NOTES
 *	  the genbki.sh script reads this file and generates .bki
@ -382,6 +382,11 @@ DATA(insert OID = 1296 ( timestamp	 PGUID	4  19 t b t \054 0	0 timestamp_in time
 DESCR("date time timezone, limited-range ISO-formated date and time");
 #define TIMESTAMPOID	1296

+/* OIDS 1625 - 1639 */
+DATA(insert OID = 1625 ( lztext	   PGUID -1  -1 f b t \054 0  0 lztextin lztextout lztextin lztextout i _null_ ));
+DESCR("variable-length string, stored compressed");
+#define LZTEXTOID		1625
+
 /* OIDS 1700 - 1799 */
 DATA(insert OID = 1700 ( numeric	   PGUID -1  -1 f b t \054 0  0 numeric_in numeric_out numeric_in numeric_out i _null_ ));
 DESCR("numeric(precision, decimal), arbitrary precision number");
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@ -6,7 +6,7 @@
 *
 * Copyright (c) 1994, Regents of the University of California
 *
- * $Id: builtins.h,v 1.89 1999/10/11 06:28:28 inoue Exp $
+ * $Id: builtins.h,v 1.90 1999/11/17 21:21:51 wieck Exp $
 *
 * NOTES
 *	  This should normally only be included by fmgr.h.
@ -30,6 +30,7 @@
 #include "utils/int8.h"
 #include "utils/nabstime.h"
 #include "utils/numeric.h"
+#include "utils/lztext.h"
 #include "access/heapam.h"		/* for HeapTuple */

 /*
@ -627,4 +628,12 @@ HeapTuple	RI_FKey_setnull_upd(FmgrInfo *proinfo);
 HeapTuple	RI_FKey_setdefault_del(FmgrInfo *proinfo);
 HeapTuple	RI_FKey_setdefault_upd(FmgrInfo *proinfo);

+/* lztext.c */
+lztext	   *lztextin(char *str);
+char	   *lztextout(lztext *lz);
+text	   *lztext_text(lztext *lz);
+lztext	   *text_lztext(text *txt);
+int32		lztextlen(lztext *lz);
+int32		lztextoctetlen(lztext *lz);
+
 #endif	 /* BUILTINS_H */
--- a/src/include/utils/lztext.h
+++ b/src/include/utils/lztext.h
@ -0,0 +1,22 @@
+/* ----------
+ * lztext.h
+ *
+ * $Header: /cvsroot/pgsql/src/include/utils/Attic/lztext.h,v 1.1 1999/11/17 21:21:51 wieck Exp $
+ *
+ *	Definitions for the lztext compressed data type
+ * ----------
+ */
+
+#ifndef _LZTEXT_H_
+#define _LZTEXT_H_
+
+#include "utils/pg_lzcompress.h"
+
+
+/* ----------
+ * An LZ compressed text field is stored with a PGLZ_Header in front
+ * ----------
+ */
+typedef PGLZ_Header		lztext;
+
+#endif /* _LZTEXT_H_ */
--- a/src/include/utils/pg_lzcompress.h
+++ b/src/include/utils/pg_lzcompress.h
@ -0,0 +1,125 @@
+/* ----------
+ * pg_lzcompress.h -
+ *
+ * $Header: /cvsroot/pgsql/src/include/utils/pg_lzcompress.h,v 1.1 1999/11/17 21:21:51 wieck Exp $
+ *
+ *	Definitions for the builtin LZ compressor
+ * ----------
+ */
+
+#ifndef _PG_LZCOMPRESS_H_
+#define _PG_LZCOMPRESS_H_
+
+
+/* ----------
+ * PGLZ_Header -
+ *
+ *      The information at the top of the compressed data.
+ *		The varsize must be kept the same data type as the value
+ *		in front of all variable size data types in PostgreSQL;
+ *		rawsize remembers the size of the original input data.
+ * ----------
+ */
+typedef struct PGLZ_Header {
+    int32                       varsize;
+    int32                       rawsize;
+} PGLZ_Header;
+
+
+/* ----------
+ * PGLZ_MAX_OUTPUT -
+ *
+ *		Macro to compute the maximum buffer required for the
+ *		compression output. It is larger than the input, because
+ *		in the worst case, we cannot write out one single tag but
+ *		need one control byte per started group of 8 literal data
+ *		bytes (rounded up), in addition to the header.
+ * ----------
+ */
+#define PGLZ_MAX_OUTPUT(_dlen)			((_dlen) + (((_dlen) + 7) >> 3)	\
+													 + sizeof(PGLZ_Header))
+#define PGLZ_RAW_SIZE(_lzdata)			((_lzdata)->rawsize)
+#define PGLZ_IS_COMPRESSED(_lzdata)		((_lzdata)->varsize != 				\
+										 (_lzdata)->rawsize + sizeof(PGLZ_Header))
+
+/* ----------
+ * PGLZ_Strategy -
+ *
+ *		Some values that control the compression algorithm.
+ *
+ *		min_input_size		Minimum input data size to start compression.
+ *
+ *		force_input_size	Input data size at which compressed storage is
+ *							forced even if the compression rate drops below
+ *							min_comp_rate (but not below 0).
+ *
+ *		min_comp_rate		Minimum compression rate (0-99%), the output
+ *							must be smaller than the input. If that isn't
+ *							the case, the compressor will throw away its
+ *							output and copy the original, uncompressed data
+ *							to the output buffer.
+ *
+ *		match_size_good		The initial GOOD match size when starting history
+ *							lookup. When looking up the history to find a
+ *							match that could be expressed as a tag, the
+ *							algorithm does not always walk back entirely.
+ *							A good match fast is usually better than the
+ *							best possible one very late. For each iteration
+ *							in the lookup, this value is lowered so the
+ *							longer the lookup takes, the smaller matches
+ *							are considered good.
+ *
+ *		match_size_drop		The percentage by which match_size_good is
+ *							lowered at each history check. Allowed values
+ *							are 0 (no change until end) to 100 (only check
+ *							latest history entry at all).
+ * ----------
+ */
+typedef struct PGLZ_Strategy {
+	int32		min_input_size;
+	int32		force_input_size;
+	int32		min_comp_rate;
+	int32		match_size_good;
+	int32		match_size_drop;
+} PGLZ_Strategy;
+
+
+/* ----------
+ * The standard strategies
+ *
+ *		PGLZ_strategy_default		Starts compression only if input is
+ *									at least 256 bytes large. Stores output
+ *									uncompressed if compression does not
+ *									gain at least 20% size reduction and
+ *									input is smaller than 6K. Stops history
+ *									lookup if at least a 128 byte long
+ *									match has been found.
+ *
+ *									This is the default strategy if none
+ *									is given to pglz_compress().
+ *
+ *		PGLZ_strategy_allways		Starts compression on any input, no
+ *									matter how small, and falls back to
+ *									uncompressed storage only if output
+ *									would not be smaller than input.
+ *
+ *		PGLZ_strategy_never			Force pglz_compress to act as a custom
+ *									interface for memcpy(). Only useful
+ *									for generic interfacing.
+ * ----------
+ */
+extern PGLZ_Strategy	*PGLZ_strategy_default;
+extern PGLZ_Strategy	*PGLZ_strategy_allways;
+extern PGLZ_Strategy	*PGLZ_strategy_never;
+
+
+/* ----------
+ * Global function declarations (both return a size in bytes)
+ * ----------
+ */
+int	pglz_compress (char *source, int32 slen, PGLZ_Header *dest,
+									 PGLZ_Strategy *strategy);
+int pglz_decompress (PGLZ_Header *source, char *dest);
+
+
+#endif /* _PG_LZCOMPRESS_H_ */
+