mirror of
https://git.postgresql.org/git/postgresql.git
synced 2024-12-27 08:39:28 +08:00
8224de4f42
This patch introduces the INCLUDE clause to index definitions. This clause specifies a list of columns which will be included as a non-key part of the index. The INCLUDE columns exist solely to allow more queries to benefit from index-only scans. Also, such columns don't need to have appropriate operator classes. Expressions are not supported as INCLUDE columns since they cannot be used in index-only scans. Index access methods supporting INCLUDE are indicated by the amcaninclude flag in IndexAmRoutine. For now, only B-tree indexes support the INCLUDE clause. In B-tree indexes, INCLUDE columns are truncated from pivot index tuples (tuples located in non-leaf pages and high keys). Therefore, B-tree indexes might now have a variable number of attributes. This patch also provides a generic facility to support that: pivot tuples contain the number of their attributes in t_tid.ip_posid. The free 13th bit of t_info is used to indicate that. This facility will simplify further support of index suffix truncation. The above changes are backward-compatible; pg_upgrade doesn't need special handling of B-tree indexes for that. Bump catalog version. Author: Anastasia Lubennikova, with contributions by Alexander Korotkov and me. Reviewed by: Peter Geoghegan, Tomas Vondra, Antonin Houska, Jeff Janes, David Rowley, Alexander Korotkov. Discussion: https://www.postgresql.org/message-id/flat/56168952.4010101@postgrespro.ru
491 lines
13 KiB
C
491 lines
13 KiB
C
/*-------------------------------------------------------------------------
|
|
*
|
|
* blutils.c
|
|
* Bloom index utilities.
|
|
*
|
|
* Portions Copyright (c) 2016-2018, PostgreSQL Global Development Group
|
|
* Portions Copyright (c) 1990-1993, Regents of the University of California
|
|
*
|
|
* IDENTIFICATION
|
|
* contrib/bloom/blutils.c
|
|
*
|
|
*-------------------------------------------------------------------------
|
|
*/
|
|
#include "postgres.h"
|
|
|
|
#include "access/amapi.h"
|
|
#include "access/generic_xlog.h"
|
|
#include "catalog/index.h"
|
|
#include "storage/lmgr.h"
|
|
#include "miscadmin.h"
|
|
#include "storage/bufmgr.h"
|
|
#include "storage/indexfsm.h"
|
|
#include "utils/memutils.h"
|
|
#include "access/reloptions.h"
|
|
#include "storage/freespace.h"
|
|
#include "storage/indexfsm.h"
|
|
|
|
#include "bloom.h"
|
|
|
|
/*
 * Signature dealing macros.
 *
 * "x" is a pointer to the first BloomSignatureWord of a signature and "i" is
 * a bit index into it; i is assumed to be of type int.  A negative i would
 * yield a negative shift count, so callers must pass i >= 0.
 *
 * Every expansion is enclosed in parentheses so each macro acts as a single
 * expression wherever it is used.  Note that CLRBIT, SETBIT and GETBIT
 * evaluate "i" more than once, so do not pass an expression with side
 * effects.
 */
#define GETWORD(x,i) ( *( (BloomSignatureWord *)(x) + ( (i) / SIGNWORDBITS ) ) )
#define CLRBIT(x,i)  ( GETWORD(x,i) &= ~( 0x01 << ( (i) % SIGNWORDBITS ) ) )
#define SETBIT(x,i)  ( GETWORD(x,i) |=  ( 0x01 << ( (i) % SIGNWORDBITS ) ) )
#define GETBIT(x,i)  ( (GETWORD(x,i) >> ( (i) % SIGNWORDBITS )) & 0x01 )
|
|
|
|
PG_FUNCTION_INFO_V1(blhandler);
|
|
|
|
/* Kind of relation options for bloom index */
|
|
static relopt_kind bl_relopt_kind;
|
|
|
|
/* parse table for fillRelOptions */
|
|
static relopt_parse_elt bl_relopt_tab[INDEX_MAX_KEYS + 1];
|
|
|
|
static int32 myRand(void);
|
|
static void mySrand(uint32 seed);
|
|
|
|
/*
|
|
* Module initialize function: initialize info about Bloom relation options.
|
|
*
|
|
* Note: keep this in sync with makeDefaultBloomOptions().
|
|
*/
|
|
void
|
|
_PG_init(void)
|
|
{
|
|
int i;
|
|
char buf[16];
|
|
|
|
bl_relopt_kind = add_reloption_kind();
|
|
|
|
/* Option for length of signature */
|
|
add_int_reloption(bl_relopt_kind, "length",
|
|
"Length of signature in bits",
|
|
DEFAULT_BLOOM_LENGTH, 1, MAX_BLOOM_LENGTH);
|
|
bl_relopt_tab[0].optname = "length";
|
|
bl_relopt_tab[0].opttype = RELOPT_TYPE_INT;
|
|
bl_relopt_tab[0].offset = offsetof(BloomOptions, bloomLength);
|
|
|
|
/* Number of bits for each possible index column: col1, col2, ... */
|
|
for (i = 0; i < INDEX_MAX_KEYS; i++)
|
|
{
|
|
snprintf(buf, sizeof(buf), "col%d", i + 1);
|
|
add_int_reloption(bl_relopt_kind, buf,
|
|
"Number of bits generated for each index column",
|
|
DEFAULT_BLOOM_BITS, 1, MAX_BLOOM_BITS);
|
|
bl_relopt_tab[i + 1].optname = MemoryContextStrdup(TopMemoryContext,
|
|
buf);
|
|
bl_relopt_tab[i + 1].opttype = RELOPT_TYPE_INT;
|
|
bl_relopt_tab[i + 1].offset = offsetof(BloomOptions, bitSize[0]) + sizeof(int) * i;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Construct a default set of Bloom options.
|
|
*/
|
|
static BloomOptions *
|
|
makeDefaultBloomOptions(void)
|
|
{
|
|
BloomOptions *opts;
|
|
int i;
|
|
|
|
opts = (BloomOptions *) palloc0(sizeof(BloomOptions));
|
|
/* Convert DEFAULT_BLOOM_LENGTH from # of bits to # of words */
|
|
opts->bloomLength = (DEFAULT_BLOOM_LENGTH + SIGNWORDBITS - 1) / SIGNWORDBITS;
|
|
for (i = 0; i < INDEX_MAX_KEYS; i++)
|
|
opts->bitSize[i] = DEFAULT_BLOOM_BITS;
|
|
SET_VARSIZE(opts, sizeof(BloomOptions));
|
|
return opts;
|
|
}
|
|
|
|
/*
|
|
* Bloom handler function: return IndexAmRoutine with access method parameters
|
|
* and callbacks.
|
|
*/
|
|
Datum
|
|
blhandler(PG_FUNCTION_ARGS)
|
|
{
|
|
IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
|
|
|
|
amroutine->amstrategies = BLOOM_NSTRATEGIES;
|
|
amroutine->amsupport = BLOOM_NPROC;
|
|
amroutine->amcanorder = false;
|
|
amroutine->amcanorderbyop = false;
|
|
amroutine->amcanbackward = false;
|
|
amroutine->amcanunique = false;
|
|
amroutine->amcanmulticol = true;
|
|
amroutine->amoptionalkey = true;
|
|
amroutine->amsearcharray = false;
|
|
amroutine->amsearchnulls = false;
|
|
amroutine->amstorage = false;
|
|
amroutine->amclusterable = false;
|
|
amroutine->ampredlocks = false;
|
|
amroutine->amcanparallel = false;
|
|
amroutine->amcaninclude = false;
|
|
amroutine->amkeytype = InvalidOid;
|
|
|
|
amroutine->ambuild = blbuild;
|
|
amroutine->ambuildempty = blbuildempty;
|
|
amroutine->aminsert = blinsert;
|
|
amroutine->ambulkdelete = blbulkdelete;
|
|
amroutine->amvacuumcleanup = blvacuumcleanup;
|
|
amroutine->amcanreturn = NULL;
|
|
amroutine->amcostestimate = blcostestimate;
|
|
amroutine->amoptions = bloptions;
|
|
amroutine->amproperty = NULL;
|
|
amroutine->amvalidate = blvalidate;
|
|
amroutine->ambeginscan = blbeginscan;
|
|
amroutine->amrescan = blrescan;
|
|
amroutine->amgettuple = NULL;
|
|
amroutine->amgetbitmap = blgetbitmap;
|
|
amroutine->amendscan = blendscan;
|
|
amroutine->ammarkpos = NULL;
|
|
amroutine->amrestrpos = NULL;
|
|
amroutine->amestimateparallelscan = NULL;
|
|
amroutine->aminitparallelscan = NULL;
|
|
amroutine->amparallelrescan = NULL;
|
|
|
|
PG_RETURN_POINTER(amroutine);
|
|
}
|
|
|
|
/*
|
|
* Fill BloomState structure for particular index.
|
|
*/
|
|
void
|
|
initBloomState(BloomState *state, Relation index)
|
|
{
|
|
int i;
|
|
|
|
state->nColumns = index->rd_att->natts;
|
|
|
|
/* Initialize hash function for each attribute */
|
|
for (i = 0; i < index->rd_att->natts; i++)
|
|
{
|
|
fmgr_info_copy(&(state->hashFn[i]),
|
|
index_getprocinfo(index, i + 1, BLOOM_HASH_PROC),
|
|
CurrentMemoryContext);
|
|
}
|
|
|
|
/* Initialize amcache if needed with options from metapage */
|
|
if (!index->rd_amcache)
|
|
{
|
|
Buffer buffer;
|
|
Page page;
|
|
BloomMetaPageData *meta;
|
|
BloomOptions *opts;
|
|
|
|
opts = MemoryContextAlloc(index->rd_indexcxt, sizeof(BloomOptions));
|
|
|
|
buffer = ReadBuffer(index, BLOOM_METAPAGE_BLKNO);
|
|
LockBuffer(buffer, BUFFER_LOCK_SHARE);
|
|
|
|
page = BufferGetPage(buffer);
|
|
|
|
if (!BloomPageIsMeta(page))
|
|
elog(ERROR, "Relation is not a bloom index");
|
|
meta = BloomPageGetMeta(BufferGetPage(buffer));
|
|
|
|
if (meta->magickNumber != BLOOM_MAGICK_NUMBER)
|
|
elog(ERROR, "Relation is not a bloom index");
|
|
|
|
*opts = meta->opts;
|
|
|
|
UnlockReleaseBuffer(buffer);
|
|
|
|
index->rd_amcache = (void *) opts;
|
|
}
|
|
|
|
memcpy(&state->opts, index->rd_amcache, sizeof(state->opts));
|
|
state->sizeOfBloomTuple = BLOOMTUPLEHDRSZ +
|
|
sizeof(BloomSignatureWord) * state->opts.bloomLength;
|
|
}
|
|
|
|
/*
|
|
* Random generator copied from FreeBSD. Using own random generator here for
|
|
* two reasons:
|
|
*
|
|
* 1) In this case random numbers are used for on-disk storage. Usage of
|
|
* PostgreSQL number generator would obstruct it from all possible changes.
|
|
* 2) Changing seed of PostgreSQL random generator would be undesirable side
|
|
* effect.
|
|
*/
|
|
static int32 next;
|
|
|
|
static int32
|
|
myRand(void)
|
|
{
|
|
/*----------
|
|
* Compute x = (7^5 * x) mod (2^31 - 1)
|
|
* without overflowing 31 bits:
|
|
* (2^31 - 1) = 127773 * (7^5) + 2836
|
|
* From "Random number generators: good ones are hard to find",
|
|
* Park and Miller, Communications of the ACM, vol. 31, no. 10,
|
|
* October 1988, p. 1195.
|
|
*----------
|
|
*/
|
|
int32 hi,
|
|
lo,
|
|
x;
|
|
|
|
/* Must be in [1, 0x7ffffffe] range at this point. */
|
|
hi = next / 127773;
|
|
lo = next % 127773;
|
|
x = 16807 * lo - 2836 * hi;
|
|
if (x < 0)
|
|
x += 0x7fffffff;
|
|
next = x;
|
|
/* Transform to [0, 0x7ffffffd] range. */
|
|
return (x - 1);
|
|
}
|
|
|
|
static void
|
|
mySrand(uint32 seed)
|
|
{
|
|
next = seed;
|
|
/* Transform to [1, 0x7ffffffe] range. */
|
|
next = (next % 0x7ffffffe) + 1;
|
|
}
|
|
|
|
/*
|
|
* Add bits of given value to the signature.
|
|
*/
|
|
void
|
|
signValue(BloomState *state, BloomSignatureWord *sign, Datum value, int attno)
|
|
{
|
|
uint32 hashVal;
|
|
int nBit,
|
|
j;
|
|
|
|
/*
|
|
* init generator with "column's" number to get "hashed" seed for new
|
|
* value. We don't want to map the same numbers from different columns
|
|
* into the same bits!
|
|
*/
|
|
mySrand(attno);
|
|
|
|
/*
|
|
* Init hash sequence to map our value into bits. the same values in
|
|
* different columns will be mapped into different bits because of step
|
|
* above
|
|
*/
|
|
hashVal = DatumGetInt32(FunctionCall1(&state->hashFn[attno], value));
|
|
mySrand(hashVal ^ myRand());
|
|
|
|
for (j = 0; j < state->opts.bitSize[attno]; j++)
|
|
{
|
|
/* prevent multiple evaluation in SETBIT macro */
|
|
nBit = myRand() % (state->opts.bloomLength * SIGNWORDBITS);
|
|
SETBIT(sign, nBit);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Make bloom tuple from values.
|
|
*/
|
|
BloomTuple *
|
|
BloomFormTuple(BloomState *state, ItemPointer iptr, Datum *values, bool *isnull)
|
|
{
|
|
int i;
|
|
BloomTuple *res = (BloomTuple *) palloc0(state->sizeOfBloomTuple);
|
|
|
|
res->heapPtr = *iptr;
|
|
|
|
/* Blooming each column */
|
|
for (i = 0; i < state->nColumns; i++)
|
|
{
|
|
/* skip nulls */
|
|
if (isnull[i])
|
|
continue;
|
|
|
|
signValue(state, res->sign, values[i], i);
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
/*
|
|
* Add new bloom tuple to the page. Returns true if new tuple was successfully
|
|
* added to the page. Returns false if it doesn't fit on the page.
|
|
*/
|
|
bool
|
|
BloomPageAddItem(BloomState *state, Page page, BloomTuple *tuple)
|
|
{
|
|
BloomTuple *itup;
|
|
BloomPageOpaque opaque;
|
|
Pointer ptr;
|
|
|
|
/* We shouldn't be pointed to an invalid page */
|
|
Assert(!PageIsNew(page) && !BloomPageIsDeleted(page));
|
|
|
|
/* Does new tuple fit on the page? */
|
|
if (BloomPageGetFreeSpace(state, page) < state->sizeOfBloomTuple)
|
|
return false;
|
|
|
|
/* Copy new tuple to the end of page */
|
|
opaque = BloomPageGetOpaque(page);
|
|
itup = BloomPageGetTuple(state, page, opaque->maxoff + 1);
|
|
memcpy((Pointer) itup, (Pointer) tuple, state->sizeOfBloomTuple);
|
|
|
|
/* Adjust maxoff and pd_lower */
|
|
opaque->maxoff++;
|
|
ptr = (Pointer) BloomPageGetTuple(state, page, opaque->maxoff + 1);
|
|
((PageHeader) page)->pd_lower = ptr - page;
|
|
|
|
/* Assert we didn't overrun available space */
|
|
Assert(((PageHeader) page)->pd_lower <= ((PageHeader) page)->pd_upper);
|
|
|
|
return true;
|
|
}
|
|
|
|
/*
|
|
* Allocate a new page (either by recycling, or by extending the index file)
|
|
* The returned buffer is already pinned and exclusive-locked
|
|
* Caller is responsible for initializing the page by calling BloomInitBuffer
|
|
*/
|
|
Buffer
|
|
BloomNewBuffer(Relation index)
|
|
{
|
|
Buffer buffer;
|
|
bool needLock;
|
|
|
|
/* First, try to get a page from FSM */
|
|
for (;;)
|
|
{
|
|
BlockNumber blkno = GetFreeIndexPage(index);
|
|
|
|
if (blkno == InvalidBlockNumber)
|
|
break;
|
|
|
|
buffer = ReadBuffer(index, blkno);
|
|
|
|
/*
|
|
* We have to guard against the possibility that someone else already
|
|
* recycled this page; the buffer may be locked if so.
|
|
*/
|
|
if (ConditionalLockBuffer(buffer))
|
|
{
|
|
Page page = BufferGetPage(buffer);
|
|
|
|
if (PageIsNew(page))
|
|
return buffer; /* OK to use, if never initialized */
|
|
|
|
if (BloomPageIsDeleted(page))
|
|
return buffer; /* OK to use */
|
|
|
|
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
|
|
}
|
|
|
|
/* Can't use it, so release buffer and try again */
|
|
ReleaseBuffer(buffer);
|
|
}
|
|
|
|
/* Must extend the file */
|
|
needLock = !RELATION_IS_LOCAL(index);
|
|
if (needLock)
|
|
LockRelationForExtension(index, ExclusiveLock);
|
|
|
|
buffer = ReadBuffer(index, P_NEW);
|
|
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
|
|
|
|
if (needLock)
|
|
UnlockRelationForExtension(index, ExclusiveLock);
|
|
|
|
return buffer;
|
|
}
|
|
|
|
/*
|
|
* Initialize any page of a bloom index.
|
|
*/
|
|
void
|
|
BloomInitPage(Page page, uint16 flags)
|
|
{
|
|
BloomPageOpaque opaque;
|
|
|
|
PageInit(page, BLCKSZ, sizeof(BloomPageOpaqueData));
|
|
|
|
opaque = BloomPageGetOpaque(page);
|
|
memset(opaque, 0, sizeof(BloomPageOpaqueData));
|
|
opaque->flags = flags;
|
|
opaque->bloom_page_id = BLOOM_PAGE_ID;
|
|
}
|
|
|
|
/*
|
|
* Fill in metapage for bloom index.
|
|
*/
|
|
void
|
|
BloomFillMetapage(Relation index, Page metaPage)
|
|
{
|
|
BloomOptions *opts;
|
|
BloomMetaPageData *metadata;
|
|
|
|
/*
|
|
* Choose the index's options. If reloptions have been assigned, use
|
|
* those, otherwise create default options.
|
|
*/
|
|
opts = (BloomOptions *) index->rd_options;
|
|
if (!opts)
|
|
opts = makeDefaultBloomOptions();
|
|
|
|
/*
|
|
* Initialize contents of meta page, including a copy of the options,
|
|
* which are now frozen for the life of the index.
|
|
*/
|
|
BloomInitPage(metaPage, BLOOM_META);
|
|
metadata = BloomPageGetMeta(metaPage);
|
|
memset(metadata, 0, sizeof(BloomMetaPageData));
|
|
metadata->magickNumber = BLOOM_MAGICK_NUMBER;
|
|
metadata->opts = *opts;
|
|
((PageHeader) metaPage)->pd_lower += sizeof(BloomMetaPageData);
|
|
|
|
/* If this fails, probably FreeBlockNumberArray size calc is wrong: */
|
|
Assert(((PageHeader) metaPage)->pd_lower <= ((PageHeader) metaPage)->pd_upper);
|
|
}
|
|
|
|
/*
|
|
* Initialize metapage for bloom index.
|
|
*/
|
|
void
|
|
BloomInitMetapage(Relation index)
|
|
{
|
|
Buffer metaBuffer;
|
|
Page metaPage;
|
|
GenericXLogState *state;
|
|
|
|
/*
|
|
* Make a new page; since it is first page it should be associated with
|
|
* block number 0 (BLOOM_METAPAGE_BLKNO).
|
|
*/
|
|
metaBuffer = BloomNewBuffer(index);
|
|
Assert(BufferGetBlockNumber(metaBuffer) == BLOOM_METAPAGE_BLKNO);
|
|
|
|
/* Initialize contents of meta page */
|
|
state = GenericXLogStart(index);
|
|
metaPage = GenericXLogRegisterBuffer(state, metaBuffer,
|
|
GENERIC_XLOG_FULL_IMAGE);
|
|
BloomFillMetapage(index, metaPage);
|
|
GenericXLogFinish(state);
|
|
|
|
UnlockReleaseBuffer(metaBuffer);
|
|
}
|
|
|
|
/*
|
|
* Parse reloptions for bloom index, producing a BloomOptions struct.
|
|
*/
|
|
bytea *
|
|
bloptions(Datum reloptions, bool validate)
|
|
{
|
|
relopt_value *options;
|
|
int numoptions;
|
|
BloomOptions *rdopts;
|
|
|
|
/* Parse the user-given reloptions */
|
|
options = parseRelOptions(reloptions, validate, bl_relopt_kind, &numoptions);
|
|
rdopts = allocateReloptStruct(sizeof(BloomOptions), options, numoptions);
|
|
fillRelOptions((void *) rdopts, sizeof(BloomOptions), options, numoptions,
|
|
validate, bl_relopt_tab, lengthof(bl_relopt_tab));
|
|
|
|
/* Convert signature length from # of bits to # to words, rounding up */
|
|
rdopts->bloomLength = (rdopts->bloomLength + SIGNWORDBITS - 1) / SIGNWORDBITS;
|
|
|
|
return (bytea *) rdopts;
|
|
}
|