Add support to dynahash.c for partitioning shared hashtables according
to the low-order bits of the entry hash value.  Also make some incidental
cleanups in the dynahash API, such as not exporting the hash header
structs to the world.
Tom Lane 2006-07-22 23:04:39 +00:00
parent c0e9b3139f
commit 51ee9fa157
4 changed files with 387 additions and 180 deletions
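
For orientation, here is a minimal sketch (not part of the commit) of how a caller might create a shared table in the new partitioned mode; MyEntry, MY_NUM_PARTITIONS, MyShmemHash, and MyShmemHashInit are hypothetical names, not code from this commit.

#include "postgres.h"
#include "storage/shmem.h"
#include "utils/hsearch.h"

#define MY_NUM_PARTITIONS 16            /* hypothetical; must be a power of 2 */

typedef struct MyEntry                  /* hypothetical entry layout */
{
	uint32		key;					/* hash key must be the first field */
	int			payload;
} MyEntry;

static HTAB *MyShmemHash;

void
MyShmemHashInit(long init_size, long max_size)
{
	HASHCTL		info;
	int			hash_flags;

	MemSet(&info, 0, sizeof(info));
	info.keysize = sizeof(uint32);
	info.entrysize = sizeof(MyEntry);
	info.hash = tag_hash;				/* standard hash function from hashfn.c */
	info.num_partitions = MY_NUM_PARTITIONS;
	hash_flags = HASH_ELEM | HASH_FUNCTION | HASH_PARTITION;

	/* ShmemInitHash supplies the shared header and fixed-size directory */
	MyShmemHash = ShmemInitHash("My Shared Hash",
								init_size, max_size,
								&info, hash_flags);
}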

src/backend/storage/ipc/shmem.c

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.93 2006/07/14 14:52:22 momjian Exp $
* $PostgreSQL: pgsql/src/backend/storage/ipc/shmem.c,v 1.94 2006/07/22 23:04:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -211,9 +211,6 @@ InitShmemIndex(void)
{
HASHCTL info;
int hash_flags;
ShmemIndexEnt *result,
item;
bool found;
/*
* Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
@ -227,32 +224,11 @@ InitShmemIndex(void)
info.entrysize = sizeof(ShmemIndexEnt);
hash_flags = HASH_ELEM;
/* This will acquire the shmem index lock, but not release it. */
ShmemIndex = ShmemInitHash("ShmemIndex",
SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
&info, hash_flags);
if (!ShmemIndex)
elog(FATAL, "could not initialize Shmem Index");
/*
* Now, create an entry in the hashtable for the index itself.
*/
if (!IsUnderPostmaster)
{
MemSet(item.key, 0, SHMEM_INDEX_KEYSIZE);
strncpy(item.key, "ShmemIndex", SHMEM_INDEX_KEYSIZE);
result = (ShmemIndexEnt *)
hash_search(ShmemIndex, (void *) &item, HASH_ENTER, &found);
Assert(!found);
result->location = MAKE_OFFSET(ShmemIndex->hctl);
result->size = SHMEM_INDEX_SIZE;
}
/* now release the lock acquired in ShmemInitStruct */
LWLockRelease(ShmemIndexLock);
}
/*
@ -295,7 +271,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
/* look it up in the shmem index */
location = ShmemInitStruct(name,
sizeof(HASHHDR) + infoP->dsize * sizeof(HASHSEGMENT),
hash_get_shared_size(infoP, hash_flags),
&found);
/*
@ -312,9 +288,8 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
if (found)
hash_flags |= HASH_ATTACH;
/* Now provide the header and directory pointers */
/* Pass location of hashtable header to hash_create */
infoP->hctl = (HASHHDR *) location;
infoP->dir = (HASHSEGMENT *) (((char *) location) + sizeof(HASHHDR));
return hash_create(name, init_size, infoP, hash_flags);
}
@ -363,14 +338,16 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
* If the shmem index doesn't exist, we are bootstrapping: we must
* be trying to init the shmem index itself.
*
* Notice that the ShmemIndexLock is held until the shmem index
* has been completely initialized.
* Notice that the ShmemIndexLock is released before the shmem
* index has been initialized. This should be OK because no
* other process can be accessing shared memory yet.
*/
Assert(shmemseghdr->indexoffset == 0);
structPtr = ShmemAlloc(size);
shmemseghdr->indexoffset = MAKE_OFFSET(structPtr);
*foundPtr = FALSE;
}
LWLockRelease(ShmemIndexLock);
return structPtr;
}

src/backend/storage/lmgr/lock.c

@ -8,7 +8,7 @@
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.166 2006/07/14 14:52:23 momjian Exp $
* $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.167 2006/07/22 23:04:39 tgl Exp $
*
* NOTES
* A lock table is a shared memory hash table. When
@ -1958,7 +1958,7 @@ GetLockStatusData(void)
{
LWLockAcquire(FirstLockMgrLock + i, LW_SHARED);
proclockTable = LockMethodProcLockHash[i];
els += proclockTable->hctl->nentries;
els += hash_get_num_entries(proclockTable);
}
data->nelements = els;

src/backend/utils/hash/dynahash.c

@ -3,17 +3,36 @@
* dynahash.c
* dynamic hash tables
*
* dynahash.c supports both local-to-a-backend hash tables and hash tables in
* shared memory. For shared hash tables, it is the caller's responsibility
* to provide appropriate access interlocking. The simplest convention is
* that a single LWLock protects the whole hash table. Searches (HASH_FIND or
* hash_seq_search) need only shared lock, but any update requires exclusive
* lock. For heavily-used shared tables, the single-lock approach creates a
* concurrency bottleneck, so we also support "partitioned" locking wherein
* there are multiple LWLocks guarding distinct subsets of the table. To use
* a hash table in partitioned mode, the HASH_PARTITION flag must be given
* to hash_create. This prevents any attempt to split buckets on-the-fly.
* Therefore, each hash bucket chain operates independently, and no fields
* of the hash header change after init except nentries and freeList.
* A partitioned table uses a spinlock to guard changes of those two fields.
* This lets any subset of the hash buckets be treated as a separately
* lockable partition. We expect callers to use the low-order bits of a
* lookup key's hash value as a partition number --- this will work because
* of the way calc_bucket() maps hash values to bucket numbers.
*
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.69 2006/07/14 14:52:25 momjian Exp $
* $PostgreSQL: pgsql/src/backend/utils/hash/dynahash.c,v 1.70 2006/07/22 23:04:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/*
* Original comments:
*
* Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
* Coded into C, with minor code improvements, and with hsearch(3) interface,
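
To make the locking convention described in the new header comment concrete, here is a sketch of a caller-side lookup on a partitioned table. The partition lock array and MY_NUM_PARTITIONS are hypothetical and are the caller's responsibility; dynahash itself only provides get_hash_value() and hash_search_with_hash_value().

#include "postgres.h"
#include "storage/lwlock.h"
#include "utils/hsearch.h"

#define MY_NUM_PARTITIONS 16            /* power of 2, must match num_partitions */

static bool
my_partitioned_lookup(HTAB *hashp, const void *key, LWLockId *partition_locks)
{
	uint32		hashvalue = get_hash_value(hashp, key);
	uint32		partition = hashvalue % MY_NUM_PARTITIONS;	/* low-order bits */
	bool		found;

	LWLockAcquire(partition_locks[partition], LW_SHARED);
	(void) hash_search_with_hash_value(hashp, key, hashvalue,
									   HASH_FIND, &found);
	LWLockRelease(partition_locks[partition]);

	return found;
}
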
@ -45,9 +64,107 @@
#include "postgres.h"
#include "storage/shmem.h"
#include "storage/spin.h"
#include "utils/dynahash.h"
#include "utils/memutils.h"
/*
* Constants
*
* A hash table has a top-level "directory", each of whose entries points
* to a "segment" of ssize bucket headers. The maximum number of hash
* buckets is thus dsize * ssize (but dsize may be expansible). Of course,
* the number of records in the table can be larger, but we don't want a
* whole lot of records per bucket or performance goes down.
*
* In a hash table allocated in shared memory, the directory cannot be
* expanded because it must stay at a fixed address. The directory size
* should be selected using hash_select_dirsize (and you'd better have
* a good idea of the maximum number of entries!). For non-shared hash
* tables, the initial directory size can be left at the default.
*/
#define DEF_SEGSIZE 256
#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE 256
#define DEF_FFACTOR 1 /* default fill factor */
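
As a hedged illustration of the sizing guidance above: a module that owns a shared table typically reserves space at startup with hash_estimate_size(), leaving the fixed directory size to be chosen when the table is actually created. MAX_MY_ENTRIES, MyEntry, and MyShmemSize are hypothetical names.

#include "postgres.h"
#include "utils/hsearch.h"

#define MAX_MY_ENTRIES	4096					/* hypothetical table capacity */

typedef struct MyEntry { uint32 key; int payload; } MyEntry;	/* hypothetical */

Size
MyShmemSize(void)
{
	/* estimate covers header, directory, segments, and the entries themselves */
	return hash_estimate_size(MAX_MY_ENTRIES, sizeof(MyEntry));
}
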
/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;
/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;
/*
* Header structure for a hash table --- contains all changeable info
*
* In a shared-memory hash table, the HASHHDR is in shared memory, while
* each backend has a local HTAB struct. For a non-shared table, there isn't
* any functional difference between HASHHDR and HTAB, but we separate them
* anyway to share code between shared and non-shared tables.
*/
struct HASHHDR
{
/* In a partitioned table, take this lock to touch nentries or freeList */
slock_t mutex; /* unused if not partitioned table */
/* These fields change during entry addition/deletion */
long nentries; /* number of entries in hash table */
HASHELEMENT *freeList; /* linked list of free elements */
/* These fields can change, but not in a partitioned table */
/* Also, dsize can't change in a shared table, even if unpartitioned */
long dsize; /* directory size */
long nsegs; /* number of allocated segments (<= dsize) */
uint32 max_bucket; /* ID of maximum bucket in use */
uint32 high_mask; /* mask to modulo into entire table */
uint32 low_mask; /* mask to modulo into lower half of table */
/* These fields are fixed at hashtable creation */
Size keysize; /* hash key length in bytes */
Size entrysize; /* total user element size in bytes */
long num_partitions; /* # partitions (must be power of 2), or 0 */
long ffactor; /* target fill factor */
long max_dsize; /* 'dsize' limit if directory is fixed size */
long ssize; /* segment size --- must be power of 2 */
int sshift; /* segment shift = log2(ssize) */
int nelem_alloc; /* number of entries to allocate at once */
#ifdef HASH_STATISTICS
/*
* Count statistics here. NB: stats code doesn't bother with mutex,
* so counts could be corrupted a bit in a partitioned table.
*/
long accesses;
long collisions;
#endif
};
#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
/*
* Top control structure for a hashtable --- in a shared table, each backend
* has its own copy (OK since no fields change at runtime)
*/
struct HTAB
{
HASHHDR *hctl; /* => shared control information */
HASHSEGMENT *dir; /* directory of segment starts */
HashValueFunc hash; /* hash function */
HashCompareFunc match; /* key comparison function */
HashCopyFunc keycopy; /* key copying function */
HashAllocFunc alloc; /* memory allocator */
MemoryContext hcxt; /* memory context if default allocator used */
char *tabname; /* table name (for error messages) */
bool isshared; /* true if table is in shared memory */
/* We keep local copies of these fixed values to reduce contention */
Size keysize; /* hash key length in bytes */
long ssize; /* segment size --- must be power of 2 */
int sshift; /* segment shift = log2(ssize) */
};
/*
* Key (also entry) part of a HASHELEMENT
*/
@ -58,6 +175,12 @@
*/
#define MOD(x,y) ((x) & ((y)-1))
#if HASH_STATISTICS
static long hash_accesses,
hash_collisions,
hash_expansions;
#endif
/*
* Private function prototypes
*/
@ -66,6 +189,7 @@ static HASHSEGMENT seg_alloc(HTAB *hashp);
static bool element_alloc(HTAB *hashp, int nelem);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
static HASHBUCKET get_hash_entry(HTAB *hashp);
static void hdefault(HTAB *hashp);
static int choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
@ -85,13 +209,6 @@ DynaHashAlloc(Size size)
}
#if HASH_STATISTICS
static long hash_accesses,
hash_collisions,
hash_expansions;
#endif
/************************** CREATE ROUTINES **********************/
/*
@ -185,17 +302,26 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
if (flags & HASH_SHARED_MEM)
{
/*
* ctl structure is preallocated for shared memory tables. Note that
* HASH_DIRSIZE and HASH_ALLOC had better be set as well.
* ctl structure and directory are preallocated for shared memory
* tables. Note that HASH_DIRSIZE and HASH_ALLOC had better be set
* as well.
*/
hashp->hctl = info->hctl;
hashp->dir = info->dir;
hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
hashp->hcxt = NULL;
hashp->isshared = true;
/* hash table already exists, we're just attaching to it */
if (flags & HASH_ATTACH)
{
/* make local copies of some heavily-used values */
hctl = hashp->hctl;
hashp->keysize = hctl->keysize;
hashp->ssize = hctl->ssize;
hashp->sshift = hctl->sshift;
return hashp;
}
}
else
{
@ -218,9 +344,16 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
hdefault(hashp);
hctl = hashp->hctl;
#ifdef HASH_STATISTICS
hctl->accesses = hctl->collisions = 0;
#endif
if (flags & HASH_PARTITION)
{
/* Doesn't make sense to partition a local hash table */
Assert(flags & HASH_SHARED_MEM);
/* # of partitions had better be a power of 2 */
Assert(info->num_partitions == (1L << my_log2(info->num_partitions)));
hctl->num_partitions = info->num_partitions;
}
if (flags & HASH_SEGMENT)
{
@ -252,6 +385,11 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
hctl->entrysize = info->entrysize;
}
/* make local copies of heavily-used constant fields */
hashp->keysize = hctl->keysize;
hashp->ssize = hctl->ssize;
hashp->sshift = hctl->sshift;
/* Build the hash directory structure */
if (!init_htab(hashp, nelem))
{
@ -292,22 +430,29 @@ hdefault(HTAB *hashp)
MemSet(hctl, 0, sizeof(HASHHDR));
hctl->ssize = DEF_SEGSIZE;
hctl->sshift = DEF_SEGSIZE_SHIFT;
hctl->dsize = DEF_DIRSIZE;
hctl->ffactor = DEF_FFACTOR;
hctl->nentries = 0;
hctl->freeList = NULL;
hctl->dsize = DEF_DIRSIZE;
hctl->nsegs = 0;
/* rather pointless defaults for key & entry size */
hctl->keysize = sizeof(char *);
hctl->entrysize = 2 * sizeof(char *);
hctl->num_partitions = 0; /* not partitioned */
hctl->ffactor = DEF_FFACTOR;
/* table has no fixed maximum size */
hctl->max_dsize = NO_MAX_DSIZE;
/* garbage collection for HASH_REMOVE */
hctl->freeList = NULL;
hctl->ssize = DEF_SEGSIZE;
hctl->sshift = DEF_SEGSIZE_SHIFT;
#ifdef HASH_STATISTICS
hctl->accesses = hctl->collisions = 0;
#endif
}
/*
@ -342,6 +487,10 @@ choose_nelem_alloc(Size entrysize)
return nelem_alloc;
}
/*
* Compute derived fields of hctl and build the initial directory/segment
* arrays
*/
static bool
init_htab(HTAB *hashp, long nelem)
{
@ -351,6 +500,12 @@ init_htab(HTAB *hashp, long nelem)
int nbuckets;
int nsegs;
/*
* initialize mutex if it's a partitioned table
*/
if (IS_PARTITIONED(hctl))
SpinLockInit(&hctl->mutex);
/*
* Divide number of elements by the fill factor to determine a desired
* number of buckets. Allocate space for the next greater power of two
@ -360,6 +515,15 @@ init_htab(HTAB *hashp, long nelem)
nbuckets = 1 << my_log2(lnbuckets);
/*
* In a partitioned table, nbuckets must be at least equal to
* num_partitions; were it less, keys with apparently different partition
* numbers would map to the same bucket, breaking partition independence.
* (Normally nbuckets will be much bigger; this is just a safety check.)
*/
while (nbuckets < hctl->num_partitions)
nbuckets <<= 1;
hctl->max_bucket = hctl->low_mask = nbuckets - 1;
hctl->high_mask = (nbuckets << 1) - 1;
@ -491,6 +655,19 @@ hash_select_dirsize(long num_entries)
return nDirEntries;
}
/*
* Compute the required initial memory allocation for a shared-memory
* hashtable with the given parameters. We need space for the HASHHDR
* and for the (non expansible) directory.
*/
Size
hash_get_shared_size(HASHCTL *info, int flags)
{
Assert(flags & HASH_DIRSIZE);
Assert(info->dsize == info->max_dsize);
return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
}
/********************** DESTROY ROUTINES ************************/
@ -521,7 +698,7 @@ hash_stats(const char *where, HTAB *hashp)
where, hashp->hctl->accesses, hashp->hctl->collisions);
fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
hashp->hctl->nentries, hashp->hctl->keysize,
hashp->hctl->nentries, (long) hashp->hctl->keysize,
hashp->hctl->max_bucket, hashp->hctl->nsegs);
fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
where, hash_accesses, hash_collisions);
@ -533,6 +710,19 @@ hash_stats(const char *where, HTAB *hashp)
/*******************************SEARCH ROUTINES *****************************/
/*
* get_hash_value -- exported routine to calculate a key's hash value
*
* We export this because for partitioned tables, callers need to compute
* the partition number (from the low-order bits of the hash value) before
* searching.
*/
uint32
get_hash_value(HTAB *hashp, const void *keyPtr)
{
return hashp->hash(keyPtr, hashp->keysize);
}
/* Convert a hash value to a bucket number */
static inline uint32
calc_bucket(HASHHDR *hctl, uint32 hash_val)
@ -546,8 +736,9 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val)
return bucket;
}
/*----------
/*
* hash_search -- look up key in table and perform action
* hash_search_with_hash_value -- same, with key's hash value already computed
*
* action is one of:
* HASH_FIND: look up key in table
@ -568,17 +759,32 @@ calc_bucket(HASHHDR *hctl, uint32 hash_val)
* If foundPtr isn't NULL, then *foundPtr is set TRUE if we found an
* existing entry in the table, FALSE otherwise. This is needed in the
* HASH_ENTER case, but is redundant with the return value otherwise.
*----------
*
* For hash_search_with_hash_value, the hashvalue parameter must have been
* calculated with get_hash_value().
*/
void *
hash_search(HTAB *hashp,
const void *keyPtr,
HASHACTION action,
bool *foundPtr)
{
return hash_search_with_hash_value(hashp,
keyPtr,
hashp->hash(keyPtr, hashp->keysize),
action,
foundPtr);
}
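
A brief sketch of the HASH_ENTER path described in the comment above, on an ordinary single-lock table; MyEntry and my_hash are hypothetical, and entries are assumed to start with a uint32 key.

#include "postgres.h"
#include "utils/hsearch.h"

typedef struct MyEntry { uint32 key; int payload; } MyEntry;	/* hypothetical */

static void
my_insert(HTAB *my_hash, uint32 key, int payload)
{
	MyEntry    *entry;
	bool		found;

	/* HASH_ENTER ereports on out-of-memory; HASH_ENTER_NULL returns NULL instead */
	entry = (MyEntry *) hash_search(my_hash, (void *) &key,
									HASH_ENTER, &found);
	if (!found)
		entry->payload = payload;		/* caller fills the new entry's data */
}
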
void *
hash_search_with_hash_value(HTAB *hashp,
const void *keyPtr,
uint32 hashvalue,
HASHACTION action,
bool *foundPtr)
{
HASHHDR *hctl = hashp->hctl;
Size keysize = hctl->keysize;
uint32 hashvalue;
Size keysize;
uint32 bucket;
long segment_num;
long segment_ndx;
@ -595,11 +801,10 @@ hash_search(HTAB *hashp,
/*
* Do the initial lookup
*/
hashvalue = hashp->hash(keyPtr, keysize);
bucket = calc_bucket(hctl, hashvalue);
segment_num = bucket >> hctl->sshift;
segment_ndx = MOD(bucket, hctl->ssize);
segment_num = bucket >> hashp->sshift;
segment_ndx = MOD(bucket, hashp->ssize);
segp = hashp->dir[segment_num];
@ -613,6 +818,7 @@ hash_search(HTAB *hashp,
* Follow collision chain looking for matching key
*/
match = hashp->match; /* save one fetch in inner loop */
keysize = hashp->keysize; /* ditto */
while (currBucket != NULL)
{
@ -643,15 +849,25 @@ hash_search(HTAB *hashp,
case HASH_REMOVE:
if (currBucket != NULL)
{
Assert(hctl->nentries > 0);
hctl->nentries--;
/* use volatile pointer to prevent code rearrangement */
volatile HASHHDR *hctlv = hctl;
/* if partitioned, must lock to touch nentries and freeList */
if (IS_PARTITIONED(hctlv))
SpinLockAcquire(&hctlv->mutex);
Assert(hctlv->nentries > 0);
hctlv->nentries--;
/* remove record from hash bucket's chain. */
*prevBucketPtr = currBucket->link;
/* add the record to the freelist for this table. */
currBucket->link = hctl->freeList;
hctl->freeList = currBucket;
currBucket->link = hctlv->freeList;
hctlv->freeList = currBucket;
if (IS_PARTITIONED(hctlv))
SpinLockRelease(&hctlv->mutex);
/*
* better hope the caller is synchronizing access to this
@ -672,32 +888,23 @@ hash_search(HTAB *hashp,
if (currBucket != NULL)
return (void *) ELEMENTKEY(currBucket);
/* get the next free element */
currBucket = hctl->freeList;
currBucket = get_hash_entry(hashp);
if (currBucket == NULL)
{
/* no free elements. allocate another chunk of buckets */
if (!element_alloc(hashp, hctl->nelem_alloc))
{
/* out of memory */
if (action == HASH_ENTER_NULL)
return NULL;
/* report a generic message */
if (hashp->isshared)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of shared memory")));
else
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
currBucket = hctl->freeList;
Assert(currBucket != NULL);
/* out of memory */
if (action == HASH_ENTER_NULL)
return NULL;
/* report a generic message */
if (hashp->isshared)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of shared memory")));
else
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
hctl->freeList = currBucket->link;
/* link into hashbucket chain */
*prevBucketPtr = currBucket;
currBucket->link = NULL;
@ -708,8 +915,10 @@ hash_search(HTAB *hashp,
/* caller is expected to fill the data field on return */
/* Check if it is time to split the segment */
if (++hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
/* Check if it is time to split a bucket */
/* Can't split if running in partitioned mode */
if (!IS_PARTITIONED(hctl) &&
hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor)
{
/*
* NOTE: failure to expand table is not a fatal error, it just
@ -726,6 +935,61 @@ hash_search(HTAB *hashp,
return NULL; /* keep compiler quiet */
}
/*
* create a new entry if possible
*/
static HASHBUCKET
get_hash_entry(HTAB *hashp)
{
/* use volatile pointer to prevent code rearrangement */
volatile HASHHDR *hctlv = hashp->hctl;
HASHBUCKET newElement;
for (;;)
{
/* if partitioned, must lock to touch nentries and freeList */
if (IS_PARTITIONED(hctlv))
SpinLockAcquire(&hctlv->mutex);
/* try to get an entry from the freelist */
newElement = hctlv->freeList;
if (newElement != NULL)
break;
/* no free elements. allocate another chunk of buckets */
if (IS_PARTITIONED(hctlv))
SpinLockRelease(&hctlv->mutex);
if (!element_alloc(hashp, hctlv->nelem_alloc))
{
/* out of memory */
return NULL;
}
}
/* remove entry from freelist, bump nentries */
hctlv->freeList = newElement->link;
hctlv->nentries++;
if (IS_PARTITIONED(hctlv))
SpinLockRelease(&hctlv->mutex);
return newElement;
}
/*
* hash_get_num_entries -- get the number of entries in a hashtable
*/
long
hash_get_num_entries(HTAB *hashp)
{
/*
* We currently don't bother with the mutex; it's only sensible to call
* this function if you've got lock on all partitions of the table.
*/
return hashp->hctl->nentries;
}
/*
* hash_seq_init/_search
* Sequentially search through hash table and return
@ -736,6 +1000,9 @@ hash_search(HTAB *hashp,
* UNDEFINED (it might be the one that curIndex is pointing at!). Also,
* if elements are added to the table while the scan is in progress, it is
* unspecified whether they will be visited by the scan or not.
*
* NOTE: to use this with a partitioned hashtable, caller had better hold
* at least shared lock on all partitions of the table throughout the scan!
*/
void
hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
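
Following the note above about partitioned tables, a sketch of a whole-table scan that first takes every partition lock; the lock array and partition count are hypothetical and managed by the caller.

#include "postgres.h"
#include "storage/lwlock.h"
#include "utils/hsearch.h"

#define MY_NUM_PARTITIONS 16            /* hypothetical, power of 2 */

static long
my_count_entries(HTAB *hashp, LWLockId *partition_locks)
{
	HASH_SEQ_STATUS status;
	long		count = 0;
	int			i;

	/* hold (at least) shared lock on every partition for the whole scan */
	for (i = 0; i < MY_NUM_PARTITIONS; i++)
		LWLockAcquire(partition_locks[i], LW_SHARED);

	hash_seq_init(&status, hashp);
	while (hash_seq_search(&status) != NULL)
		count++;

	for (i = 0; i < MY_NUM_PARTITIONS; i++)
		LWLockRelease(partition_locks[i]);

	return count;
}
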
@ -773,7 +1040,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
curBucket = status->curBucket;
hashp = status->hashp;
hctl = hashp->hctl;
ssize = hctl->ssize;
ssize = hashp->ssize;
max_bucket = hctl->max_bucket;
if (curBucket > max_bucket)
@ -782,7 +1049,7 @@ hash_seq_search(HASH_SEQ_STATUS *status)
/*
* first find the right segment in the table directory.
*/
segment_num = curBucket >> hctl->sshift;
segment_num = curBucket >> hashp->sshift;
segment_ndx = MOD(curBucket, ssize);
segp = hashp->dir[segment_num];
@ -840,13 +1107,15 @@ expand_table(HTAB *hashp)
HASHBUCKET currElement,
nextElement;
Assert(!IS_PARTITIONED(hctl));
#ifdef HASH_STATISTICS
hash_expansions++;
#endif
new_bucket = hctl->max_bucket + 1;
new_segnum = new_bucket >> hctl->sshift;
new_segndx = MOD(new_bucket, hctl->ssize);
new_segnum = new_bucket >> hashp->sshift;
new_segndx = MOD(new_bucket, hashp->ssize);
if (new_segnum >= hctl->nsegs)
{
@ -885,8 +1154,8 @@ expand_table(HTAB *hashp)
* split at this point. With a different way of reducing the hash value,
* that might not be true!
*/
old_segnum = old_bucket >> hctl->sshift;
old_segndx = MOD(old_bucket, hctl->ssize);
old_segnum = old_bucket >> hashp->sshift;
old_segndx = MOD(old_bucket, hashp->ssize);
old_seg = hashp->dir[old_segnum];
new_seg = hashp->dir[new_segnum];
@ -963,12 +1232,12 @@ seg_alloc(HTAB *hashp)
HASHSEGMENT segp;
CurrentDynaHashCxt = hashp->hcxt;
segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->hctl->ssize);
segp = (HASHSEGMENT) hashp->alloc(sizeof(HASHBUCKET) * hashp->ssize);
if (!segp)
return NULL;
MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->hctl->ssize);
MemSet(segp, 0, sizeof(HASHBUCKET) * hashp->ssize);
return segp;
}
@ -979,29 +1248,44 @@ seg_alloc(HTAB *hashp)
static bool
element_alloc(HTAB *hashp, int nelem)
{
HASHHDR *hctl = hashp->hctl;
/* use volatile pointer to prevent code rearrangement */
volatile HASHHDR *hctlv = hashp->hctl;
Size elementSize;
HASHELEMENT *firstElement;
HASHELEMENT *tmpElement;
HASHELEMENT *prevElement;
int i;
/* Each element has a HASHELEMENT header plus user data. */
elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctl->entrysize);
elementSize = MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(hctlv->entrysize);
CurrentDynaHashCxt = hashp->hcxt;
tmpElement = (HASHELEMENT *)
hashp->alloc(nelem * elementSize);
firstElement = (HASHELEMENT *) hashp->alloc(nelem * elementSize);
if (!tmpElement)
if (!firstElement)
return false;
/* link all the new entries into the freelist */
/* prepare to link all the new entries into the freelist */
prevElement = NULL;
tmpElement = firstElement;
for (i = 0; i < nelem; i++)
{
tmpElement->link = hctl->freeList;
hctl->freeList = tmpElement;
tmpElement->link = prevElement;
prevElement = tmpElement;
tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
}
/* if partitioned, must lock to touch freeList */
if (IS_PARTITIONED(hctlv))
SpinLockAcquire(&hctlv->mutex);
/* freelist could be nonempty if two backends did this concurrently */
firstElement->link = hctlv->freeList;
hctlv->freeList = prevElement;
if (IS_PARTITIONED(hctlv))
SpinLockRelease(&hctlv->mutex);
return true;
}

src/include/utils/hsearch.h

@ -1,13 +1,13 @@
/*-------------------------------------------------------------------------
*
* hsearch.h
* for hash tables, particularly hash tables in shared memory
* exported definitions for utils/hash/dynahash.c; see notes therein
*
*
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.43 2006/06/25 18:29:49 tgl Exp $
* $PostgreSQL: pgsql/src/include/utils/hsearch.h,v 1.44 2006/07/22 23:04:39 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@ -43,27 +43,6 @@ typedef void *(*HashCopyFunc) (void *dest, const void *src, Size keysize);
*/
typedef void *(*HashAllocFunc) (Size request);
/*
* Constants
*
* A hash table has a top-level "directory", each of whose entries points
* to a "segment" of ssize bucket headers. The maximum number of hash
* buckets is thus dsize * ssize (but dsize may be expansible). Of course,
* the number of records in the table can be larger, but we don't want a
* whole lot of records per bucket or performance goes down.
*
* In a hash table allocated in shared memory, the directory cannot be
* expanded because it must stay at a fixed address. The directory size
* should be selected using hash_select_dirsize (and you'd better have
* a good idea of the maximum number of entries!). For non-shared hash
* tables, the initial directory size can be left at the default.
*/
#define DEF_SEGSIZE 256
#define DEF_SEGSIZE_SHIFT 8 /* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE 256
#define DEF_FFACTOR 1 /* default fill factor */
/*
* HASHELEMENT is the private part of a hashtable entry. The caller's data
* follows the HASHELEMENT structure (on a MAXALIGN'd boundary). The hash key
@ -75,81 +54,42 @@ typedef struct HASHELEMENT
uint32 hashvalue; /* hash function result for this entry */
} HASHELEMENT;
/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;
/* Hash table header struct is an opaque type known only within dynahash.c */
typedef struct HASHHDR HASHHDR;
/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;
/* Header structure for a hash table --- contains all changeable info */
typedef struct HASHHDR
{
long dsize; /* Directory Size */
long ssize; /* Segment Size --- must be power of 2 */
int sshift; /* Segment shift = log2(ssize) */
uint32 max_bucket; /* ID of Maximum bucket in use */
uint32 high_mask; /* Mask to modulo into entire table */
uint32 low_mask; /* Mask to modulo into lower half of table */
long ffactor; /* Fill factor */
long nentries; /* Number of entries in hash table */
long nsegs; /* Number of allocated segments */
Size keysize; /* hash key length in bytes */
Size entrysize; /* total user element size in bytes */
long max_dsize; /* 'dsize' limit if directory is fixed size */
int nelem_alloc; /* number of entries to allocate at once */
HASHELEMENT *freeList; /* linked list of free elements */
#ifdef HASH_STATISTICS
long accesses;
long collisions;
#endif
} HASHHDR;
/*
* Top control structure for a hashtable --- need not be shared, since
* no fields change at runtime
*/
typedef struct HTAB
{
HASHHDR *hctl; /* shared control information */
HASHSEGMENT *dir; /* directory of segment starts */
HashValueFunc hash; /* hash function */
HashCompareFunc match; /* key comparison function */
HashCopyFunc keycopy; /* key copying function */
HashAllocFunc alloc; /* memory allocator */
MemoryContext hcxt; /* memory context if default allocator used */
char *tabname; /* table name (for error messages) */
bool isshared; /* true if table is in shared memory */
} HTAB;
/* Hash table control struct is an opaque type known only within dynahash.c */
typedef struct HTAB HTAB;
/* Parameter data structure for hash_create */
/* Only those fields indicated by hash_flags need be set */
typedef struct HASHCTL
{
long ssize; /* Segment Size */
long dsize; /* (initial) Directory Size */
long max_dsize; /* limit to dsize if directory size is limited */
long ffactor; /* Fill factor */
long num_partitions; /* # partitions (must be power of 2) */
long ssize; /* segment size */
long dsize; /* (initial) directory size */
long max_dsize; /* limit to dsize if dir size is limited */
long ffactor; /* fill factor */
Size keysize; /* hash key length in bytes */
Size entrysize; /* total user element size in bytes */
HashValueFunc hash; /* hash function */
HashCompareFunc match; /* key comparison function */
HashCopyFunc keycopy; /* key copying function */
HashAllocFunc alloc; /* memory allocator */
HASHSEGMENT *dir; /* directory of segment starts */
HASHHDR *hctl; /* location of header in shared mem */
MemoryContext hcxt; /* memory context to use for allocations */
HASHHDR *hctl; /* location of header in shared mem */
} HASHCTL;
/* Flags to indicate which parameters are supplied */
#define HASH_PARTITION 0x001 /* Hashtable is used w/partitioned locking */
#define HASH_SEGMENT 0x002 /* Set segment size */
#define HASH_DIRSIZE 0x004 /* Set directory size */
#define HASH_DIRSIZE 0x004 /* Set directory size (initial and max) */
#define HASH_FFACTOR 0x008 /* Set fill factor */
#define HASH_FUNCTION 0x010 /* Set user defined hash function */
#define HASH_ELEM 0x020 /* Set key/entry size */
#define HASH_ELEM 0x020 /* Set keysize and entrysize */
#define HASH_SHARED_MEM 0x040 /* Hashtable is in shared memory */
#define HASH_ATTACH 0x080 /* Do not initialize hctl */
#define HASH_ALLOC 0x100 /* Set memory allocator */
#define HASH_CONTEXT 0x200 /* Set explicit memory context */
#define HASH_CONTEXT 0x200 /* Set memory allocation context */
#define HASH_COMPARE 0x400 /* Set user defined comparison function */
#define HASH_KEYCOPY 0x800 /* Set user defined key-copying function */
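
For contrast with the shared-memory case, a sketch of creating a plain backend-local table, setting only the HASHCTL fields named by the flags per the comment above; MyEntry and my_create_local_table are hypothetical.

#include "postgres.h"
#include "utils/hsearch.h"

typedef struct MyEntry { uint32 key; int payload; } MyEntry;	/* hypothetical */

static HTAB *
my_create_local_table(void)
{
	HASHCTL		info;

	MemSet(&info, 0, sizeof(info));
	info.keysize = sizeof(uint32);
	info.entrysize = sizeof(MyEntry);
	info.hash = tag_hash;

	/* only keysize/entrysize (HASH_ELEM) and hash (HASH_FUNCTION) are read */
	return hash_create("my local table", 128, &info,
					   HASH_ELEM | HASH_FUNCTION);
}
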
@ -183,10 +123,16 @@ extern void hash_destroy(HTAB *hashp);
extern void hash_stats(const char *where, HTAB *hashp);
extern void *hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action,
bool *foundPtr);
extern uint32 get_hash_value(HTAB *hashp, const void *keyPtr);
extern void *hash_search_with_hash_value(HTAB *hashp, const void *keyPtr,
uint32 hashvalue, HASHACTION action,
bool *foundPtr);
extern long hash_get_num_entries(HTAB *hashp);
extern void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp);
extern void *hash_seq_search(HASH_SEQ_STATUS *status);
extern Size hash_estimate_size(long num_entries, Size entrysize);
extern long hash_select_dirsize(long num_entries);
extern Size hash_get_shared_size(HASHCTL *info, int flags);
/*
* prototypes for functions in hashfn.c