diff --git a/doc/src/sgml/ref/pgbench.sgml b/doc/src/sgml/ref/pgbench.sgml
index a8085463a5e..2517a3abe7f 100644
--- a/doc/src/sgml/ref/pgbench.sgml
+++ b/doc/src/sgml/ref/pgbench.sgml
@@ -326,8 +326,7 @@ pgbench <optional> <replaceable>options</> </optional> <replaceable>dbname</>
        <para>
         Number of worker threads within <application>pgbench</application>.
         Using more than one thread can be helpful on multi-CPU machines.
-        The number of clients must be a multiple of the number of threads,
-        since each thread is given the same number of client sessions to manage.
+        Clients are distributed as evenly as possible among available threads.
         Default is 1.
        </para>
       </listitem>
diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c
index 2c3e3650c8a..74c3371c21d 100644
--- a/src/bin/pgbench/pgbench.c
+++ b/src/bin/pgbench/pgbench.c
@@ -2819,6 +2819,7 @@ main(int argc, char **argv)
 	int64		latency_late = 0;
 
 	int			i;
+	int			nclients_dealt;
 
 #ifdef HAVE_GETRLIMIT
 	struct rlimit rlim;
@@ -3114,6 +3115,14 @@ main(int argc, char **argv)
 		}
 	}
 
+	/*
+	 * Don't need more threads than there are clients.  (This is not merely an
+	 * optimization: throttle_delay is multiplied by nthreads below, which is
+	 * only correct if every thread has at least one client assigned to it.)
+	 */
+	if (nthreads > nclients)
+		nthreads = nclients;
+
 	/* compute a per thread delay */
 	throttle_delay *= nthreads;
 
@@ -3153,12 +3162,6 @@ main(int argc, char **argv)
 	if (nxacts <= 0 && duration <= 0)
 		nxacts = DEFAULT_NXACTS;
 
-	if (nclients % nthreads != 0)
-	{
-		fprintf(stderr, "number of clients (%d) must be a multiple of number of threads (%d)\n", nclients, nthreads);
-		exit(1);
-	}
-
 	/* --sampling-rate may be used only with -l */
 	if (sample_rate > 0.0 && !use_log)
 	{
@@ -3359,19 +3362,24 @@ main(int argc, char **argv)
 
 	/* set up thread data structures */
 	threads = (TState *) pg_malloc(sizeof(TState) * nthreads);
+	nclients_dealt = 0;
+
 	for (i = 0; i < nthreads; i++)
 	{
 		TState	   *thread = &threads[i];
 
 		thread->tid = i;
-		thread->state = &state[nclients / nthreads * i];
-		thread->nstate = nclients / nthreads;
+		thread->state = &state[nclients_dealt];
+		thread->nstate =
+			(nclients - nclients_dealt + nthreads - i - 1) / (nthreads - i);
 		thread->random_state[0] = random();
 		thread->random_state[1] = random();
 		thread->random_state[2] = random();
 		thread->throttle_latency_skipped = 0;
 		thread->latency_late = 0;
 
+		nclients_dealt += thread->nstate;
+
 		if (is_latencies)
 		{
 			/* Reserve memory for the thread to store per-command latencies */
@@ -3395,6 +3403,9 @@ main(int argc, char **argv)
 		}
 	}
 
+	/* all clients must be assigned to a thread */
+	Assert(nclients_dealt == nclients);
+
 	/* get start up time */
 	INSTR_TIME_SET_CURRENT(start_time);