Fix replication origin-related race conditions

Similar to what was fixed in commit 9915de6c1c for replication slots,
but this time it's related to replication origins: DROP SUBSCRIPTION
attempts to drop the replication origin, but that fails if the
replication worker process hasn't yet marked it unused.  This causes
failures in the buildfarm:
ERROR:  could not drop replication origin with OID 1, in use by PID 34069

Like the aforementioned commit, fix by having the process running DROP
SUBSCRIPTION sleep until the worker marks the the replication origin
struct as free.  This uses a condition variable on each replication
origin shmem state struct, so that the session trying to drop can sleep
and expect to be awakened by the process keeping the origin open.

Also fix a SGML markup in the previous commit.

Discussion: https://postgr.es/m/20170808001433.rozlseaf4m2wkw3n@alvherre.pgsql
This commit is contained in:
Alvaro Herrera 2017-08-08 16:07:46 -04:00
parent 030273b7ea
commit b2c95a3798
6 changed files with 58 additions and 15 deletions

View File

@ -1222,11 +1222,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
</row>
<row>
<entry><literal>LibPQWalReceiverConnect</></entry>
<entry>Waiting in WAL receiver to establish connection to remote server.<entry>
<entry>Waiting in WAL receiver to establish connection to remote server.</entry>
</row>
<row>
<entry><literal>LibPQWalReceiverReceive</></entry>
<entry>Waiting in WAL receiver to receive data from remote server.<entry>
<entry>Waiting in WAL receiver to receive data from remote server.</entry>
</row>
<row>
<entry><literal>SSLOpenServer</></entry>
@ -1302,6 +1302,10 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser
<entry><literal>ProcArrayGroupUpdate</></entry>
<entry>Waiting for group leader to clear transaction id at transaction end.</entry>
</row>
<row>
<entry><literal>ReplicationOriginDrop</></entry>
<entry>Waiting for a replication origin to become inactive to be dropped.</entry>
</row>
<row>
<entry><literal>ReplicationSlotDrop</></entry>
<entry>Waiting for a replication slot to become inactive to be dropped.</entry>

View File

@ -939,7 +939,7 @@ DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel)
snprintf(originname, sizeof(originname), "pg_%u", subid);
originid = replorigin_by_name(originname, true);
if (originid != InvalidRepOriginId)
replorigin_drop(originid);
replorigin_drop(originid, false);
/*
* If there is no slot associated with the subscription, we can finish

View File

@ -3609,6 +3609,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
case WAIT_EVENT_PROCARRAY_GROUP_UPDATE:
event_name = "ProcArrayGroupUpdate";
break;
case WAIT_EVENT_REPLICATION_ORIGIN_DROP:
event_name = "ReplicationOriginDrop";
break;
case WAIT_EVENT_REPLICATION_SLOT_DROP:
event_name = "ReplicationSlotDrop";
break;

View File

@ -79,15 +79,15 @@
#include "access/xact.h"
#include "catalog/indexing.h"
#include "nodes/execnodes.h"
#include "replication/origin.h"
#include "replication/logical.h"
#include "pgstat.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/condition_variable.h"
#include "storage/copydir.h"
#include "utils/builtins.h"
@ -124,6 +124,11 @@ typedef struct ReplicationState
*/
int acquired_by;
/*
* Condition variable that's signalled when acquired_by changes.
*/
ConditionVariable origin_cv;
/*
* Lock protecting remote_lsn and local_lsn.
*/
@ -324,9 +329,9 @@ replorigin_create(char *roname)
* Needs to be called in a transaction.
*/
void
replorigin_drop(RepOriginId roident)
replorigin_drop(RepOriginId roident, bool nowait)
{
HeapTuple tuple = NULL;
HeapTuple tuple;
Relation rel;
int i;
@ -334,6 +339,8 @@ replorigin_drop(RepOriginId roident)
rel = heap_open(ReplicationOriginRelationId, ExclusiveLock);
restart:
tuple = NULL;
/* cleanup the slot state info */
LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
@ -346,11 +353,21 @@ replorigin_drop(RepOriginId roident)
{
if (state->acquired_by != 0)
{
ereport(ERROR,
(errcode(ERRCODE_OBJECT_IN_USE),
errmsg("could not drop replication origin with OID %d, in use by PID %d",
state->roident,
state->acquired_by)));
ConditionVariable *cv;
if (nowait)
ereport(ERROR,
(errcode(ERRCODE_OBJECT_IN_USE),
errmsg("could not drop replication origin with OID %d, in use by PID %d",
state->roident,
state->acquired_by)));
cv = &state->origin_cv;
LWLockRelease(ReplicationOriginLock);
ConditionVariablePrepareToSleep(cv);
ConditionVariableSleep(cv, WAIT_EVENT_REPLICATION_ORIGIN_DROP);
ConditionVariableCancelSleep();
goto restart;
}
/* first WAL log */
@ -382,7 +399,7 @@ replorigin_drop(RepOriginId roident)
CommandCounterIncrement();
/* now release lock again, */
/* now release lock again */
heap_close(rel, ExclusiveLock);
}
@ -476,8 +493,11 @@ ReplicationOriginShmemInit(void)
MemSet(replication_states, 0, ReplicationOriginShmemSize());
for (i = 0; i < max_replication_slots; i++)
{
LWLockInitialize(&replication_states[i].lock,
replication_states_ctl->tranche_id);
ConditionVariableInit(&replication_states[i].origin_cv);
}
}
LWLockRegisterTranche(replication_states_ctl->tranche_id,
@ -957,16 +977,23 @@ replorigin_get_progress(RepOriginId node, bool flush)
static void
ReplicationOriginExitCleanup(int code, Datum arg)
{
ConditionVariable *cv = NULL;
LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
if (session_replication_state != NULL &&
session_replication_state->acquired_by == MyProcPid)
{
cv = &session_replication_state->origin_cv;
session_replication_state->acquired_by = 0;
session_replication_state = NULL;
}
LWLockRelease(ReplicationOriginLock);
if (cv)
ConditionVariableBroadcast(cv);
}
/*
@ -1056,6 +1083,9 @@ replorigin_session_setup(RepOriginId node)
session_replication_state->acquired_by = MyProcPid;
LWLockRelease(ReplicationOriginLock);
/* probably this one is pointless */
ConditionVariableBroadcast(&session_replication_state->origin_cv);
}
/*
@ -1067,6 +1097,8 @@ replorigin_session_setup(RepOriginId node)
void
replorigin_session_reset(void)
{
ConditionVariable *cv;
Assert(max_replication_slots != 0);
if (session_replication_state == NULL)
@ -1077,9 +1109,12 @@ replorigin_session_reset(void)
LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE);
session_replication_state->acquired_by = 0;
cv = &session_replication_state->origin_cv;
session_replication_state = NULL;
LWLockRelease(ReplicationOriginLock);
ConditionVariableBroadcast(cv);
}
/*
@ -1170,7 +1205,7 @@ pg_replication_origin_drop(PG_FUNCTION_ARGS)
roident = replorigin_by_name(name, false);
Assert(OidIsValid(roident));
replorigin_drop(roident);
replorigin_drop(roident, false);
pfree(name);

View File

@ -812,6 +812,7 @@ typedef enum
WAIT_EVENT_PARALLEL_FINISH,
WAIT_EVENT_PARALLEL_BITMAP_SCAN,
WAIT_EVENT_PROCARRAY_GROUP_UPDATE,
WAIT_EVENT_REPLICATION_ORIGIN_DROP,
WAIT_EVENT_REPLICATION_SLOT_DROP,
WAIT_EVENT_SAFE_SNAPSHOT,
WAIT_EVENT_SYNC_REP

View File

@ -41,7 +41,7 @@ extern PGDLLIMPORT TimestampTz replorigin_session_origin_timestamp;
/* API for querying & manipulating replication origins */
extern RepOriginId replorigin_by_name(char *name, bool missing_ok);
extern RepOriginId replorigin_create(char *name);
extern void replorigin_drop(RepOriginId roident);
extern void replorigin_drop(RepOriginId roident, bool nowait);
extern bool replorigin_by_oid(RepOriginId roident, bool missing_ok,
char **roname);