Fix signal handling in logical replication workers

The logical replication worker processes now use the normal die()
handler for SIGTERM and CHECK_FOR_INTERRUPTS() instead of custom code.
One problem before was that the apply worker would not exit promptly
when a subscription was dropped, which could lead to deadlocks.

Author: Petr Jelinek <petr.jelinek@2ndquadrant.com>
Reported-by: Masahiko Sawada <sawada.mshk@gmail.com>
This commit is contained in:
Peter Eisentraut 2017-06-02 14:46:00 -04:00
parent acbd8375e9
commit 9fcf670c2e
6 changed files with 50 additions and 21 deletions

View File

@ -80,8 +80,8 @@ static void logicalrep_worker_detach(void);
static void logicalrep_worker_cleanup(LogicalRepWorker *worker);
/* Flags set by signal handlers */
volatile sig_atomic_t got_SIGHUP = false;
volatile sig_atomic_t got_SIGTERM = false;
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t got_SIGTERM = false;
static bool on_commit_launcher_wakeup = false;
@ -624,8 +624,8 @@ logicalrep_worker_onexit(int code, Datum arg)
}
/* SIGTERM: set flag to exit at next convenient time */
void
logicalrep_worker_sigterm(SIGNAL_ARGS)
static void
logicalrep_launcher_sigterm(SIGNAL_ARGS)
{
int save_errno = errno;
@ -638,8 +638,8 @@ logicalrep_worker_sigterm(SIGNAL_ARGS)
}
/* SIGHUP: set flag to reload configuration at next convenient time */
void
logicalrep_worker_sighup(SIGNAL_ARGS)
static void
logicalrep_launcher_sighup(SIGNAL_ARGS)
{
int save_errno = errno;
@ -799,8 +799,8 @@ ApplyLauncherMain(Datum main_arg)
before_shmem_exit(logicalrep_launcher_onexit, (Datum) 0);
/* Establish signal handlers. */
pqsignal(SIGHUP, logicalrep_worker_sighup);
pqsignal(SIGTERM, logicalrep_worker_sigterm);
pqsignal(SIGHUP, logicalrep_launcher_sighup);
pqsignal(SIGTERM, logicalrep_launcher_sigterm);
BackgroundWorkerUnblockSignals();
/* Make it easy to identify our processes. */

View File

@ -154,10 +154,12 @@ wait_for_sync_status_change(Oid relid, char origstate)
int rc;
char state = origstate;
while (!got_SIGTERM)
for (;;)
{
LogicalRepWorker *worker;
CHECK_FOR_INTERRUPTS();
LWLockAcquire(LogicalRepWorkerLock, LW_SHARED);
worker = logicalrep_worker_find(MyLogicalRepWorker->subid,
relid, false);
@ -525,7 +527,7 @@ copy_read_data(void *outbuf, int minread, int maxread)
bytesread += avail;
}
while (!got_SIGTERM && maxread > 0 && bytesread < minread)
while (maxread > 0 && bytesread < minread)
{
pgsocket fd = PGINVALID_SOCKET;
int rc;
@ -579,10 +581,6 @@ copy_read_data(void *outbuf, int minread, int maxread)
ResetLatch(&MyProc->procLatch);
}
/* Check for exit condition. */
if (got_SIGTERM)
proc_exit(0);
return bytesread;
}

View File

@ -72,6 +72,8 @@
#include "storage/proc.h"
#include "storage/procarray.h"
#include "tcop/tcopprot.h"
#include "utils/builtins.h"
#include "utils/catcache.h"
#include "utils/datum.h"
@ -118,6 +120,9 @@ static void store_flush_position(XLogRecPtr remote_lsn);
static void reread_subscription(void);
/* Flags set by signal handlers */
static volatile sig_atomic_t got_SIGHUP = false;
/*
* Should this worker apply changes for given relation.
*
@ -1005,7 +1010,7 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
/* mark as idle, before starting to loop */
pgstat_report_activity(STATE_IDLE, NULL);
while (!got_SIGTERM)
for (;;)
{
pgsocket fd = PGINVALID_SOCKET;
int rc;
@ -1015,6 +1020,8 @@ LogicalRepApplyLoop(XLogRecPtr last_received)
TimestampTz last_recv_timestamp = GetCurrentTimestamp();
bool ping_sent = false;
CHECK_FOR_INTERRUPTS();
MemoryContextSwitchTo(ApplyMessageContext);
len = walrcv_receive(wrconn, &buf, &fd);
@ -1437,6 +1444,19 @@ subscription_change_cb(Datum arg, int cacheid, uint32 hashvalue)
MySubscriptionValid = false;
}
/* SIGHUP: set flag to reload configuration at next convenient time */
static void
logicalrep_worker_sighup(SIGNAL_ARGS)
{
int save_errno = errno;
got_SIGHUP = true;
/* Waken anything waiting on the process latch */
SetLatch(MyLatch);
errno = save_errno;
}
/* Logical Replication Apply worker entry point */
void
@ -1454,7 +1474,7 @@ ApplyWorkerMain(Datum main_arg)
/* Setup signal handling */
pqsignal(SIGHUP, logicalrep_worker_sighup);
pqsignal(SIGTERM, logicalrep_worker_sigterm);
pqsignal(SIGTERM, die);
BackgroundWorkerUnblockSignals();
/* Initialise stats to a sanish value */
@ -1604,6 +1624,14 @@ ApplyWorkerMain(Datum main_arg)
/* Run the main loop. */
LogicalRepApplyLoop(origin_startpos);
/* We should only get here if we received SIGTERM */
proc_exit(0);
}
/*
* Is current process a logical replication worker?
*/
bool
IsLogicalWorker(void)
{
return MyLogicalRepWorker != NULL;
}

View File

@ -55,6 +55,7 @@
#include "pg_getopt.h"
#include "postmaster/autovacuum.h"
#include "postmaster/postmaster.h"
#include "replication/logicalworker.h"
#include "replication/slot.h"
#include "replication/walsender.h"
#include "rewrite/rewriteHandler.h"
@ -2845,6 +2846,10 @@ ProcessInterrupts(void)
ereport(FATAL,
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating autovacuum process due to administrator command")));
else if (IsLogicalWorker())
ereport(FATAL,
(errcode(ERRCODE_ADMIN_SHUTDOWN),
errmsg("terminating logical replication worker due to administrator command")));
else if (RecoveryConflictPending && RecoveryConflictRetryable)
{
pgstat_report_recovery_conflict(RecoveryConflictReason);

View File

@ -14,4 +14,6 @@
extern void ApplyWorkerMain(Datum main_arg);
extern bool IsLogicalWorker(void);
#endif /* LOGICALWORKER_H */

View File

@ -67,8 +67,6 @@ extern Subscription *MySubscription;
extern LogicalRepWorker *MyLogicalRepWorker;
extern bool in_remote_transaction;
extern volatile sig_atomic_t got_SIGHUP;
extern volatile sig_atomic_t got_SIGTERM;
extern void logicalrep_worker_attach(int slot);
extern LogicalRepWorker *logicalrep_worker_find(Oid subid, Oid relid,
@ -81,8 +79,6 @@ extern void logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker);
extern int logicalrep_sync_worker_count(Oid subid);
extern void logicalrep_worker_sighup(SIGNAL_ARGS);
extern void logicalrep_worker_sigterm(SIGNAL_ARGS);
extern char *LogicalRepSyncTableStart(XLogRecPtr *origin_startpos);
void process_syncing_tables(XLogRecPtr current_lsn);
void invalidate_syncing_table_states(Datum arg, int cacheid,