Fix old-fd issues using global barriers everywhere.

Commits 4eb21763 and b74e94dc introduced a way to force every backend to close all relation files, to fix an ancient Windows-only bug. This commit extends that behavior to all operating systems and adds a couple of extra barrier points, to fix a totally different class of bug: the reuse of relfilenodes in scenarios that have no other kind of cache invalidation to prevent file descriptor mix-ups. In all releases, data corruption could occur when you moved a database to another tablespace and then back again. Despite that, no back-patch for now as the infrastructure required is too new and invasive. In master only, since commit aa010514, it could also happen when using CREATE DATABASE with a user-supplied OID or via pg_upgrade. Author: Andres Freund <andres@anarazel.de> Reviewed-by: Robert Haas <robertmhaas@gmail.com> Reviewed-by: Thomas Munro <thomas.munro@gmail.com> Discussion: https://postgr.es/m/20220209220004.kb3dgtn2x2k2gtdm%40alap3.anarazel.de
2025-02-23 19:39:53 +08:00 · 2022-05-07 15:19:52 +12:00 · 2022-05-07 15:19:52 +12:00 · e2f65f4255
commit e2f65f4255
parent b74e94dc27
5 changed files with 241 additions and 25 deletions
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@ -1687,10 +1687,8 @@ dropdb(const char *dbname, bool missing_ok, bool force)
 	 */
 	RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT);

-#if defined(USE_BARRIER_SMGRRELEASE)
 	/* Close all smgr fds in all backends. */
 	WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
-#endif

 	/*
 	 * Remove all tablespace subdirs belonging to the database.
@ -1940,10 +1938,8 @@ movedb(const char *dbname, const char *tblspcname)
 	RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_FORCE | CHECKPOINT_WAIT
 					  | CHECKPOINT_FLUSH_ALL);

-#if defined(USE_BARRIER_SMGRRELEASE)
 	/* Close all smgr fds in all backends. */
 	WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
-#endif

 	/*
 	 * Now drop all buffers holding data of the target database; they should
@ -3054,6 +3050,9 @@ dbase_redo(XLogReaderState *record)
 		 */
 		FlushDatabaseBuffers(xlrec->src_db_id);

+		/* Close all sgmr fds in all backends. */
+		WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
+
 		/*
 		 * Copy this subdirectory to the new location
 		 *
@ -3111,10 +3110,8 @@ dbase_redo(XLogReaderState *record)
 		/* Clean out the xlog relcache too */
 		XLogDropDatabase(xlrec->db_id);

-#if defined(USE_BARRIER_SMGRRELEASE)
 		/* Close all sgmr fds in all backends. */
 		WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
-#endif

 		for (i = 0; i < xlrec->ntablespaces; i++)
 		{
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@ -548,11 +548,10 @@ DropTableSpace(DropTableSpaceStmt *stmt)
 		 * use a global barrier to ask all backends to close all files, and
 		 * wait until they're finished.
 		 */
-#if defined(USE_BARRIER_SMGRRELEASE)
 		LWLockRelease(TablespaceCreateLock);
 		WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
 		LWLockAcquire(TablespaceCreateLock, LW_EXCLUSIVE);
-#endif
+
 		/* And now try again. */
 		if (!destroy_tablespace_directories(tablespaceoid, false))
 		{
@ -1574,6 +1573,9 @@ tblspc_redo(XLogReaderState *record)
 	{
 		xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record);

+		/* Close all smgr fds in all backends. */
+		WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
+
 		/*
 		 * If we issued a WAL record for a drop tablespace it implies that
 		 * there were no files in it at all when the DROP was done. That means
@ -1591,11 +1593,6 @@ tblspc_redo(XLogReaderState *record)
 		 */
 		if (!destroy_tablespace_directories(xlrec->ts_id, true))
 		{
-#if defined(USE_BARRIER_SMGRRELEASE)
-			/* Close all smgr fds in all backends. */
-			WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
-#endif
-
 			ResolveRecoveryConflictWithTablespace(xlrec->ts_id);

 			/*
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@ -152,17 +152,6 @@
 #define EXEC_BACKEND
 #endif

-/*
- * If USE_BARRIER_SMGRRELEASE is defined, certain code paths that unlink
- * directories will ask other backends to close all smgr file descriptors.
- * This is enabled on Windows, because otherwise unlinked but still open files
- * can prevent rmdir(containing_directory) from succeeding.  On other
- * platforms, it can be defined to exercise those code paths.
- */
-#if defined(WIN32)
-#define USE_BARRIER_SMGRRELEASE
-#endif
-
 /*
 * Define this if your operating system supports link()
 */
--- a/src/test/recovery/Makefile
+++ b/src/test/recovery/Makefile
@ -9,7 +9,7 @@
 #
 #-------------------------------------------------------------------------

-EXTRA_INSTALL=contrib/test_decoding
+EXTRA_INSTALL=contrib/test_decoding contrib/pg_prewarm

 subdir = src/test/recovery
 top_builddir = ../../..
--- a/src/test/recovery/t/032_relfilenode_reuse.pl
+++ b/src/test/recovery/t/032_relfilenode_reuse.pl
@ -0,0 +1,233 @@
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use File::Basename;
+
+
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+$node_primary->append_conf('postgresql.conf', q[
+allow_in_place_tablespaces = true
+log_connections=on
+# to avoid "repairing" corruption
+full_page_writes=off
+log_min_messages=debug2
+autovacuum_naptime=1s
+shared_buffers=1MB
+]);
+$node_primary->start;
+
+
+# Create streaming standby linking to primary
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	has_streaming => 1);
+$node_standby->start;
+
+# To avoid hanging while expecting some specific input from a psql
+# instance being driven by us, add a timeout high enough that it
+# should never trigger even on very slow machines, unless something
+# is really wrong.
+my $psql_timeout = IPC::Run::timer(300);
+
+my %psql_primary = (stdin => '', stdout => '', stderr => '');
+$psql_primary{run} = IPC::Run::start(
+	[ 'psql', '-XA', '-f', '-', '-d', $node_primary->connstr('postgres') ],
+	'<',
+	\$psql_primary{stdin},
+	'>',
+	\$psql_primary{stdout},
+	'2>',
+	\$psql_primary{stderr},
+	$psql_timeout);
+
+my %psql_standby = ('stdin' => '', 'stdout' => '', 'stderr' => '');
+$psql_standby{run} = IPC::Run::start(
+	[ 'psql', '-XA', '-f', '-', '-d', $node_standby->connstr('postgres') ],
+	'<',
+	\$psql_standby{stdin},
+	'>',
+	\$psql_standby{stdout},
+	'2>',
+	\$psql_standby{stderr},
+	$psql_timeout);
+
+
+# Create template database with a table that we'll update, to trigger dirty
+# rows. Using a template database + preexisting rows makes it a bit easier to
+# reproduce, because there's no cache invalidations generated.
+
+$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db_template OID = 50000;");
+$node_primary->safe_psql('conflict_db_template', q[
+    CREATE TABLE large(id serial primary key, dataa text, datab text);
+    INSERT INTO large(dataa, datab) SELECT g.i::text, 1 FROM generate_series(1, 4000) g(i);]);
+$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;");
+
+$node_primary->safe_psql('postgres', q[
+    CREATE EXTENSION pg_prewarm;
+    CREATE TABLE replace_sb(data text);
+    INSERT INTO replace_sb(data) SELECT random()::text FROM generate_series(1, 15000);]);
+
+# Use longrunning transactions, so that AtEOXact_SMgr doesn't close files
+send_query_and_wait(
+	\%psql_primary,
+	q[BEGIN;],
+	qr/BEGIN/m);
+send_query_and_wait(
+	\%psql_standby,
+	q[BEGIN;],
+	qr/BEGIN/m);
+
+# Cause lots of dirty rows in shared_buffers
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 1;");
+
+# Now do a bunch of work in another database. That will end up needing to
+# write back dirty data from the previous step, opening the relevant file
+# descriptors
+cause_eviction(\%psql_primary, \%psql_standby);
+
+# drop and recreate database
+$node_primary->safe_psql('postgres', "DROP DATABASE conflict_db;");
+$node_primary->safe_psql('postgres', "CREATE DATABASE conflict_db TEMPLATE conflict_db_template OID = 50001;");
+
+verify($node_primary, $node_standby, 1,
+	   "initial contents as expected");
+
+# Again cause lots of dirty rows in shared_buffers, but use a different update
+# value so we can check everything is OK
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 2;");
+
+# Again cause a lot of IO. That'll again write back dirty data, but uses (XXX
+# adjust after bugfix) the already opened file descriptor.
+# FIXME
+cause_eviction(\%psql_primary, \%psql_standby);
+
+verify($node_primary, $node_standby, 2,
+	   "update to reused relfilenode (due to DB oid conflict) is not lost");
+
+
+$node_primary->safe_psql('conflict_db', "VACUUM FULL large;");
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 3;");
+
+verify($node_primary, $node_standby, 3,
+	   "restored contents as expected");
+
+# Test for old filehandles after moving a database in / out of tablespace
+$node_primary->safe_psql('postgres', q[CREATE TABLESPACE test_tablespace LOCATION '']);
+
+# cause dirty buffers
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 4;");
+# cause files to be opened in backend in other database
+cause_eviction(\%psql_primary, \%psql_standby);
+
+# move database back / forth
+$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');
+$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE pg_default');
+
+# cause dirty buffers
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 5;");
+cause_eviction(\%psql_primary, \%psql_standby);
+
+verify($node_primary, $node_standby, 5,
+	   "post move contents as expected");
+
+$node_primary->safe_psql('postgres', 'ALTER DATABASE conflict_db SET TABLESPACE test_tablespace');
+
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 7;");
+cause_eviction(\%psql_primary, \%psql_standby);
+$node_primary->safe_psql('conflict_db', "UPDATE large SET datab = 8;");
+$node_primary->safe_psql('postgres', 'DROP DATABASE conflict_db');
+$node_primary->safe_psql('postgres', 'DROP TABLESPACE test_tablespace');
+
+$node_primary->safe_psql('postgres', 'REINDEX TABLE pg_database');
+
+
+# explicitly shut down psql instances gracefully - to avoid hangs
+# or worse on windows
+$psql_primary{stdin} .= "\\q\n";
+$psql_primary{run}->finish;
+$psql_standby{stdin} .= "\\q\n";
+$psql_standby{run}->finish;
+
+$node_primary->stop();
+$node_standby->stop();
+
+# Make sure that there weren't crashes during shutdown
+
+command_like([ 'pg_controldata', $node_primary->data_dir ],
+	qr/Database cluster state:\s+shut down\n/, 'primary shut down ok');
+command_like([ 'pg_controldata', $node_standby->data_dir ],
+	qr/Database cluster state:\s+shut down in recovery\n/, 'standby shut down ok');
+done_testing();
+
+sub verify
+{
+	my ($primary, $standby, $counter, $message) = @_;
+
+	my $query = "SELECT datab, count(*) FROM large GROUP BY 1 ORDER BY 1 LIMIT 10";
+	is($primary->safe_psql('conflict_db', $query),
+	   "$counter|4000",
+	   "primary: $message");
+
+	$primary->wait_for_catchup($standby);
+	is($standby->safe_psql('conflict_db', $query),
+	   "$counter|4000",
+	   "standby: $message");
+}
+
+sub cause_eviction
+{
+	my ($psql_primary, $psql_standby) = @_;
+
+	send_query_and_wait(
+		$psql_primary,
+		q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;],
+		qr/warmed_buffers/m);
+
+	send_query_and_wait(
+		$psql_standby,
+		q[SELECT SUM(pg_prewarm(oid)) warmed_buffers FROM pg_class WHERE pg_relation_filenode(oid) != 0;],
+		qr/warmed_buffers/m);
+}
+
+# Send query, wait until string matches
+sub send_query_and_wait
+{
+	my ($psql, $query, $untl) = @_;
+	my $ret;
+
+	# send query
+	$$psql{stdin} .= $query;
+	$$psql{stdin} .= "\n";
+
+	# wait for query results
+	$$psql{run}->pump_nb();
+	while (1)
+	{
+		last if $$psql{stdout} =~ /$untl/;
+
+		if ($psql_timeout->is_expired)
+		{
+			BAIL_OUT("aborting wait: program timed out\n"
+				  . "stream contents: >>$$psql{stdout}<<\n"
+				  . "pattern searched for: $untl\n");
+			return 0;
+		}
+		if (not $$psql{run}->pumpable())
+		{
+			BAIL_OUT("aborting wait: program died\n"
+				  . "stream contents: >>$$psql{stdout}<<\n"
+				  . "pattern searched for: $untl\n");
+			return 0;
+		}
+		$$psql{run}->pump();
+	}
+
+	$$psql{stdout} = '';
+
+	return 1;
+}