mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-01-06 15:24:56 +08:00
Rearrange mdsync() looping logic to avoid the problem that a sufficiently fast flow of new fsync requests can prevent mdsync() from ever completing.
This was an unforeseen consequence of a patch added in March 2006 to prevent the fsync request queue from overflowing. The problem was identified by Heikki Linnakangas and independently by ITAGAKI Takahiro; the fix is based on ideas from Takahiro-san, Heikki, and Tom. Back-patch as far as 8.1 because a previous back-patch introduced the problem into 8.1 ...
This commit is contained in:
parent
ebb6bae539
commit
995ba280c1
@ -8,7 +8,7 @@
|
|||||||
*
|
*
|
||||||
*
|
*
|
||||||
* IDENTIFICATION
|
* IDENTIFICATION
|
||||||
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.127 2007/01/17 16:25:01 tgl Exp $
|
* $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.128 2007/04/12 17:10:55 tgl Exp $
|
||||||
*
|
*
|
||||||
*-------------------------------------------------------------------------
|
*-------------------------------------------------------------------------
|
||||||
*/
|
*/
|
||||||
@ -122,14 +122,19 @@ typedef struct
|
|||||||
BlockNumber segno; /* which segment */
|
BlockNumber segno; /* which segment */
|
||||||
} PendingOperationTag;
|
} PendingOperationTag;
|
||||||
|
|
||||||
|
typedef uint16 CycleCtr; /* can be any convenient integer size */
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
PendingOperationTag tag; /* hash table key (must be first!) */
|
PendingOperationTag tag; /* hash table key (must be first!) */
|
||||||
int failures; /* number of failed attempts to fsync */
|
bool canceled; /* T => request canceled, not yet removed */
|
||||||
|
CycleCtr cycle_ctr; /* mdsync_cycle_ctr when request was made */
|
||||||
} PendingOperationEntry;
|
} PendingOperationEntry;
|
||||||
|
|
||||||
static HTAB *pendingOpsTable = NULL;
|
static HTAB *pendingOpsTable = NULL;
|
||||||
|
|
||||||
|
static CycleCtr mdsync_cycle_ctr = 0;
|
||||||
|
|
||||||
|
|
||||||
typedef enum /* behavior for mdopen & _mdfd_getseg */
|
typedef enum /* behavior for mdopen & _mdfd_getseg */
|
||||||
{
|
{
|
||||||
@ -856,70 +861,125 @@ mdimmedsync(SMgrRelation reln)
|
|||||||
|
|
||||||
/*
|
/*
|
||||||
* mdsync() -- Sync previous writes to stable storage.
|
* mdsync() -- Sync previous writes to stable storage.
|
||||||
*
|
|
||||||
* This is only called during checkpoints, and checkpoints should only
|
|
||||||
* occur in processes that have created a pendingOpsTable.
|
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
mdsync(void)
|
mdsync(void)
|
||||||
{
|
{
|
||||||
bool need_retry;
|
static bool mdsync_in_progress = false;
|
||||||
|
|
||||||
|
HASH_SEQ_STATUS hstat;
|
||||||
|
PendingOperationEntry *entry;
|
||||||
|
int absorb_counter;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This is only called during checkpoints, and checkpoints should only
|
||||||
|
* occur in processes that have created a pendingOpsTable.
|
||||||
|
*/
|
||||||
if (!pendingOpsTable)
|
if (!pendingOpsTable)
|
||||||
elog(ERROR, "cannot sync without a pendingOpsTable");
|
elog(ERROR, "cannot sync without a pendingOpsTable");
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The fsync table could contain requests to fsync relations that have
|
* If we are in the bgwriter, the sync had better include all fsync
|
||||||
* been deleted (unlinked) by the time we get to them. Rather than
|
* requests that were queued by backends before the checkpoint REDO
|
||||||
* just hoping an ENOENT (or EACCES on Windows) error can be ignored,
|
* point was determined. We go that a little better by accepting all
|
||||||
* what we will do is retry the whole process after absorbing fsync
|
* requests queued up to the point where we start fsync'ing.
|
||||||
* request messages again. Since mdunlink() queues a "revoke" message
|
|
||||||
* before actually unlinking, the fsync request is guaranteed to be gone
|
|
||||||
* the second time if it really was this case. DROP DATABASE likewise
|
|
||||||
* has to tell us to forget fsync requests before it starts deletions.
|
|
||||||
*/
|
*/
|
||||||
do {
|
AbsorbFsyncRequests();
|
||||||
HASH_SEQ_STATUS hstat;
|
|
||||||
PendingOperationEntry *entry;
|
|
||||||
int absorb_counter;
|
|
||||||
|
|
||||||
need_retry = false;
|
/*
|
||||||
|
* To avoid excess fsync'ing (in the worst case, maybe a never-terminating
|
||||||
/*
|
* checkpoint), we want to ignore fsync requests that are entered into the
|
||||||
* If we are in the bgwriter, the sync had better include all fsync
|
* hashtable after this point --- they should be processed next time,
|
||||||
* requests that were queued by backends before the checkpoint REDO
|
* instead. We use mdsync_cycle_ctr to tell old entries apart from new
|
||||||
* point was determined. We go that a little better by accepting all
|
* ones: new ones will have cycle_ctr equal to the incremented value of
|
||||||
* requests queued up to the point where we start fsync'ing.
|
* mdsync_cycle_ctr.
|
||||||
*/
|
*
|
||||||
AbsorbFsyncRequests();
|
* In normal circumstances, all entries present in the table at this
|
||||||
|
* point will have cycle_ctr exactly equal to the current (about to be old)
|
||||||
absorb_counter = FSYNCS_PER_ABSORB;
|
* value of mdsync_cycle_ctr. However, if we fail partway through the
|
||||||
|
* fsync'ing loop, then older values of cycle_ctr might remain when we
|
||||||
|
* come back here to try again. Repeated checkpoint failures would
|
||||||
|
* eventually wrap the counter around to the point where an old entry
|
||||||
|
* might appear new, causing us to skip it, possibly allowing a checkpoint
|
||||||
|
* to succeed that should not have. To forestall wraparound, any time
|
||||||
|
* the previous mdsync() failed to complete, run through the table and
|
||||||
|
* forcibly set cycle_ctr = mdsync_cycle_ctr.
|
||||||
|
*
|
||||||
|
* Think not to merge this loop with the main loop, as the problem is
|
||||||
|
* exactly that that loop may fail before having visited all the entries.
|
||||||
|
* From a performance point of view it doesn't matter anyway, as this
|
||||||
|
* path will never be taken in a system that's functioning normally.
|
||||||
|
*/
|
||||||
|
if (mdsync_in_progress)
|
||||||
|
{
|
||||||
|
/* prior try failed, so update any stale cycle_ctr values */
|
||||||
hash_seq_init(&hstat, pendingOpsTable);
|
hash_seq_init(&hstat, pendingOpsTable);
|
||||||
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
|
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
|
||||||
{
|
{
|
||||||
|
entry->cycle_ctr = mdsync_cycle_ctr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Advance counter so that new hashtable entries are distinguishable */
|
||||||
|
mdsync_cycle_ctr++;
|
||||||
|
|
||||||
|
/* Set flag to detect failure if we don't reach the end of the loop */
|
||||||
|
mdsync_in_progress = true;
|
||||||
|
|
||||||
|
/* Now scan the hashtable for fsync requests to process */
|
||||||
|
absorb_counter = FSYNCS_PER_ABSORB;
|
||||||
|
hash_seq_init(&hstat, pendingOpsTable);
|
||||||
|
while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* If the entry is new then don't process it this time. Note that
|
||||||
|
* "continue" bypasses the hash-remove call at the bottom of the loop.
|
||||||
|
*/
|
||||||
|
if (entry->cycle_ctr == mdsync_cycle_ctr)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
/* Else assert we haven't missed it */
|
||||||
|
Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If fsync is off then we don't have to bother opening the file
|
||||||
|
* at all. (We delay checking until this point so that changing
|
||||||
|
* fsync on the fly behaves sensibly.) Also, if the entry is
|
||||||
|
* marked canceled, fall through to delete it.
|
||||||
|
*/
|
||||||
|
if (enableFsync && !entry->canceled)
|
||||||
|
{
|
||||||
|
int failures;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* If fsync is off then we don't have to bother opening the file
|
* If in bgwriter, we want to absorb pending requests every so
|
||||||
* at all. (We delay checking until this point so that changing
|
* often to prevent overflow of the fsync request queue. It is
|
||||||
* fsync on the fly behaves sensibly.)
|
* unspecified whether newly-added entries will be visited by
|
||||||
|
* hash_seq_search, but we don't care since we don't need to
|
||||||
|
* process them anyway.
|
||||||
*/
|
*/
|
||||||
if (enableFsync)
|
if (--absorb_counter <= 0)
|
||||||
|
{
|
||||||
|
AbsorbFsyncRequests();
|
||||||
|
absorb_counter = FSYNCS_PER_ABSORB;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The fsync table could contain requests to fsync segments that
|
||||||
|
* have been deleted (unlinked) by the time we get to them.
|
||||||
|
* Rather than just hoping an ENOENT (or EACCES on Windows) error
|
||||||
|
* can be ignored, what we do on error is absorb pending requests
|
||||||
|
* and then retry. Since mdunlink() queues a "revoke" message
|
||||||
|
* before actually unlinking, the fsync request is guaranteed to
|
||||||
|
* be marked canceled after the absorb if it really was this case.
|
||||||
|
* DROP DATABASE likewise has to tell us to forget fsync requests
|
||||||
|
* before it starts deletions.
|
||||||
|
*/
|
||||||
|
for (failures = 0; ; failures++) /* loop exits at "break" */
|
||||||
{
|
{
|
||||||
SMgrRelation reln;
|
SMgrRelation reln;
|
||||||
MdfdVec *seg;
|
MdfdVec *seg;
|
||||||
|
|
||||||
/*
|
|
||||||
* If in bgwriter, we want to absorb pending requests every so
|
|
||||||
* often to prevent overflow of the fsync request queue. This
|
|
||||||
* could result in deleting the current entry out from under
|
|
||||||
* our hashtable scan, so the procedure is to fall out of the
|
|
||||||
* scan and start over from the top of the function.
|
|
||||||
*/
|
|
||||||
if (--absorb_counter <= 0)
|
|
||||||
{
|
|
||||||
need_retry = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Find or create an smgr hash entry for this relation. This
|
* Find or create an smgr hash entry for this relation. This
|
||||||
* may seem a bit unclean -- md calling smgr? But it's really
|
* may seem a bit unclean -- md calling smgr? But it's really
|
||||||
@ -940,7 +1000,7 @@ mdsync(void)
|
|||||||
/*
|
/*
|
||||||
* It is possible that the relation has been dropped or
|
* It is possible that the relation has been dropped or
|
||||||
* truncated since the fsync request was entered. Therefore,
|
* truncated since the fsync request was entered. Therefore,
|
||||||
* allow ENOENT, but only if we didn't fail once already on
|
* allow ENOENT, but only if we didn't fail already on
|
||||||
* this file. This applies both during _mdfd_getseg() and
|
* this file. This applies both during _mdfd_getseg() and
|
||||||
* during FileSync, since fd.c might have closed the file
|
* during FileSync, since fd.c might have closed the file
|
||||||
* behind our back.
|
* behind our back.
|
||||||
@ -948,42 +1008,56 @@ mdsync(void)
|
|||||||
seg = _mdfd_getseg(reln,
|
seg = _mdfd_getseg(reln,
|
||||||
entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
|
entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
|
||||||
false, EXTENSION_RETURN_NULL);
|
false, EXTENSION_RETURN_NULL);
|
||||||
if (seg == NULL ||
|
if (seg != NULL &&
|
||||||
FileSync(seg->mdfd_vfd) < 0)
|
FileSync(seg->mdfd_vfd) >= 0)
|
||||||
{
|
break; /* success; break out of retry loop */
|
||||||
/*
|
|
||||||
* XXX is there any point in allowing more than one try?
|
|
||||||
* Don't see one at the moment, but easy to change the
|
|
||||||
* test here if so.
|
|
||||||
*/
|
|
||||||
if (!FILE_POSSIBLY_DELETED(errno) ||
|
|
||||||
++(entry->failures) > 1)
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode_for_file_access(),
|
|
||||||
errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
|
|
||||||
entry->tag.segno,
|
|
||||||
entry->tag.rnode.spcNode,
|
|
||||||
entry->tag.rnode.dbNode,
|
|
||||||
entry->tag.rnode.relNode)));
|
|
||||||
else
|
|
||||||
ereport(DEBUG1,
|
|
||||||
(errcode_for_file_access(),
|
|
||||||
errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
|
|
||||||
entry->tag.segno,
|
|
||||||
entry->tag.rnode.spcNode,
|
|
||||||
entry->tag.rnode.dbNode,
|
|
||||||
entry->tag.rnode.relNode)));
|
|
||||||
need_retry = true;
|
|
||||||
continue; /* don't delete the hashtable entry */
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Okay, delete this entry */
|
/*
|
||||||
if (hash_search(pendingOpsTable, &entry->tag,
|
* XXX is there any point in allowing more than one retry?
|
||||||
HASH_REMOVE, NULL) == NULL)
|
* Don't see one at the moment, but easy to change the
|
||||||
elog(ERROR, "pendingOpsTable corrupted");
|
* test here if so.
|
||||||
|
*/
|
||||||
|
if (!FILE_POSSIBLY_DELETED(errno) ||
|
||||||
|
failures > 0)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode_for_file_access(),
|
||||||
|
errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
|
||||||
|
entry->tag.segno,
|
||||||
|
entry->tag.rnode.spcNode,
|
||||||
|
entry->tag.rnode.dbNode,
|
||||||
|
entry->tag.rnode.relNode)));
|
||||||
|
else
|
||||||
|
ereport(DEBUG1,
|
||||||
|
(errcode_for_file_access(),
|
||||||
|
errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
|
||||||
|
entry->tag.segno,
|
||||||
|
entry->tag.rnode.spcNode,
|
||||||
|
entry->tag.rnode.dbNode,
|
||||||
|
entry->tag.rnode.relNode)));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Absorb incoming requests and check to see if canceled.
|
||||||
|
*/
|
||||||
|
AbsorbFsyncRequests();
|
||||||
|
absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
|
||||||
|
|
||||||
|
if (entry->canceled)
|
||||||
|
break;
|
||||||
|
} /* end retry loop */
|
||||||
}
|
}
|
||||||
} while (need_retry);
|
|
||||||
|
/*
|
||||||
|
* If we get here, either we fsync'd successfully, or we don't have
|
||||||
|
* to because enableFsync is off, or the entry is (now) marked
|
||||||
|
* canceled. Okay to delete it.
|
||||||
|
*/
|
||||||
|
if (hash_search(pendingOpsTable, &entry->tag,
|
||||||
|
HASH_REMOVE, NULL) == NULL)
|
||||||
|
elog(ERROR, "pendingOpsTable corrupted");
|
||||||
|
} /* end loop over hashtable entries */
|
||||||
|
|
||||||
|
/* Flag successful completion of mdsync */
|
||||||
|
mdsync_in_progress = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1027,8 +1101,8 @@ register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
|
|||||||
*
|
*
|
||||||
* The range of possible segment numbers is way less than the range of
|
* The range of possible segment numbers is way less than the range of
|
||||||
* BlockNumber, so we can reserve high values of segno for special purposes.
|
* BlockNumber, so we can reserve high values of segno for special purposes.
|
||||||
* We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
|
* We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
|
||||||
* a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
|
* a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
|
||||||
* a whole database. (These are a tad slow because the hash table has to be
|
* a whole database. (These are a tad slow because the hash table has to be
|
||||||
* searched linearly, but it doesn't seem worth rethinking the table structure
|
* searched linearly, but it doesn't seem worth rethinking the table structure
|
||||||
* for them.)
|
* for them.)
|
||||||
@ -1049,10 +1123,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
|
|||||||
{
|
{
|
||||||
if (RelFileNodeEquals(entry->tag.rnode, rnode))
|
if (RelFileNodeEquals(entry->tag.rnode, rnode))
|
||||||
{
|
{
|
||||||
/* Okay, delete this entry */
|
/* Okay, cancel this entry */
|
||||||
if (hash_search(pendingOpsTable, &entry->tag,
|
entry->canceled = true;
|
||||||
HASH_REMOVE, NULL) == NULL)
|
|
||||||
elog(ERROR, "pendingOpsTable corrupted");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1067,10 +1139,8 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
|
|||||||
{
|
{
|
||||||
if (entry->tag.rnode.dbNode == rnode.dbNode)
|
if (entry->tag.rnode.dbNode == rnode.dbNode)
|
||||||
{
|
{
|
||||||
/* Okay, delete this entry */
|
/* Okay, cancel this entry */
|
||||||
if (hash_search(pendingOpsTable, &entry->tag,
|
entry->canceled = true;
|
||||||
HASH_REMOVE, NULL) == NULL)
|
|
||||||
elog(ERROR, "pendingOpsTable corrupted");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1090,8 +1160,25 @@ RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
|
|||||||
&key,
|
&key,
|
||||||
HASH_ENTER,
|
HASH_ENTER,
|
||||||
&found);
|
&found);
|
||||||
if (!found) /* new entry, so initialize it */
|
/* if new or previously canceled entry, initialize it */
|
||||||
entry->failures = 0;
|
if (!found || entry->canceled)
|
||||||
|
{
|
||||||
|
entry->canceled = false;
|
||||||
|
entry->cycle_ctr = mdsync_cycle_ctr;
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* NB: it's intentional that we don't change cycle_ctr if the entry
|
||||||
|
* already exists. The fsync request must be treated as old, even
|
||||||
|
* though the new request will be satisfied too by any subsequent
|
||||||
|
* fsync.
|
||||||
|
*
|
||||||
|
* However, if the entry is present but is marked canceled, we should
|
||||||
|
* act just as though it wasn't there. The only case where this could
|
||||||
|
* happen would be if a file had been deleted, we received but did not
|
||||||
|
* yet act on the cancel request, and the same relfilenode was then
|
||||||
|
* assigned to a new file. We mustn't lose the new request, but
|
||||||
|
* it should be considered new not old.
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user