diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 73194ec4ef..744601f115 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.114 2004/12/31 22:01:13 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.114.4.1 2006/11/20 01:08:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -33,23 +33,44 @@ * descriptors in its own descriptor pool. This is done to make it * easier to support relations that are larger than the operating * system's file size limit (often 2GBytes). In order to do that, - * we break relations up into chunks of < 2GBytes and store one chunk - * in each of several files that represent the relation. See the - * BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h. - * All chunks except the last MUST have size exactly equal to RELSEG_SIZE - * blocks --- see mdnblocks() and mdtruncate(). + * we break relations up into "segment" files that are each shorter than + * the OS file size limit. The segment size is set by the RELSEG_SIZE + * configuration constant in pg_config_manual.h. + * + * On disk, a relation must consist of consecutively numbered segment + * files in the pattern + * -- Zero or more full segments of exactly RELSEG_SIZE blocks each + * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks + * -- Optionally, any number of inactive segments of size 0 blocks. + * The full and partial segments are collectively the "active" segments. + * Inactive segments are those that once contained data but are currently + * not needed because of an mdtruncate() operation. The reason for leaving + * them present at size zero, rather than unlinking them, is that other + * backends and/or the bgwriter might be holding open file references to + * such segments. If the relation expands again after mdtruncate(), such + * that a deactivated segment becomes active again, it is important that + * such file references still be valid --- else data might get written + * out to an unlinked old copy of a segment file that will eventually + * disappear. * * The file descriptor pointer (md_fd field) stored in the SMgrRelation - * cache is, therefore, just the head of a list of MdfdVec objects. - * But note the md_fd pointer can be NULL, indicating relation not open. + * cache is, therefore, just the head of a list of MdfdVec objects, one + * per segment. But note the md_fd pointer can be NULL, indicating + * relation not open. * - * Note that mdfd_chain == NULL does not necessarily mean the relation + * Also note that mdfd_chain == NULL does not necessarily mean the relation * doesn't have another segment after this one; we may just not have * opened the next segment yet. (We could not have "all segments are * in the chain" as an invariant anyway, since another backend could - * extend the relation when we weren't looking.) + * extend the relation when we weren't looking.) We do not make chain + * entries for inactive segments, however; as soon as we find a partial + * segment, we assume that any subsequent segments are inactive. * * All MdfdVec objects are palloc'd in the MdCxt memory context. + * + * Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic, + * for use on machines that support large files. Beware that that + * code has not been tested in a long time and is probably bit-rotted. */ typedef struct _MdfdVec @@ -75,8 +96,6 @@ static MemoryContext MdCxt; /* context for all md.c allocations */ * * (Regular backends do not track pending operations locally, but forward * them to the bgwriter.) - * - * XXX for WIN32, may want to expand this to track pending deletes, too. */ typedef struct { @@ -221,12 +240,16 @@ mdunlink(RelFileNode rnode, bool isRedo) } #ifndef LET_OS_MANAGE_FILESIZE - /* Get the additional segments, if any */ + /* Delete the additional segments, if any */ if (status) { char *segpath = (char *) palloc(strlen(path) + 12); BlockNumber segno; + /* + * Note that because we loop until getting ENOENT, we will + * correctly remove all inactive segments as well as active ones. + */ for (segno = 1;; segno++) { sprintf(segpath, "%s.%u", path, segno); @@ -256,15 +279,10 @@ mdunlink(RelFileNode rnode, bool isRedo) * * The semantics are basically the same as mdwrite(): write at the * specified position. However, we are expecting to extend the - * relation (ie, blocknum is the current EOF), and so in case of + * relation (ie, blocknum is >= the current EOF), and so in case of * failure we clean up by truncating. * * This routine returns true or false, with errno set as appropriate. - * - * Note: this routine used to call mdnblocks() to get the block position - * to write at, but that's pretty silly since the caller needs to know where - * the block will be written, and accordingly must have done mdnblocks() - * already. Might as well pass in the position and save a seek. */ bool mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp) @@ -498,10 +516,10 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp) /* * mdnblocks() -- Get the number of blocks stored in a relation. * - * Important side effect: all segments of the relation are opened + * Important side effect: all active segments of the relation are opened * and added to the mdfd_chain list. If this routine has not been * called, then only segments up to the last one actually touched - * are present in the chain... + * are present in the chain. * * Returns # of blocks, or InvalidBlockNumber on error. */ @@ -518,10 +536,14 @@ mdnblocks(SMgrRelation reln) * Skip through any segments that aren't the last one, to avoid * redundant seeks on them. We have previously verified that these * segments are exactly RELSEG_SIZE long, and it's useless to recheck - * that each time. (NOTE: this assumption could only be wrong if - * another backend has truncated the relation. We rely on higher code - * levels to handle that scenario by closing and re-opening the md - * fd.) + * that each time. + * + * NOTE: this assumption could only be wrong if another backend has + * truncated the relation. We rely on higher code levels to handle that + * scenario by closing and re-opening the md fd, which is handled via + * relcache flush. (Since the bgwriter doesn't participate in relcache + * flush, it could have segment chain entries for inactive segments; + * that's OK because the bgwriter never needs to compute relation size.) */ while (v->mdfd_chain != NULL) { @@ -579,8 +601,8 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp) #endif /* - * NOTE: mdnblocks makes sure we have opened all existing segments, so - * that truncate/delete loop will get them all! + * NOTE: mdnblocks makes sure we have opened all active segments, so + * that truncation loop will get them all! */ curnblk = mdnblocks(reln); if (curnblk == InvalidBlockNumber) @@ -601,14 +623,17 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp) if (priorblocks > nblocks) { /* - * This segment is no longer wanted at all (and has already - * been unlinked from the mdfd_chain). We truncate the file - * before deleting it because if other backends are holding - * the file open, the unlink will fail on some platforms. - * Better a zero-size file gets left around than a big file... + * This segment is no longer active (and has already been + * unlinked from the mdfd_chain). We truncate the file, but do + * not delete it, for reasons explained in the header comments. */ - FileTruncate(v->mdfd_vfd, 0); - FileUnlink(v->mdfd_vfd); + if (FileTruncate(v->mdfd_vfd, 0) < 0) + return InvalidBlockNumber; + if (!isTemp) + { + if (!register_dirty_segment(reln, v)) + return InvalidBlockNumber; + } v = v->mdfd_chain; Assert(ov != reln->md_fd); /* we never drop the 1st segment */ pfree(ov); @@ -668,7 +693,7 @@ mdimmedsync(SMgrRelation reln) BlockNumber curnblk; /* - * NOTE: mdnblocks makes sure we have opened all existing segments, so + * NOTE: mdnblocks makes sure we have opened all active segments, so * that fsync loop will get them all! */ curnblk = mdnblocks(reln);