diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c index dcd81208953..b42b9e6c41f 100644 --- a/contrib/bloom/blinsert.c +++ b/contrib/bloom/blinsert.c @@ -166,7 +166,7 @@ blbuildempty(Relation index) Page metapage; /* Construct metapage. */ - metapage = (Page) palloc(BLCKSZ); + metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); BloomFillMetapage(index, metapage); /* diff --git a/contrib/pg_prewarm/pg_prewarm.c b/contrib/pg_prewarm/pg_prewarm.c index 54209924aed..e464d0d4d2b 100644 --- a/contrib/pg_prewarm/pg_prewarm.c +++ b/contrib/pg_prewarm/pg_prewarm.c @@ -36,7 +36,7 @@ typedef enum PREWARM_BUFFER } PrewarmType; -static PGAlignedBlock blockbuffer; +static PGIOAlignedBlock blockbuffer; /* * pg_prewarm(regclass, mode text, fork text, diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c index d2f8da5b026..5e0c1447f92 100644 --- a/src/backend/access/gist/gistbuild.c +++ b/src/backend/access/gist/gistbuild.c @@ -415,7 +415,7 @@ gist_indexsortbuild(GISTBuildState *state) * Write an empty page as a placeholder for the root page. It will be * replaced with the real root page at the end. 
*/ - page = palloc0(BLCKSZ); + page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); smgrextend(RelationGetSmgr(state->indexrel), MAIN_FORKNUM, GIST_ROOT_BLKNO, page, true); state->pages_allocated++; @@ -509,7 +509,8 @@ gist_indexsortbuild_levelstate_add(GISTBuildState *state, levelstate->current_page++; if (levelstate->pages[levelstate->current_page] == NULL) - levelstate->pages[levelstate->current_page] = palloc(BLCKSZ); + levelstate->pages[levelstate->current_page] = + palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); newPage = levelstate->pages[levelstate->current_page]; gistinitpage(newPage, old_page_flags); @@ -579,7 +580,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, /* Create page and copy data */ data = (char *) (dist->list); - target = palloc0(BLCKSZ); + target = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, MCXT_ALLOC_ZERO); gistinitpage(target, isleaf ? F_LEAF : 0); for (int i = 0; i < dist->block.num; i++) { @@ -630,7 +631,7 @@ gist_indexsortbuild_levelstate_flush(GISTBuildState *state, if (parent == NULL) { parent = palloc0(sizeof(GistSortedBuildLevelState)); - parent->pages[0] = (Page) palloc(BLCKSZ); + parent->pages[0] = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); parent->parent = NULL; gistinitpage(parent->pages[0], 0); diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 6d8af422609..af3a1542667 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -992,7 +992,7 @@ static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; - PGAlignedBlock zerobuf; + PGIOAlignedBlock zerobuf; Page page; HashPageOpaque ovflopaque; diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index ae0282a70ee..424958912c7 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -255,7 +255,7 @@ begin_heap_rewrite(Relation old_heap, Relation 
new_heap, TransactionId oldest_xm state->rs_old_rel = old_heap; state->rs_new_rel = new_heap; - state->rs_buffer = (Page) palloc(BLCKSZ); + state->rs_buffer = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); /* new_heap needn't be empty, just locked */ state->rs_blockno = RelationGetNumberOfBlocks(new_heap); state->rs_buffer_valid = false; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 992f84834f8..2df8849858e 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -154,7 +154,7 @@ btbuildempty(Relation index) Page metapage; /* Construct metapage. */ - metapage = (Page) palloc(BLCKSZ); + metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); _bt_initmetapage(metapage, P_NONE, 0, _bt_allequalimage(index, false)); /* diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 1207a496895..6ad3f3c54d5 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -619,7 +619,7 @@ _bt_blnewpage(uint32 level) Page page; BTPageOpaque opaque; - page = (Page) palloc(BLCKSZ); + page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); /* Zero the page and set up standard page header info */ _bt_pageinit(page, BLCKSZ); @@ -660,7 +660,9 @@ _bt_blwritepage(BTWriteState *wstate, Page page, BlockNumber blkno) while (blkno > wstate->btws_pages_written) { if (!wstate->btws_zeropage) - wstate->btws_zeropage = (Page) palloc0(BLCKSZ); + wstate->btws_zeropage = (Page) palloc_aligned(BLCKSZ, + PG_IO_ALIGN_SIZE, + MCXT_ALLOC_ZERO); /* don't set checksum for all-zero page */ smgrextend(RelationGetSmgr(wstate->index), MAIN_FORKNUM, wstate->btws_pages_written++, @@ -1170,7 +1172,7 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) * set to point to "P_NONE"). This changes the index to the "valid" state * by filling in a valid magic number in the metapage. 
*/ - metapage = (Page) palloc(BLCKSZ); + metapage = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); _bt_initmetapage(metapage, rootblkno, rootlevel, wstate->inskey->allequalimage); _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 718a88335d0..72d2e1551cd 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -158,7 +158,7 @@ spgbuildempty(Relation index) Page page; /* Construct metapage. */ - page = (Page) palloc(BLCKSZ); + page = (Page) palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0); SpGistInitMetapage(page); /* diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index 9f67d1c1cd5..6c68191ca62 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -58,14 +58,17 @@ typedef struct char delta[MAX_DELTA_SIZE]; /* delta between page images */ } PageData; -/* State of generic xlog record construction */ +/* + * State of generic xlog record construction. Must be allocated at an I/O + * aligned address. 
+ */ struct GenericXLogState { + /* Page images (properly aligned, must be first) */ + PGIOAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; /* Info about each page, see above */ PageData pages[MAX_GENERIC_XLOG_PAGES]; bool isLogged; - /* Page images (properly aligned) */ - PGAlignedBlock images[MAX_GENERIC_XLOG_PAGES]; }; static void writeFragment(PageData *pageData, OffsetNumber offset, @@ -269,7 +272,9 @@ GenericXLogStart(Relation relation) GenericXLogState *state; int i; - state = (GenericXLogState *) palloc(sizeof(GenericXLogState)); + state = (GenericXLogState *) palloc_aligned(sizeof(GenericXLogState), + PG_IO_ALIGN_SIZE, + 0); state->isLogged = RelationNeedsWAL(relation); for (i = 0; i < MAX_GENERIC_XLOG_PAGES; i++) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 46821ad6056..a5c74fdab8c 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4506,7 +4506,7 @@ XLOGShmemSize(void) /* xlblocks array */ size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers)); /* extra alignment padding for XLOG I/O buffers */ - size = add_size(size, XLOG_BLCKSZ); + size = add_size(size, Max(XLOG_BLCKSZ, PG_IO_ALIGN_SIZE)); /* and the buffers themselves */ size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c index af1491aa1d1..2add0534891 100644 --- a/src/backend/catalog/storage.c +++ b/src/backend/catalog/storage.c @@ -451,7 +451,7 @@ void RelationCopyStorage(SMgrRelation src, SMgrRelation dst, ForkNumber forkNum, char relpersistence) { - PGAlignedBlock buf; + PGIOAlignedBlock buf; Page page; bool use_wal; bool copying_initfork; diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 20946c47cb4..0057443f0c6 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -78,9 +78,12 @@ InitBufferPool(void) NBuffers * sizeof(BufferDescPadded), 
&foundDescs); + /* Align buffer pool on IO page size boundary. */ BufferBlocks = (char *) - ShmemInitStruct("Buffer Blocks", - NBuffers * (Size) BLCKSZ, &foundBufs); + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemInitStruct("Buffer Blocks", + NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &foundBufs)); /* Align condition variables to cacheline boundary. */ BufferIOCVArray = (ConditionVariableMinimallyPadded *) @@ -163,7 +166,8 @@ BufferShmemSize(void) /* to allow aligning buffer descriptors */ size = add_size(size, PG_CACHE_LINE_SIZE); - /* size of data pages */ + /* size of data pages, plus alignment padding */ + size = add_size(size, PG_IO_ALIGN_SIZE); size = add_size(size, mul_size(NBuffers, BLCKSZ)); /* size of stuff controlled by freelist.c */ diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index a12d0c6c27c..5a237d56063 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -4250,7 +4250,7 @@ RelationCopyStorageUsingBuffer(RelFileLocator srclocator, bool use_wal; BlockNumber nblocks; BlockNumber blkno; - PGAlignedBlock buf; + PGIOAlignedBlock buf; BufferAccessStrategy bstrategy_src; BufferAccessStrategy bstrategy_dst; diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3d5bc9193d3..3c6382456a2 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -744,8 +744,11 @@ GetLocalBufferStorage(void) /* And don't overflow MaxAllocSize, either */ num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); - cur_block = (char *) MemoryContextAlloc(LocalBufferContext, - num_bufs * BLCKSZ); + /* Buffers should be I/O aligned. 
*/ + cur_block = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + MemoryContextAlloc(LocalBufferContext, + num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); next_buf_in_block = 0; num_bufs_in_block = num_bufs; } diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 37ea8ac6b7c..84ead85942a 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -95,6 +95,12 @@ struct BufFile off_t curOffset; /* offset part of current pos */ int pos; /* next read/write position in buffer */ int nbytes; /* total # of valid bytes in buffer */ + + /* + * XXX Should ideally use PGIOAlignedBlock, but might need a way to avoid + * wasting per-file alignment padding when some users create many + * files. + */ PGAlignedBlock buffer; }; diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 92994f8f395..9a302ddc30e 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -1522,7 +1522,10 @@ PageSetChecksumCopy(Page page, BlockNumber blkno) * and second to avoid wasting space in processes that never call this. */ if (pageCopy == NULL) - pageCopy = MemoryContextAlloc(TopMemoryContext, BLCKSZ); + pageCopy = MemoryContextAllocAligned(TopMemoryContext, + BLCKSZ, + PG_IO_ALIGN_SIZE, + 0); memcpy(pageCopy, (char *) page, BLCKSZ); ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index d9d0367c89d..d1124d46f49 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -453,6 +453,10 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbytes; MdfdVec *v; + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + /* This assert is too expensive to have on normally ... 
*/ #ifdef CHECK_WRITE_VS_EXTEND Assert(blocknum >= mdnblocks(reln, forknum)); @@ -783,6 +787,10 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbytes; MdfdVec *v; + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rlocator.locator.spcOid, reln->smgr_rlocator.locator.dbOid, @@ -848,6 +856,10 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nbytes; MdfdVec *v; + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + /* This assert is too expensive to have on normally ... */ #ifdef CHECK_WRITE_VS_EXTEND Assert(blocknum < mdnblocks(reln, forknum)); @@ -1429,7 +1441,8 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, */ if (nblocks < ((BlockNumber) RELSEG_SIZE)) { - char *zerobuf = palloc0(BLCKSZ); + char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, + MCXT_ALLOC_ZERO); mdextend(reln, forknum, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index 64ea237438b..52b8898d5ed 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -252,7 +252,7 @@ ltsWriteBlock(LogicalTapeSet *lts, long blocknum, const void *buffer) */ while (blocknum > lts->nBlocksWritten) { - PGAlignedBlock zerobuf; + PGIOAlignedBlock zerobuf; MemSet(zerobuf.data, 0, sizeof(zerobuf)); diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c index aa210074976..19eb67e4854 100644 --- a/src/bin/pg_checksums/pg_checksums.c +++ b/src/bin/pg_checksums/pg_checksums.c @@ -183,7 +183,7 @@ skipfile(const char *fn) static void scan_file(const 
char *fn, int segmentno) { - PGAlignedBlock buf; + PGIOAlignedBlock buf; PageHeader header = (PageHeader) buf.data; int f; BlockNumber blockno; diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c index da9d75dccb2..4e2a1376c69 100644 --- a/src/bin/pg_rewind/local_source.c +++ b/src/bin/pg_rewind/local_source.c @@ -77,7 +77,7 @@ static void local_queue_fetch_file(rewind_source *source, const char *path, size_t len) { const char *datadir = ((local_source *) source)->datadir; - PGAlignedBlock buf; + PGIOAlignedBlock buf; char srcpath[MAXPGPATH]; int srcfd; size_t written_len; @@ -129,7 +129,7 @@ local_queue_fetch_range(rewind_source *source, const char *path, off_t off, size_t len) { const char *datadir = ((local_source *) source)->datadir; - PGAlignedBlock buf; + PGIOAlignedBlock buf; char srcpath[MAXPGPATH]; int srcfd; off_t begin = off; diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index ed874507ff4..d1736028826 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -178,8 +178,8 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, { int src_fd; int dst_fd; - PGAlignedBlock buffer; - PGAlignedBlock new_vmbuf; + PGIOAlignedBlock buffer; + PGIOAlignedBlock new_vmbuf; ssize_t totalBytesRead = 0; ssize_t src_filesize; int rewriteVmBytesPerPage; diff --git a/src/common/file_utils.c b/src/common/file_utils.c index d568d83b9f6..74833c4acbb 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -540,8 +540,8 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset) { - static const PGAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */ - void *zerobuf_addr = unconstify(PGAlignedBlock *, &zbuffer)->data; + static const PGIOAlignedBlock zbuffer = {{0}}; /* worth BLCKSZ */ + void *zerobuf_addr = unconstify(PGIOAlignedBlock *, &zbuffer)->data; struct iovec iov[PG_IOV_MAX]; size_t remaining_size = size; 
ssize_t total_written = 0; diff --git a/src/include/c.h b/src/include/c.h index 5fe7a97ff03..f69d739be57 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1119,14 +1119,11 @@ extern void ExceptionalCondition(const char *conditionName, /* * Use this, not "char buf[BLCKSZ]", to declare a field or local variable - * holding a page buffer, if that page might be accessed as a page and not - * just a string of bytes. Otherwise the variable might be under-aligned, - * causing problems on alignment-picky hardware. (In some places, we use - * this to declare buffers even though we only pass them to read() and - * write(), because copying to/from aligned buffers is usually faster than - * using unaligned buffers.) We include both "double" and "int64" in the - * union to ensure that the compiler knows the value must be MAXALIGN'ed - * (cf. configure's computation of MAXIMUM_ALIGNOF). + * holding a page buffer, if that page might be accessed as a page. Otherwise + * the variable might be under-aligned, causing problems on alignment-picky + * hardware. We include both "double" and "int64" in the union to ensure that + * the compiler knows the value must be MAXALIGN'ed (cf. configure's + * computation of MAXIMUM_ALIGNOF). */ typedef union PGAlignedBlock { @@ -1135,9 +1132,30 @@ typedef union PGAlignedBlock int64 force_align_i64; } PGAlignedBlock; +/* + * Use this to declare a field or local variable holding a page buffer, if that + * page might be accessed as a page or passed to an SMgr I/O function. If + * allocating using the MemoryContext API, the aligned allocation functions + * should be used with PG_IO_ALIGN_SIZE. This alignment may be more efficient + * for I/O in general, but may be strictly required on some platforms when + * using direct I/O. 
+ */ +typedef union PGIOAlignedBlock +{ +#ifdef pg_attribute_aligned + pg_attribute_aligned(PG_IO_ALIGN_SIZE) +#endif + char data[BLCKSZ]; + double force_align_d; + int64 force_align_i64; +} PGIOAlignedBlock; + /* Same, but for an XLOG_BLCKSZ-sized buffer */ typedef union PGAlignedXLogBlock { +#ifdef pg_attribute_aligned + pg_attribute_aligned(PG_IO_ALIGN_SIZE) +#endif char data[XLOG_BLCKSZ]; double force_align_d; int64 force_align_i64; diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index b586ee269a0..33ec6102c15 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -227,6 +227,12 @@ */ #define PG_CACHE_LINE_SIZE 128 +/* + * Assumed alignment requirement for direct I/O. 4K corresponds to common + * sector and memory page size. + */ +#define PG_IO_ALIGN_SIZE 4096 + /* *------------------------------------------------------------------------ * The following symbols are for enabling debugging code, not for diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index daceafd4732..faac4914fea 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -82,9 +82,10 @@ extern PGDLLIMPORT int max_safe_fds; * to the appropriate Windows flag in src/port/open.c. We simulate it with * fcntl(F_NOCACHE) on macOS inside fd.c's open() wrapper. We use the name * PG_O_DIRECT rather than defining O_DIRECT in that case (probably not a good - * idea on a Unix). + * idea on a Unix). We can only use it if the compiler will correctly align + * PGIOAlignedBlock for us, though. 
*/ -#if defined(O_DIRECT) +#if defined(O_DIRECT) && defined(pg_attribute_aligned) #define PG_O_DIRECT O_DIRECT #elif defined(F_NOCACHE) #define PG_O_DIRECT 0x80000000 diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 494cc66d5b6..df960883c5c 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1703,6 +1703,7 @@ PGEventResultDestroy PGFInfoFunction PGFileType PGFunction +PGIOAlignedBlock PGLZ_HistEntry PGLZ_Strategy PGLoadBalanceType