mirror of
https://git.postgresql.org/git/postgresql.git
synced 2025-02-23 19:39:53 +08:00
Use O_DIRECT if available when using O_SYNC for wal_sync_method.
Also, write multiple WAL buffers out in one write() operation. ITAGAKI Takahiro --------------------------------------------------------------------------- > If we disable writeback-cache and use open_sync, the per-page writing > behavior in WAL module will show up as bad result. O_DIRECT is similar > to O_DSYNC (at least on linux), so that the benefit of it will disappear > behind the slow disk revolution. > > In the current source, WAL is written as: > for (i = 0; i < N; i++) { write(&buffers[i], BLCKSZ); } > Is this intentional? Can we rewrite it as follows? > write(&buffers[0], N * BLCKSZ); > > In order to achieve it, I wrote a 'gather-write' patch (xlog.gw.diff). > Aside from this, I'll also send the fixed direct io patch (xlog.dio.diff). > These two patches are independent, so they can be applied either or both. > > > I tested them on my machine and the results as follows. It shows that > direct-io and gather-write is the best choice when writeback-cache is off. > Are these two patches worth trying if they are used together? > > > | writeback | fsync= | fdata | open_ | fsync_ | open_ > patch | cache | false | sync | sync | direct | direct > ------------+-----------+--------+-------+-------+--------+--------- > direct io | off | 124.2 | 105.7 | 48.3 | 48.3 | 48.2 > direct io | on | 129.1 | 112.3 | 114.1 | 142.9 | 144.5 > gather-write| off | 124.3 | 108.7 | 105.4 | (N/A) | (N/A) > both | off | 131.5 | 115.5 | 114.4 | 145.4 | 145.2 > > - 20runs * pgbench -s 100 -c 50 -t 200 > - with tuning (wal_buffers=64, commit_delay=500, checkpoint_segments=8) > - using 2 ATA disks: > - hda(reiserfs) includes system and wal. > - hdc(jfs) includes database files. writeback-cache is always on. > > --- > ITAGAKI Takahiro
This commit is contained in:
parent
722f31f786
commit
c34bb00581
@ -7,7 +7,7 @@
|
||||
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
|
||||
* Portions Copyright (c) 1994, Regents of the University of California
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.210 2005/07/23 15:31:16 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.211 2005/07/29 03:22:33 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@ -47,6 +47,20 @@
|
||||
#include "utils/relcache.h"
|
||||
|
||||
|
||||
/*
|
||||
* Becauase O_DIRECT bypasses the kernel buffers, and because we never
|
||||
* read those buffers except during crash recovery, it is a win to use
|
||||
* it in all cases where we sync on each write(). We could allow O_DIRECT
|
||||
* with fsync(), but because skipping the kernel buffer forces writes out
|
||||
* quickly, it seems best just to use it for O_SYNC. It is hard to imagine
|
||||
* how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
|
||||
*/
|
||||
#ifdef O_DIRECT
|
||||
#define PG_O_DIRECT O_DIRECT
|
||||
#else
|
||||
#define PG_O_DIRECT 0
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This chunk of hackery attempts to determine which file sync methods
|
||||
* are available on the current platform, and to choose an appropriate
|
||||
@ -54,24 +68,50 @@
|
||||
* configure determined whether fdatasync() is.
|
||||
*/
|
||||
#if defined(O_SYNC)
|
||||
#define OPEN_SYNC_FLAG O_SYNC
|
||||
#define CMP_OPEN_SYNC_FLAG O_SYNC
|
||||
#else
|
||||
#if defined(O_FSYNC)
|
||||
#define OPEN_SYNC_FLAG O_FSYNC
|
||||
#define CMP_OPEN_SYNC_FLAG O_FSYNC
|
||||
#endif
|
||||
#endif
|
||||
#define OPEN_SYNC_FLAG (CMP_OPEN_SYNC_FLAG | PG_O_DIRECT)
|
||||
|
||||
#if defined(O_DSYNC)
|
||||
#if defined(OPEN_SYNC_FLAG)
|
||||
#if O_DSYNC != OPEN_SYNC_FLAG
|
||||
#define OPEN_DATASYNC_FLAG O_DSYNC
|
||||
#if O_DSYNC != CMP_OPEN_SYNC_FLAG
|
||||
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
|
||||
#endif
|
||||
#else /* !defined(OPEN_SYNC_FLAG) */
|
||||
/* Win32 only has O_DSYNC */
|
||||
#define OPEN_DATASYNC_FLAG O_DSYNC
|
||||
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Limitation of buffer-alignment for direct io depend on OS and filesystem,
|
||||
* but BLCKSZ is assumed to be enough for it.
|
||||
*/
|
||||
#ifdef O_DIRECT
|
||||
#define ALIGNOF_XLOG_BUFFER BLCKSZ
|
||||
#else
|
||||
#define ALIGNOF_XLOG_BUFFER MAXIMUM_ALIGNOF
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Switch the alignment routine because ShmemAlloc() returns a max-aligned
|
||||
* buffer and ALIGNOF_XLOG_BUFFER may be greater than MAXIMUM_ALIGNOF.
|
||||
*/
|
||||
#if ALIGNOF_XLOG_BUFFER <= MAXIMUM_ALIGNOF
|
||||
#define XLOG_BUFFER_ALIGN(LEN) MAXALIGN((LEN))
|
||||
#else
|
||||
#define XLOG_BUFFER_ALIGN(LEN) ((LEN) + (ALIGNOF_XLOG_BUFFER))
|
||||
#endif
|
||||
/* assume sizeof(ptrdiff_t) == sizeof(void*) */
|
||||
#define POINTERALIGN(ALIGNVAL,PTR) \
|
||||
((char *)(((ptrdiff_t) (PTR) + (ALIGNVAL-1)) & ~((ptrdiff_t) (ALIGNVAL-1))))
|
||||
#define XLOG_BUFFER_POINTERALIGN(PTR) \
|
||||
POINTERALIGN((ALIGNOF_XLOG_BUFFER), (PTR))
|
||||
|
||||
#if defined(OPEN_DATASYNC_FLAG)
|
||||
#define DEFAULT_SYNC_METHOD_STR "open_datasync"
|
||||
#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN
|
||||
@ -469,6 +509,17 @@ static void ReadControlFile(void);
|
||||
static char *str_time(time_t tnow);
|
||||
static void issue_xlog_fsync(void);
|
||||
|
||||
/* XLog gather-write staffs */
|
||||
typedef struct XLogPages
|
||||
{
|
||||
char *head; /* Head of first page */
|
||||
int size; /* Total bytes of pages == count(pages) * BLCKSZ */
|
||||
int offset; /* Offset in xlog segment file */
|
||||
} XLogPages;
|
||||
static void XLogPageReset(XLogPages *pages);
|
||||
static void XLogPageWrite(XLogPages *pages, int index);
|
||||
static void XLogPageFlush(XLogPages *pages, int index);
|
||||
|
||||
#ifdef WAL_DEBUG
|
||||
static void xlog_outrec(char *buf, XLogRecord *record);
|
||||
#endif
|
||||
@ -1245,9 +1296,10 @@ static void
|
||||
XLogWrite(XLogwrtRqst WriteRqst)
|
||||
{
|
||||
XLogCtlWrite *Write = &XLogCtl->Write;
|
||||
char *from;
|
||||
bool ispartialpage;
|
||||
bool use_existent;
|
||||
int currentIndex = Write->curridx;
|
||||
XLogPages pages;
|
||||
|
||||
/* We should always be inside a critical section here */
|
||||
Assert(CritSectionCount > 0);
|
||||
@ -1258,6 +1310,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
|
||||
*/
|
||||
LogwrtResult = Write->LogwrtResult;
|
||||
|
||||
XLogPageReset(&pages);
|
||||
|
||||
while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
|
||||
{
|
||||
/*
|
||||
@ -1266,14 +1320,14 @@ XLogWrite(XLogwrtRqst WriteRqst)
|
||||
* end of the last page that's been initialized by
|
||||
* AdvanceXLInsertBuffer.
|
||||
*/
|
||||
if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
|
||||
if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[currentIndex]))
|
||||
elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
|
||||
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
|
||||
XLogCtl->xlblocks[Write->curridx].xlogid,
|
||||
XLogCtl->xlblocks[Write->curridx].xrecoff);
|
||||
XLogCtl->xlblocks[currentIndex].xlogid,
|
||||
XLogCtl->xlblocks[currentIndex].xrecoff);
|
||||
|
||||
/* Advance LogwrtResult.Write to end of current buffer page */
|
||||
LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
|
||||
LogwrtResult.Write = XLogCtl->xlblocks[currentIndex];
|
||||
ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
|
||||
|
||||
if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
|
||||
@ -1281,6 +1335,7 @@ XLogWrite(XLogwrtRqst WriteRqst)
|
||||
/*
|
||||
* Switch to new logfile segment.
|
||||
*/
|
||||
XLogPageFlush(&pages, currentIndex);
|
||||
if (openLogFile >= 0)
|
||||
{
|
||||
if (close(openLogFile))
|
||||
@ -1354,31 +1409,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
|
||||
openLogOff = 0;
|
||||
}
|
||||
|
||||
/* Need to seek in the file? */
|
||||
if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
|
||||
{
|
||||
openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
|
||||
if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
|
||||
ereport(PANIC,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not seek in log file %u, segment %u to offset %u: %m",
|
||||
openLogId, openLogSeg, openLogOff)));
|
||||
}
|
||||
|
||||
/* OK to write the page */
|
||||
from = XLogCtl->pages + Write->curridx * BLCKSZ;
|
||||
errno = 0;
|
||||
if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
|
||||
{
|
||||
/* if write didn't set errno, assume problem is no disk space */
|
||||
if (errno == 0)
|
||||
errno = ENOSPC;
|
||||
ereport(PANIC,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not write to log file %u, segment %u at offset %u: %m",
|
||||
openLogId, openLogSeg, openLogOff)));
|
||||
}
|
||||
openLogOff += BLCKSZ;
|
||||
/* Add a page to buffer */
|
||||
XLogPageWrite(&pages, currentIndex);
|
||||
|
||||
/*
|
||||
* If we just wrote the whole last page of a logfile segment,
|
||||
@ -1390,8 +1422,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
|
||||
* This is also the right place to notify the Archiver that the
|
||||
* segment is ready to copy to archival storage.
|
||||
*/
|
||||
if (openLogOff >= XLogSegSize && !ispartialpage)
|
||||
if (openLogOff + pages.size >= XLogSegSize && !ispartialpage)
|
||||
{
|
||||
XLogPageFlush(&pages, currentIndex);
|
||||
issue_xlog_fsync();
|
||||
LogwrtResult.Flush = LogwrtResult.Write; /* end of current page */
|
||||
|
||||
@ -1405,8 +1438,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
|
||||
LogwrtResult.Write = WriteRqst.Write;
|
||||
break;
|
||||
}
|
||||
Write->curridx = NextBufIdx(Write->curridx);
|
||||
currentIndex = NextBufIdx(currentIndex);
|
||||
}
|
||||
XLogPageFlush(&pages, currentIndex);
|
||||
|
||||
/*
|
||||
* If asked to flush, do so
|
||||
@ -3584,7 +3618,7 @@ XLOGShmemSize(void)
|
||||
if (XLOGbuffers < MinXLOGbuffers)
|
||||
XLOGbuffers = MinXLOGbuffers;
|
||||
|
||||
return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
|
||||
return XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
|
||||
+ BLCKSZ * XLOGbuffers +
|
||||
MAXALIGN(sizeof(ControlFileData));
|
||||
}
|
||||
@ -3601,7 +3635,7 @@ XLOGShmemInit(void)
|
||||
|
||||
XLogCtl = (XLogCtlData *)
|
||||
ShmemInitStruct("XLOG Ctl",
|
||||
MAXALIGN(sizeof(XLogCtlData) +
|
||||
XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) +
|
||||
sizeof(XLogRecPtr) * XLOGbuffers)
|
||||
+ BLCKSZ * XLOGbuffers,
|
||||
&foundXLog);
|
||||
@ -3630,9 +3664,9 @@ XLOGShmemInit(void)
|
||||
* Here, on the other hand, we must MAXALIGN to ensure the page
|
||||
* buffers have worst-case alignment.
|
||||
*/
|
||||
XLogCtl->pages =
|
||||
((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
|
||||
sizeof(XLogRecPtr) * XLOGbuffers);
|
||||
XLogCtl->pages = XLOG_BUFFER_POINTERALIGN(
|
||||
((char *) XLogCtl)
|
||||
+ sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers);
|
||||
memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);
|
||||
|
||||
/*
|
||||
@ -3690,10 +3724,9 @@ BootStrapXLOG(void)
|
||||
/* First timeline ID is always 1 */
|
||||
ThisTimeLineID = 1;
|
||||
|
||||
/* Use malloc() to ensure buffer is MAXALIGNED */
|
||||
buffer = (char *) malloc(BLCKSZ);
|
||||
page = (XLogPageHeader) buffer;
|
||||
memset(buffer, 0, BLCKSZ);
|
||||
buffer = (char *) malloc(BLCKSZ + ALIGNOF_XLOG_BUFFER);
|
||||
page = (XLogPageHeader) XLOG_BUFFER_POINTERALIGN(buffer);
|
||||
memset(page, 0, BLCKSZ);
|
||||
|
||||
/* Set up information for the initial checkpoint record */
|
||||
checkPoint.redo.xlogid = 0;
|
||||
@ -3745,7 +3778,7 @@ BootStrapXLOG(void)
|
||||
|
||||
/* Write the first page with the initial record */
|
||||
errno = 0;
|
||||
if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
|
||||
if (write(openLogFile, page, BLCKSZ) != BLCKSZ)
|
||||
{
|
||||
/* if write didn't set errno, assume problem is no disk space */
|
||||
if (errno == 0)
|
||||
@ -5837,3 +5870,71 @@ remove_backup_label(void)
|
||||
errmsg("could not remove file \"%s\": %m",
|
||||
BACKUP_LABEL_FILE)));
|
||||
}
|
||||
|
||||
|
||||
/* XLog gather-write staffs */
|
||||
|
||||
static void
|
||||
XLogPageReset(XLogPages *pages)
|
||||
{
|
||||
memset(pages, 0, sizeof(*pages));
|
||||
}
|
||||
|
||||
static void
|
||||
XLogPageWrite(XLogPages *pages, int index)
|
||||
{
|
||||
char *page = XLogCtl->pages + index * BLCKSZ;
|
||||
int size = BLCKSZ;
|
||||
int offset = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
|
||||
|
||||
if (pages->head + pages->size == page
|
||||
&& pages->offset + pages->size == offset)
|
||||
{ /* Pages are continuous. Append new page. */
|
||||
pages->size += size;
|
||||
}
|
||||
else
|
||||
{ /* Pages are not continuous. Flush and clear. */
|
||||
XLogPageFlush(pages, PrevBufIdx(index));
|
||||
pages->head = page;
|
||||
pages->size = size;
|
||||
pages->offset = offset;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
XLogPageFlush(XLogPages *pages, int index)
|
||||
{
|
||||
if (!pages->head)
|
||||
{ /* No needs to write pages. */
|
||||
XLogCtl->Write.curridx = index;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Need to seek in the file? */
|
||||
if (openLogOff != pages->offset)
|
||||
{
|
||||
openLogOff = pages->offset;
|
||||
if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
|
||||
ereport(PANIC,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not seek in log file %u, segment %u to offset %u: %m",
|
||||
openLogId, openLogSeg, openLogOff)));
|
||||
}
|
||||
|
||||
/* OK to write the page */
|
||||
errno = 0;
|
||||
if (write(openLogFile, pages->head, pages->size) != pages->size)
|
||||
{
|
||||
/* if write didn't set errno, assume problem is no disk space */
|
||||
if (errno == 0)
|
||||
errno = ENOSPC;
|
||||
ereport(PANIC,
|
||||
(errcode_for_file_access(),
|
||||
errmsg("could not write to log file %u, segment %u at offset %u: %m",
|
||||
openLogId, openLogSeg, openLogOff)));
|
||||
}
|
||||
|
||||
openLogOff += pages->size;
|
||||
XLogCtl->Write.curridx = index;
|
||||
XLogPageReset(pages);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user