From d93a6c97bad19d3718f0e4f06caeac5ce9937b37 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy
Date: Thu, 8 Dec 2022 09:37:01 +0000
Subject: [PATCH v1] Improve WALRead() to suck data directly from WAL buffers when possible

---
 src/backend/access/transam/xlog.c | 184 ++++++++++++++++++++++++
 src/backend/access/transam/xlogreader.c | 58 +++++++-
 src/include/access/xlog.h | 9 ++
 3 files changed, 249 insertions(+), 2 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a31fbbff78..196be98591 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -689,6 +689,7 @@ static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
 							  XLogRecPtr *PrevPtr);
 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
 static char *GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli);
+static char *GetXLogBufferForRead(XLogRecPtr ptr, TimeLineID tli, char *page);
 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
@@ -1639,6 +1640,189 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli)
 	return cachedPos + ptr % XLOG_BLCKSZ;
 }
 
+/*
+ * Copy the WAL buffer page containing 'ptr' into 'page' and return a pointer
+ * to that offset within the copy, or NULL if no valid matching page is found.
+ */
+static char *
+GetXLogBufferForRead(XLogRecPtr ptr, TimeLineID tli, char *page)
+{
+	XLogRecPtr expectedEndPtr;
+	XLogRecPtr endptr;
+	int idx;
+	char *recptr = NULL;
+
+	idx = XLogRecPtrToBufIdx(ptr);
+	expectedEndPtr = ptr;
+	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
+
+	/*
+	 * Hold WALBufMappingLock in shared mode so that the other concurrent WAL
+	 * readers are also allowed. We try to do as little work as possible while
+	 * holding the lock as it might impact concurrent WAL writers.
+	 *
+	 * XXX: Perhaps, measuring the immediate lock availability and its impact
+	 * on concurrent WAL writers is a good idea here.
+	 *
+	 * XXX: Perhaps, returning if the lock is not immediately available is a
+	 * good idea here. The caller can then go ahead and read WAL from WAL file.
+	 *
+	 * XXX: Perhaps, quickly finding if the given WAL record is in WAL buffers
+	 * is a good idea here. It avoids needless lock acquire-release cycles.
+	 * One way to do that is by tracking the oldest WAL record that's currently
+	 * present in WAL buffers.
+	 */
+	LWLockAcquire(WALBufMappingLock, LW_SHARED);
+
+	/*
+	 * Holding WALBufMappingLock ensures inserters don't overwrite this value
+	 * while we are reading it.
+	 */
+	endptr = XLogCtl->xlblocks[idx];
+
+	if (expectedEndPtr == endptr)
+	{
+		XLogPageHeader phdr;
+
+		/*
+		 * We have found the WAL buffer page holding the given LSN. Copy the
+		 * whole page into the caller's buffer while still holding the lock.
+		 */
+		memcpy(page, (XLogCtl->pages + idx * (Size) XLOG_BLCKSZ),
+			   (Size) XLOG_BLCKSZ);
+
+		/*
+		 * Release the lock as early as possible to avoid any possible
+		 * contention.
+		 */
+		LWLockRelease(WALBufMappingLock);
+
+		/*
+		 * Even though we copied the WAL buffer page while holding the lock,
+		 * we still want to be extra cautious here and serve only a valid WAL
+		 * buffer page.
+		 *
+		 * XXX: Perhaps, we can further go and validate the found page header,
+		 * record header and record at least in assert builds, something like
+		 * what xlogreader.c does, and return if any of those validity checks
+		 * fail. Having said that, we stick to the minimal checks for now.
+		 */
+		phdr = (XLogPageHeader) page;
+
+		if (phdr->xlp_magic == XLOG_PAGE_MAGIC &&
+			phdr->xlp_pageaddr == (ptr - (ptr % XLOG_BLCKSZ)) &&
+			phdr->xlp_tli == tli)
+		{
+			/*
+			 * Page looks valid, so return a pointer to the requested LSN's
+			 * offset within the copied page.
+			 */
+			recptr = page + ptr % XLOG_BLCKSZ;
+		}
+	}
+	else
+	{
+		/* We have not found anything. */
+		LWLockRelease(WALBufMappingLock);
+	}
+
+	return recptr;
+}
+
+/*
+ * When possible, read 'count' bytes of WAL starting at 'startptr' from WAL
+ * buffers into the caller-supplied buffer 'buf'. Read as much WAL as possible
+ * from the WAL buffers; the caller is responsible for reading any remaining
+ * WAL directly from WAL files.
+ *
+ * The number of bytes read is stored in 'read_bytes', and one of 'hit',
+ * 'partial_hit' or 'miss' is set to true accordingly.
+ */
+void
+XLogReadFromBuffers(XLogRecPtr startptr,
+					TimeLineID tli,
+					Size count,
+					char *buf,
+					Size *read_bytes,
+					bool *hit,
+					bool *partial_hit,
+					bool *miss)
+{
+	XLogRecPtr ptr;
+	char *dst;
+	Size nbytes;
+
+	Assert(!XLogRecPtrIsInvalid(startptr));
+	Assert(count > 0);
+	Assert(startptr <= GetFlushRecPtr(NULL));
+	Assert(!RecoveryInProgress());
+
+	ptr = startptr;
+	nbytes = count;
+	dst = buf;
+	*read_bytes = 0;
+	*hit = false;
+	*partial_hit = false;
+	*miss = false;
+
+	while (nbytes > 0)
+	{
+		char page[XLOG_BLCKSZ] = {0};
+		char *recptr;
+
+		recptr = GetXLogBufferForRead(ptr, tli, page);
+
+		if (recptr == NULL)
+			break;
+
+		if ((recptr + nbytes) <= (page + XLOG_BLCKSZ))
+		{
+			/* All the remaining bytes are in this page. */
+			memcpy(dst, recptr, nbytes);
+			dst += nbytes;
+			*read_bytes += nbytes;
+			ptr += nbytes;
+			nbytes = 0;
+		}
+		else if ((recptr + nbytes) > (page + XLOG_BLCKSZ))
+		{
+			/* Not all the remaining bytes are in this page. */
+			Size bytes_remaining;
+
+			/*
+			 * Compute the remaining bytes on the current page, copy them over
+			 * to the output buffer and move on to the next page.
+			 */
+			bytes_remaining = XLOG_BLCKSZ - (recptr - page);
+			memcpy(dst, recptr, bytes_remaining);
+			dst += bytes_remaining;
+			nbytes -= bytes_remaining;
+			*read_bytes += bytes_remaining;
+			ptr += bytes_remaining;
+		}
+	}
+
+	if (*read_bytes == count)
+	{
+		/* It's a buffer hit. */
+		*hit = true;
+	}
+	else if (*read_bytes > 0 &&
+			 *read_bytes < count)
+	{
+		/* It's a partial buffer hit. */
+		*partial_hit = true;
+	}
+	else if (*read_bytes == 0)
+	{
+		/* It's a buffer miss. */
+		*miss = true;
+	}
+
+	elog(DEBUG1, "read %zu bytes out of %zu bytes from WAL buffers for given LSN %X/%X",
+		 *read_bytes, count, LSN_FORMAT_ARGS(startptr));
+}
+
 /*
  * Converts a "usable byte position" to XLogRecPtr. A usable byte position
  * is the position starting from the beginning of WAL, excluding all WAL
diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c
index a38a80e049..7ec94a0535 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -1485,8 +1485,7 @@ err:
  * Returns true if succeeded, false if an error occurs, in which case
  * 'errinfo' receives error details.
  *
- * XXX probably this should be improved to suck data directly from the
- * WAL buffers when possible.
+ * When possible, this function reads data directly from WAL buffers.
  */
 bool
 WALRead(XLogReaderState *state,
@@ -1497,6 +1496,61 @@ WALRead(XLogReaderState *state,
 	XLogRecPtr	recptr;
 	Size		nbytes;
 
+#ifndef FRONTEND
+	/* Frontend tools have no access to WAL buffers. */
+	Size read_bytes;
+	bool hit;
+	bool partial_hit;
+	bool miss;
+
+	/*
+	 * When possible, read WAL from WAL buffers. We skip this step and continue
+	 * the usual way, that is to read from WAL file, either when the server is
+	 * in recovery (standby mode, archive or crash recovery), in which case the
+	 * WAL buffers are not used, or when the server is inserting on a timeline
+	 * different from the one that we're trying to read WAL from.
+	 */
+	if (!RecoveryInProgress() &&
+		tli == GetWALInsertionTimeLine())
+	{
+		pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
+		XLogReadFromBuffers(startptr, tli, count, buf, &read_bytes,
+							&hit, &partial_hit, &miss);
+		pgstat_report_wait_end();
+
+		if (hit)
+		{
+			/*
+			 * We have fully read the requested WAL from WAL buffers, so
+			 * return.
+			 */
+			Assert(count == read_bytes);
+			return true;
+		}
+		else if (partial_hit)
+		{
+			/*
+			 * We have partially read from WAL buffers, so skip past those
+			 * bytes and read the remaining bytes the usual way.
+			 */
+			Assert(read_bytes > 0 && count > read_bytes);
+			buf += read_bytes;
+			startptr += read_bytes;
+			count -= read_bytes;
+		}
+#ifdef USE_ASSERT_CHECKING
+		else if (miss)
+		{
+			/*
+			 * We have not read anything from WAL buffers, so read the usual
+			 * way, that is, from the WAL file.
+			 */
+			Assert(read_bytes == 0);
+		}
+#endif /* USE_ASSERT_CHECKING */
+	}
+#endif /* FRONTEND */
+
 	p = buf;
 	recptr = startptr;
 	nbytes = count;
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 1fbd48fbda..968608353e 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -247,6 +247,15 @@ extern XLogRecPtr GetLastImportantRecPtr(void);
 
 extern void SetWalWriterSleeping(bool sleeping);
 
+extern void XLogReadFromBuffers(XLogRecPtr startptr,
+								TimeLineID tli,
+								Size count,
+								char *buf,
+								Size *read_bytes,
+								bool *hit,
+								bool *partial_hit,
+								bool *miss);
+
 /*
  * Routines used by xlogrecovery.c to call back into xlog.c during recovery.
  */
-- 
2.34.1