From 4ef8c90b3bcd79d4f9363527d022d7c04cbae737 Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Tue, 25 Nov 2014 14:24:26 +0900 Subject: [PATCH 2/2] Support compression for full-page writes in WAL Compression is controlled with a new parameter called wal_compression. This parameter can be changed at session level to control WAL compression. --- contrib/pg_xlogdump/pg_xlogdump.c | 9 +- doc/src/sgml/config.sgml | 24 +++++ src/backend/access/transam/xlog.c | 5 ++ src/backend/access/transam/xloginsert.c | 121 ++++++++++++++++++++++---- src/backend/access/transam/xlogreader.c | 39 +++++++-- src/backend/utils/misc/guc.c | 9 ++ src/backend/utils/misc/postgresql.conf.sample | 1 + src/bin/pg_controldata/pg_controldata.c | 2 + src/bin/pg_resetxlog/pg_resetxlog.c | 2 + src/include/access/xlog.h | 1 + src/include/access/xlog_internal.h | 1 + src/include/access/xlogreader.h | 4 + src/include/access/xlogrecord.h | 5 ++ src/include/catalog/pg_control.h | 1 + 14 files changed, 201 insertions(+), 23 deletions(-) diff --git a/contrib/pg_xlogdump/pg_xlogdump.c b/contrib/pg_xlogdump/pg_xlogdump.c index 9f05e25..b3db55f 100644 --- a/contrib/pg_xlogdump/pg_xlogdump.c +++ b/contrib/pg_xlogdump/pg_xlogdump.c @@ -369,7 +369,9 @@ XLogDumpCountRecord(XLogDumpConfig *config, XLogDumpStats *stats, fpi_len = 0; for (block_id = 0; block_id <= record->max_block_id; block_id++) { - if (XLogRecHasBlockImage(record, block_id)) + if (XLogRecHasCompressedBlockImage(record, block_id)) + fpi_len += record->blocks[block_id].compress_len; + else if (XLogRecHasBlockImage(record, block_id)) fpi_len += BLCKSZ - record->blocks[block_id].hole_length; } @@ -465,9 +467,10 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) blk); if (XLogRecHasBlockImage(record, block_id)) { - printf(" (FPW); hole: offset: %u, length: %u\n", + printf(" (FPW); hole: offset: %u, length: %u, compressed: %u\n", record->blocks[block_id].hole_offset, - record->blocks[block_id].hole_length); + 
record->blocks[block_id].hole_length, + record->blocks[block_id].compress_len); } putchar('\n'); } diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index d607eca..4778c77 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2254,6 +2254,30 @@ include_dir 'conf.d' + + wal_compression (boolean) + + wal_compression configuration parameter + + + + + When this parameter is on, the PostgreSQL + server compresses the content of full-page writes when necessary and + inserts smaller records in WAL, reducing the amount of + WAL stored on disk. + + + + Compression has the advantage of reducing the amount of disk I/O when + doing WAL-logging, at the cost of some extra CPU to perform the + compression of an image. At WAL replay, compressed images still need + some more CPU cycles to perform the decompression of each block image, + but it can also reduce replay time in I/O-bound environments. + + + + wal_buffers (integer) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 0f09add..50dfed0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -88,6 +88,7 @@ char *XLogArchiveCommand = NULL; bool EnableHotStandby = false; bool fullPageWrites = true; bool wal_log_hints = false; +bool wal_compression = false; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; @@ -4610,6 +4611,7 @@ BootStrapXLOG(void) ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = wal_level; ControlFile->wal_log_hints = wal_log_hints; + ControlFile->wal_compression = wal_compression; ControlFile->track_commit_timestamp = track_commit_timestamp; ControlFile->data_checksum_version = bootstrap_data_checksum_version; @@ -8498,6 +8500,7 @@ XLogReportParameters(void) { if (wal_level != ControlFile->wal_level || wal_log_hints != ControlFile->wal_log_hints || + wal_compression != ControlFile->wal_compression || 
MaxConnections != ControlFile->MaxConnections || max_worker_processes != ControlFile->max_worker_processes || max_prepared_xacts != ControlFile->max_prepared_xacts || @@ -8522,6 +8525,7 @@ XLogReportParameters(void) xlrec.max_locks_per_xact = max_locks_per_xact; xlrec.wal_level = wal_level; xlrec.wal_log_hints = wal_log_hints; + xlrec.wal_compression = wal_compression; xlrec.track_commit_timestamp = track_commit_timestamp; XLogBeginInsert(); @@ -8537,6 +8541,7 @@ XLogReportParameters(void) ControlFile->max_locks_per_xact = max_locks_per_xact; ControlFile->wal_level = wal_level; ControlFile->wal_log_hints = wal_log_hints; + ControlFile->wal_compression = wal_compression; ControlFile->track_commit_timestamp = track_commit_timestamp; UpdateControlFile(); } diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index f3d610f..0b65eaf 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -27,9 +27,13 @@ #include "miscadmin.h" #include "storage/bufmgr.h" #include "storage/proc.h" +#include "utils/pg_lzcompress.h" #include "utils/memutils.h" #include "pg_trace.h" +/* maximum size for compression buffer of block image */ +#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) + /* * For each block reference registered with XLogRegisterBuffer, we fill in * a registered_buffer struct. 
@@ -50,6 +54,8 @@ typedef struct XLogRecData bkp_rdatas[2]; /* temporary rdatas used to hold references to * backup block data in XLogRecordAssemble() */ + char compressed_page[PGLZ_MAX_BLCKSZ]; /* recipient for compressed + * page */ } registered_buffer; static registered_buffer *registered_buffers; @@ -57,6 +63,9 @@ static int max_registered_buffers; /* allocated size */ static int max_registered_block_id = 0; /* highest block_id + 1 * currently registered */ +/* Scratch buffer used to store block image to-be-compressed */ +static char compression_scratch[PGLZ_MAX_BLCKSZ]; + /* * A chain of XLogRecDatas to hold the "main data" of a WAL record, registered * with XLogRegisterData(...). @@ -97,6 +106,9 @@ static XLogRecData *XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecPtr RedoRecPtr, bool doPageWrites, XLogRecPtr *fpw_lsn); +static bool XLogCompressBackupBlock(char *page, uint32 orig_len, + char *dest, uint16 *len); + /* * Begin constructing a WAL record. This must be called before the * XLogRegister* functions and XLogInsert(). @@ -150,6 +162,7 @@ XLogEnsureRecordSpace(int max_block_id, int ndatas) if (nbuffers > max_registered_buffers) { + int i; registered_buffers = (registered_buffer *) repalloc(registered_buffers, sizeof(registered_buffer) * nbuffers); @@ -529,6 +542,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if (needs_backup) { Page page = regbuf->page; + int compression_done = false; /* * The page needs to be backed up, so set up *bimg @@ -563,29 +577,76 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, /* Fill in the remaining fields in the XLogRecordBlockData struct */ bkpb.fork_flags |= BKPBLOCK_HAS_IMAGE; - total_len += BLCKSZ - bimg.hole_length; - /* - * Construct XLogRecData entries for the page content. + * Construct XLogRecData entries for the page content. If page + * compression is enabled instead of creating a new entry store + * the data in dedicated buffer to prepare for the compression. 
+ * If the page has a hole, skip it, achieving two levels + * of compression. */ - rdt_datas_last->next = &regbuf->bkp_rdatas[0]; - rdt_datas_last = rdt_datas_last->next; - if (bimg.hole_length == 0) + if (wal_compression) { - rdt_datas_last->data = page; - rdt_datas_last->len = BLCKSZ; + int page_len = BLCKSZ - bimg.hole_length; + uint16 compression_len; + + /* shape block image for compression and skip hole if any */ + if (bimg.hole_length == 0) + memcpy(compression_scratch, page, BLCKSZ); + else + { + /* Copy page content without hole */ + memcpy(compression_scratch, page, bimg.hole_offset); + memcpy(compression_scratch + bimg.hole_offset, + page + bimg.hole_offset + bimg.hole_length, + BLCKSZ - (bimg.hole_offset + bimg.hole_length)); + } + + /* Perform compression of block */ + if (XLogCompressBackupBlock(compression_scratch, + page_len, + regbuf->compressed_page, + &compression_len)) + { + /* compression is done, add record */ + compression_done = true; + bimg.compress_len = compression_len; + + rdt_datas_last->next = &regbuf->bkp_rdatas[0]; + rdt_datas_last = rdt_datas_last->next; + rdt_datas_last->data = regbuf->compressed_page; + rdt_datas_last->len = compression_len; + total_len += compression_len; + } } - else + + /* + * If compression has not been done, store this + * block image normally. 
+ */ + if (!compression_done) { - /* must skip the hole */ - rdt_datas_last->data = page; - rdt_datas_last->len = bimg.hole_offset; + total_len += BLCKSZ - bimg.hole_length; - rdt_datas_last->next = &regbuf->bkp_rdatas[1]; + rdt_datas_last->next = &regbuf->bkp_rdatas[0]; rdt_datas_last = rdt_datas_last->next; + if (bimg.hole_length == 0) + { + rdt_datas_last->data = page; + rdt_datas_last->len = BLCKSZ; + } + else + { + /* must skip the hole */ + rdt_datas_last->data = page; + rdt_datas_last->len = bimg.hole_offset; - rdt_datas_last->data = page + (bimg.hole_offset + bimg.hole_length); - rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + bimg.hole_length); + rdt_datas_last->next = &regbuf->bkp_rdatas[1]; + rdt_datas_last = rdt_datas_last->next; + + rdt_datas_last->data = page + (bimg.hole_offset + bimg.hole_length); + rdt_datas_last->len = BLCKSZ - (bimg.hole_offset + bimg.hole_length); + } + bimg.compress_len = 0; } } @@ -681,6 +742,35 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, } /* + * Create a compressed version of a backup block. If successful, return + * true and set 'len' to its length. If the block cannot be compressed or if + * compression failed, return false. + */ +static bool +XLogCompressBackupBlock(char *page, uint32 orig_len, char *dest, uint16 *len) +{ + /* leave if data can not be compressed */ + if (pglz_compress(page, orig_len, (PGLZ_Header *) dest, + PGLZ_strategy_default) != PGLZ_OK) + return false; + + /* + * We recheck the actual size even if pglz_compress() reports success, + * because it might be satisfied with having saved as little as one byte + * in the compressed data --- which could turn into a net loss once you + * consider header and alignment padding. Worst case, the compressed + * format might require three padding bytes (plus header, which is + * included in VARSIZE(buf)), whereas the uncompressed format would take + * only one header byte and no padding if the value is short enough. 
So + * we insist on a savings of more than 2 bytes to ensure we have a gain. + */ + *len = VARSIZE((struct varlena *) dest); + if (*len >= orig_len - 2) + return false; + return true; +} + +/* * Determine whether the buffer referenced has to be backed up. * * Since we don't yet have the insert lock, fullPageWrites and forcePageWrites @@ -875,6 +965,7 @@ InitXLogInsert(void) if (registered_buffers == NULL) { + int i; registered_buffers = (registered_buffer *) MemoryContextAllocZero(xloginsert_cxt, sizeof(registered_buffer) * (XLR_NORMAL_MAX_BLOCK_ID + 1)); diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 67d6223..462266a 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -20,6 +20,7 @@ #include "access/xlog_internal.h" #include "access/xlogreader.h" #include "catalog/pg_control.h" +#include "utils/pg_lzcompress.h" static bool allocate_recordbuf(XLogReaderState *state, uint32 reclength); @@ -1034,7 +1035,13 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) { COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); COPY_HEADER_FIELD(&blk->hole_length, sizeof(uint16)); - datatotal += BLCKSZ - blk->hole_length; + COPY_HEADER_FIELD(&blk->compress_len, sizeof(uint16)); + + /* adapt depending on presence of compressed image */ + if (blk->compress_len != 0) + datatotal += blk->compress_len; + else + datatotal += BLCKSZ - blk->hole_length; } if (!(fork_flags & BKPBLOCK_SAME_REL)) { @@ -1089,7 +1096,12 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (blk->has_image) { blk->bkp_image = ptr; - ptr += BLCKSZ - blk->hole_length; + + /* adapt depending on presence of compressed image */ + if (blk->compress_len != 0) + ptr += blk->compress_len; + else + ptr += BLCKSZ - blk->hole_length; } if (blk->has_data) { @@ -1195,6 +1207,8 @@ bool RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) { 
DecodedBkpBlock *bkpb; + char *uncompressed_page = NULL; + char *block_image; if (!record->blocks[block_id].in_use) return false; @@ -1202,20 +1216,35 @@ RestoreBlockImage(XLogReaderState *record, uint8 block_id, char *page) return false; bkpb = &record->blocks[block_id]; + block_image = bkpb->bkp_image; + + /* decompress block if needed before processing */ + if (bkpb->compress_len != 0) + { + PGLZ_Header *header = (PGLZ_Header *) block_image; + uncompressed_page = (char *) + palloc(PGLZ_RAW_SIZE(header)); + /* XXX: should check for status code here */ + pglz_decompress(header, uncompressed_page); + block_image = uncompressed_page; + } + /* generate page, taking into account hole if necessary */ if (bkpb->hole_length == 0) { - memcpy(page, bkpb->bkp_image, BLCKSZ); + memcpy(page, block_image, BLCKSZ); } else { - memcpy(page, bkpb->bkp_image, bkpb->hole_offset); + memcpy(page, block_image, bkpb->hole_offset); /* must zero-fill the hole */ MemSet(page + bkpb->hole_offset, 0, bkpb->hole_length); memcpy(page + (bkpb->hole_offset + bkpb->hole_length), - bkpb->bkp_image + bkpb->hole_offset, + block_image + bkpb->hole_offset, BLCKSZ - (bkpb->hole_offset + bkpb->hole_length)); } + if (uncompressed_page) + pfree(uncompressed_page); return true; } diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index b1bff7f..beb1bc2 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -927,6 +927,15 @@ static struct config_bool ConfigureNamesBool[] = false, NULL, NULL, NULL }, + { + {"wal_compression", PGC_USERSET, WAL_SETTINGS, + gettext_noop("Compresses full-page writes written in WAL file."), + NULL + }, + &wal_compression, + false, + NULL, NULL, NULL + }, { {"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT, diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index b053659..3e928f8 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ 
b/src/backend/utils/misc/postgresql.conf.sample @@ -191,6 +191,7 @@ #wal_buffers = -1 # min 32kB, -1 sets based on shared_buffers # (change requires restart) #wal_writer_delay = 200ms # 1-10000 milliseconds +#wal_compression = off # enable compression of full-page writes #commit_delay = 0 # range 0-100000, in microseconds #commit_siblings = 5 # range 1-1000 diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c index a838bb5..c15f5f4 100644 --- a/src/bin/pg_controldata/pg_controldata.c +++ b/src/bin/pg_controldata/pg_controldata.c @@ -294,6 +294,8 @@ main(int argc, char *argv[]) wal_level_str(ControlFile.wal_level)); printf(_("Current wal_log_hints setting: %s\n"), ControlFile.wal_log_hints ? _("on") : _("off")); + printf(_("Current wal_compression setting: %s\n"), + ControlFile.wal_compression ? _("on") : _("off")); printf(_("Current max_connections setting: %d\n"), ControlFile.MaxConnections); printf(_("Current max_worker_processes setting: %d\n"), diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c index f42d515..f4abe3c 100644 --- a/src/bin/pg_resetxlog/pg_resetxlog.c +++ b/src/bin/pg_resetxlog/pg_resetxlog.c @@ -579,6 +579,7 @@ GuessControlValues(void) ControlFile.wal_level = WAL_LEVEL_MINIMAL; ControlFile.wal_log_hints = false; + ControlFile.wal_compression = false; ControlFile.track_commit_timestamp = false; ControlFile.MaxConnections = 100; ControlFile.max_worker_processes = 8; @@ -795,6 +796,7 @@ RewriteControlFile(void) */ ControlFile.wal_level = WAL_LEVEL_MINIMAL; ControlFile.wal_log_hints = false; + ControlFile.wal_compression = false; ControlFile.track_commit_timestamp = false; ControlFile.MaxConnections = 100; ControlFile.max_worker_processes = 8; diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index d06fbc0..6bdfa4a 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -98,6 +98,7 @@ extern char *XLogArchiveCommand; extern bool 
EnableHotStandby; extern bool fullPageWrites; extern bool wal_log_hints; +extern bool wal_compression; extern bool log_checkpoints; /* WAL levels */ diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 825cf54..fd058ad 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -186,6 +186,7 @@ typedef struct xl_parameter_change int max_locks_per_xact; int wal_level; bool wal_log_hints; + bool wal_compression; bool track_commit_timestamp; } xl_parameter_change; diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index eb6cc89..3db312d 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -55,6 +55,7 @@ typedef struct char *bkp_image; uint16 hole_offset; uint16 hole_length; + uint16 compress_len; /* Buffer holding the rmgr-specific data associated with this block */ bool has_data; @@ -191,6 +192,9 @@ extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, ((decoder)->blocks[block_id].in_use) #define XLogRecHasBlockImage(decoder, block_id) \ ((decoder)->blocks[block_id].has_image) +#define XLogRecHasCompressedBlockImage(decoder, block_id) \ + (XLogRecHasBlockImage(decoder, block_id) && \ + (decoder)->blocks[block_id].compress_len != 0) extern bool RestoreBlockImage(XLogReaderState *recoder, uint8 block_id, char *dst); extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len); diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 11ddfac..cb58422 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -103,11 +103,16 @@ typedef struct XLogRecordBlockHeader * such a "hole" from the stored data (and it's not counted in the * XLOG record's CRC, either). Hence, the amount of block data actually * present is BLCKSZ - hole_length bytes. + * + * compress_len indicates the length of this block when compressed. 
A length + * of 0 means that this block is not compressed. If the block image has a hole + * the block image is compressed without the hole. */ typedef struct XLogRecordBlockImageHeader { uint16 hole_offset; /* number of bytes before "hole" */ uint16 hole_length; /* number of bytes in "hole" */ + uint16 compress_len; /* size of compressed block */ } XLogRecordBlockImageHeader; #define SizeOfXLogRecordBlockImageHeader sizeof(XLogRecordBlockImageHeader) diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index 6e9cac9..296e5b0 100644 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -175,6 +175,7 @@ typedef struct ControlFileData */ int wal_level; bool wal_log_hints; + bool wal_compression; int MaxConnections; int max_worker_processes; int max_prepared_xacts; -- 2.2.0