From e6c9d41205599d9e622858a8699e47e3c5e9136e Mon Sep 17 00:00:00 2001 From: Andrew Dunstan Date: Sat, 13 Jan 2024 08:16:41 -0500 Subject: [PATCH v7 3/4] Use incremental parsing of backup manifests. This changes the three callers of json_parse_manifest() to use json_parse_manifest_incremental_chunk() if appropriate. In the case of the backend caller, since we don't know the size of the manifest in advance, we always call the incremental parser. --- src/backend/backup/basebackup_incremental.c | 53 ++++++++---- src/bin/pg_combinebackup/load_manifest.c | 93 ++++++++++++++++----- src/bin/pg_verifybackup/pg_verifybackup.c | 91 ++++++++++++++------ 3 files changed, 177 insertions(+), 60 deletions(-) diff --git a/src/backend/backup/basebackup_incremental.c b/src/backend/backup/basebackup_incremental.c index 0504c465db..9e1973b2d3 100644 --- a/src/backend/backup/basebackup_incremental.c +++ b/src/backend/backup/basebackup_incremental.c @@ -31,6 +31,14 @@ #define BLOCKS_PER_READ 512 +/* + * We expect to find the last lines of the manifest, including the checksum, + * in the last MIN_CHUNK bytes of the manifest. We trigger an incremental + * parse step if we are about to overflow MAX_CHUNK bytes. + */ +#define MIN_CHUNK 1024 +#define MAX_CHUNK (128 * 1024) + /* * Details extracted from the WAL ranges present in the supplied backup manifest. */ @@ -110,6 +118,11 @@ struct IncrementalBackupInfo * turns out to be a problem in practice, we'll need to be more clever. 
 */ BlockRefTable *brtab; + + /* + * State object for incremental JSON parsing + */ + JsonManifestParseIncrementalState *inc_state; }; static void manifest_process_file(JsonManifestParseContext *context, @@ -136,6 +149,7 @@ CreateIncrementalBackupInfo(MemoryContext mcxt) { IncrementalBackupInfo *ib; MemoryContext oldcontext; + JsonManifestParseContext *context; oldcontext = MemoryContextSwitchTo(mcxt); @@ -151,6 +165,15 @@ CreateIncrementalBackupInfo(MemoryContext mcxt) */ ib->manifest_files = backup_file_create(mcxt, 10000, NULL); + context = palloc0(sizeof(JsonManifestParseContext)); + /* Parse the manifest. */ + context->private_data = ib; + context->per_file_cb = manifest_process_file; + context->per_wal_range_cb = manifest_process_wal_range; + context->error_cb = manifest_report_error; + + ib->inc_state = json_parse_manifest_incremental_init(context); + MemoryContextSwitchTo(oldcontext); return ib; @@ -170,13 +193,19 @@ AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data, /* Switch to our memory context. */ oldcontext = MemoryContextSwitchTo(ib->mcxt); - /* - * XXX. Our json parser is at present incapable of parsing json blobs - * incrementally, so we have to accumulate the entire backup manifest - * before we can do anything with it. This should really be fixed, since - * some users might have very large numbers of files in the data - * directory. - */ + if (ib->buf.len >= MIN_CHUNK && ib->buf.len + len > MAX_CHUNK) + { + /* + * Time for an incremental parse. We'll do all but the last MIN_CHUNK + * bytes so that we have enough left for the final piece. + */ + json_parse_manifest_incremental_chunk( + ib->inc_state, ib->buf.data, ib->buf.len - MIN_CHUNK, false); + /* now remove what we just parsed */ + memmove(ib->buf.data, ib->buf.data + (ib->buf.len - MIN_CHUNK), MIN_CHUNK + 1); + ib->buf.len = MIN_CHUNK; + } + appendBinaryStringInfo(&ib->buf, data, len); /* Switch back to previous memory context. 
*/ @@ -190,18 +219,14 @@ AppendIncrementalManifestData(IncrementalBackupInfo *ib, const char *data, void FinalizeIncrementalManifest(IncrementalBackupInfo *ib) { - JsonManifestParseContext context; MemoryContext oldcontext; /* Switch to our memory context. */ oldcontext = MemoryContextSwitchTo(ib->mcxt); - /* Parse the manifest. */ - context.private_data = ib; - context.per_file_cb = manifest_process_file; - context.per_wal_range_cb = manifest_process_wal_range; - context.error_cb = manifest_report_error; - json_parse_manifest(&context, ib->buf.data, ib->buf.len); + /* parse the last chunk of the manifest */ + json_parse_manifest_incremental_chunk( + ib->inc_state, ib->buf.data, ib->buf.len, true); /* Done with the buffer, so release memory. */ pfree(ib->buf.data); diff --git a/src/bin/pg_combinebackup/load_manifest.c b/src/bin/pg_combinebackup/load_manifest.c index 2b8e74fcf3..982be78e28 100644 --- a/src/bin/pg_combinebackup/load_manifest.c +++ b/src/bin/pg_combinebackup/load_manifest.c @@ -34,6 +34,12 @@ */ #define ESTIMATED_BYTES_PER_MANIFEST_LINE 100 +/* + * size of json chunk to be read in + * + */ +#define READ_CHUNK_SIZE (128 * 1024) + /* * Define a hash table which we can use to store information about the files * mentioned in the backup manifest. @@ -105,6 +111,7 @@ load_backup_manifest(char *backup_directory) int rc; JsonManifestParseContext context; manifest_data *result; + int chunk_size = READ_CHUNK_SIZE; /* Open the manifest file. */ snprintf(pathname, MAXPGPATH, "%s/backup_manifest", backup_directory); @@ -129,34 +136,76 @@ load_backup_manifest(char *backup_directory) /* Create the hash table. */ ht = manifest_files_create(initial_size, NULL); - /* - * Slurp in the whole file. - * - * This is not ideal, but there's currently no way to get pg_parse_json() - * to perform incremental parsing. 
- */ - buffer = pg_malloc(statbuf.st_size); - rc = read(fd, buffer, statbuf.st_size); - if (rc != statbuf.st_size) - { - if (rc < 0) - pg_fatal("could not read file \"%s\": %m", pathname); - else - pg_fatal("could not read file \"%s\": read %d of %lld", - pathname, rc, (long long int) statbuf.st_size); - } - - /* Close the manifest file. */ - close(fd); - - /* Parse the manifest. */ result = pg_malloc0(sizeof(manifest_data)); result->files = ht; context.private_data = result; context.per_file_cb = combinebackup_per_file_cb; context.per_wal_range_cb = combinebackup_per_wal_range_cb; context.error_cb = report_manifest_error; - json_parse_manifest(&context, buffer, statbuf.st_size); + + /* + * Parse the file, in chunks if necessary. + */ + if (statbuf.st_size <= chunk_size) + { + buffer = pg_malloc(statbuf.st_size); + rc = read(fd, buffer, statbuf.st_size); + if (rc != statbuf.st_size) + { + if (rc < 0) + pg_fatal("could not read file \"%s\": %m", pathname); + else + pg_fatal("could not read file \"%s\": read %d of %lld", + pathname, rc, (long long int) statbuf.st_size); + } + + /* Close the manifest file. */ + close(fd); + + /* Parse the manifest. */ + json_parse_manifest(&context, buffer, statbuf.st_size); + } + else + { + int bytes_left = statbuf.st_size; + JsonManifestParseIncrementalState *inc_state; + + inc_state = json_parse_manifest_incremental_init(&context); + + buffer = pg_malloc(chunk_size + 1); + + while (bytes_left > 0) + { + int bytes_to_read = chunk_size; + + /* + * Make sure that the last chunk is sufficiently large. (i.e. at + * least half the chunk size) so that it will contain fully the + * piece at the end with the checksum. 
+ */ + if (bytes_left < chunk_size) + bytes_to_read = bytes_left; + else if (bytes_left < 2 * chunk_size) + bytes_to_read = bytes_left / 2; + rc = read(fd, buffer, bytes_to_read); + buffer[rc] = '\0'; /* useful for writing log traces */ + if (rc != bytes_to_read) + { + if (rc < 0) + pg_fatal("could not read file \"%s\": %m", pathname); + else + pg_fatal("could not read file \"%s\": read %lld of %lld", + pathname, + (long long int)(statbuf.st_size + rc - bytes_left), + (long long int) statbuf.st_size); + } + bytes_left -= rc; + json_parse_manifest_incremental_chunk( + inc_state, buffer, rc, bytes_left == 0); + } + + close(fd); + } /* All done. */ pfree(buffer); diff --git a/src/bin/pg_verifybackup/pg_verifybackup.c b/src/bin/pg_verifybackup/pg_verifybackup.c index ae8c18f373..02b160f9fc 100644 --- a/src/bin/pg_verifybackup/pg_verifybackup.c +++ b/src/bin/pg_verifybackup/pg_verifybackup.c @@ -42,7 +42,7 @@ /* * How many bytes should we try to read from a file at once? */ -#define READ_CHUNK_SIZE 4096 +#define READ_CHUNK_SIZE (128 * 1024) /* * Each file described by the manifest file is parsed to produce an object @@ -399,6 +399,8 @@ parse_manifest_file(char *manifest_path, manifest_files_hash **ht_p, parser_context private_context; JsonManifestParseContext context; + int chunk_size = READ_CHUNK_SIZE; + /* Open the manifest file. */ if ((fd = open(manifest_path, O_RDONLY | PG_BINARY, 0)) < 0) report_fatal_error("could not open file \"%s\": %m", manifest_path); @@ -414,28 +416,6 @@ parse_manifest_file(char *manifest_path, manifest_files_hash **ht_p, /* Create the hash table. */ ht = manifest_files_create(initial_size, NULL); - /* - * Slurp in the whole file. - * - * This is not ideal, but there's currently no easy way to get - * pg_parse_json() to perform incremental parsing. 
- */ - buffer = pg_malloc(statbuf.st_size); - rc = read(fd, buffer, statbuf.st_size); - if (rc != statbuf.st_size) - { - if (rc < 0) - report_fatal_error("could not read file \"%s\": %m", - manifest_path); - else - report_fatal_error("could not read file \"%s\": read %d of %lld", - manifest_path, rc, (long long int) statbuf.st_size); - } - - /* Close the manifest file. */ - close(fd); - - /* Parse the manifest. */ private_context.ht = ht; private_context.first_wal_range = NULL; private_context.last_wal_range = NULL; @@ -443,7 +423,70 @@ parse_manifest_file(char *manifest_path, manifest_files_hash **ht_p, context.per_file_cb = verifybackup_per_file_cb; context.per_wal_range_cb = verifybackup_per_wal_range_cb; context.error_cb = report_manifest_error; - json_parse_manifest(&context, buffer, statbuf.st_size); + + /* + * Parse the file, in chunks if necessary. + */ + if (statbuf.st_size <= chunk_size) + { + buffer = pg_malloc(statbuf.st_size); + rc = read(fd, buffer, statbuf.st_size); + if (rc != statbuf.st_size) + { + if (rc < 0) + pg_fatal("could not read file \"%s\": %m", manifest_path); + else + pg_fatal("could not read file \"%s\": read %d of %lld", + manifest_path, rc, (long long int) statbuf.st_size); + } + + /* Close the manifest file. */ + close(fd); + + /* Parse the manifest. */ + json_parse_manifest(&context, buffer, statbuf.st_size); + } + else + { + int bytes_left = statbuf.st_size; + JsonManifestParseIncrementalState *inc_state; + + inc_state = json_parse_manifest_incremental_init(&context); + + buffer = pg_malloc(chunk_size + 1); + + while (bytes_left > 0) + { + int bytes_to_read = chunk_size; + + /* + * Make sure that the last chunk is sufficiently large. (i.e. at + * least half the chunk size) so that it will contain fully the + * piece at the end with the checksum. 
+ */ + if (bytes_left < chunk_size) + bytes_to_read = bytes_left; + else if (bytes_left < 2 * chunk_size) + bytes_to_read = bytes_left / 2; + rc = read(fd, buffer, bytes_to_read); + buffer[rc] = '\0'; /* useful for writing log traces */ + if (rc != bytes_to_read) + { + if (rc < 0) + pg_fatal("could not read file \"%s\": %m", manifest_path); + else + pg_fatal("could not read file \"%s\": read %lld of %lld", + manifest_path, + (long long int)(statbuf.st_size + rc - bytes_left), + (long long int) statbuf.st_size); + } + bytes_left -= rc; + json_parse_manifest_incremental_chunk( + inc_state, buffer, rc, bytes_left == 0); + } + + close(fd); + } /* Done with the buffer. */ pfree(buffer); -- 2.34.1