From 287777a6fe46fb897eff7d77e11bd021ad549c56 Mon Sep 17 00:00:00 2001 From: Amit Kapila Date: Tue, 14 Jul 2020 10:56:51 +0530 Subject: [PATCH v49 1/5] Extend the BufFile interface. Allow BufFile to support temporary files that can be used by the single backend when the corresponding files need to be survived across the transaction and need to be opened and closed multiple times. Such files need to be created as a member of a SharedFileSet. Implement the interface for BufFileTruncate to allow files to be truncated up to a particular offset. Extend BufFileSeek API to support SEEK_END case. Add an option to provide a mode while opening the shared BufFiles instead of always opening in read-only mode. These enhancements in BufFile interface are required for the upcoming patch to allow the replication apply worker, to properly handle streamed in-progress transactions. Author: Dilip Kumar, Amit Kapila Reviewed-by: Amit Kapila Tested-by: Neha Sharma Discussion: https://postgr.es/m/688b0b7f-2f6c-d827-c27b-216a8e3ea700@2ndquadrant.com --- src/backend/postmaster/pgstat.c | 3 + src/backend/storage/file/buffile.c | 92 +++++++++++++++++++++++--- src/backend/storage/file/fd.c | 9 ++- src/backend/storage/file/sharedfileset.c | 103 +++++++++++++++++++++++++++--- src/backend/utils/sort/logtape.c | 4 +- src/backend/utils/sort/sharedtuplestore.c | 2 +- src/include/pgstat.h | 1 + src/include/storage/buffile.h | 4 +- src/include/storage/fd.h | 2 +- src/include/storage/sharedfileset.h | 4 +- 10 files changed, 196 insertions(+), 28 deletions(-) diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 73ce944..8116b23 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3940,6 +3940,9 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_BUFFILE_WRITE: event_name = "BufFileWrite"; break; + case WAIT_EVENT_BUFFILE_TRUNCATE: + event_name = "BufFileTruncate"; + break; case WAIT_EVENT_CONTROL_FILE_READ: event_name = "ControlFileRead"; break; diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c index 2d7a082..939f092 100644 --- a/src/backend/storage/file/buffile.c +++ b/src/backend/storage/file/buffile.c @@ -32,10 +32,14 @@ * (by opening multiple fd.c temporary files). This is an essential feature * for sorts and hashjoins on large amounts of data. * - * BufFile supports temporary files that can be made read-only and shared with - * other backends, as infrastructure for parallel execution. Such files need - * to be created as a member of a SharedFileSet that all participants are - * attached to. + * BufFile supports temporary files that can be shared with other backends, as + * infrastructure for parallel execution. Such files need to be created as a + * member of a SharedFileSet that all participants are attached to. + * + * BufFile also supports temporary files that can be used by the single backend + * when the corresponding files need to be survived across the transaction and + * need to be opened and closed multiple times. Such files need to be created + * as a member of a SharedFileSet. *------------------------------------------------------------------------- */ @@ -277,7 +281,7 @@ BufFileCreateShared(SharedFileSet *fileset, const char *name) * backends and render it read-only. */ BufFile * -BufFileOpenShared(SharedFileSet *fileset, const char *name) +BufFileOpenShared(SharedFileSet *fileset, const char *name, int mode) { BufFile *file; char segment_name[MAXPGPATH]; @@ -301,7 +305,7 @@ BufFileOpenShared(SharedFileSet *fileset, const char *name) } /* Try to load a segment. */ SharedSegmentName(segment_name, name, nfiles); - files[nfiles] = SharedFileSetOpen(fileset, segment_name); + files[nfiles] = SharedFileSetOpen(fileset, segment_name, mode); if (files[nfiles] <= 0) break; ++nfiles; @@ -321,7 +325,7 @@ BufFileOpenShared(SharedFileSet *fileset, const char *name) file = makeBufFileCommon(nfiles); file->files = files; - file->readOnly = true; /* Can't write to files opened this way */ + file->readOnly = (mode == O_RDONLY) ? true : false; file->fileset = fileset; file->name = pstrdup(name); @@ -364,6 +368,9 @@ BufFileDeleteShared(SharedFileSet *fileset, const char *name) if (!found) elog(ERROR, "could not delete unknown shared BufFile \"%s\"", name); + + /* Unregister the shared fileset */ + SharedFileSetUnregister(fileset); } /* @@ -666,11 +673,21 @@ BufFileSeek(BufFile *file, int fileno, off_t offset, int whence) newFile = file->curFile; newOffset = (file->curOffset + file->pos) + offset; break; -#ifdef NOT_USED case SEEK_END: - /* could be implemented, not needed currently */ + + /* + * The file size of the last file gives us the end offset of that + * file. + */ + newFile = file->numFiles - 1; + newOffset = FileSize(file->files[file->numFiles - 1]); + if (newOffset < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m", + FilePathName(file->files[file->numFiles - 1]), + file->name))); break; -#endif default: elog(ERROR, "invalid whence: %d", whence); return EOF; @@ -838,3 +855,58 @@ BufFileAppend(BufFile *target, BufFile *source) return startBlock; } + +/* + * Truncate the file upto the given fileno and the offset. + */ +void +BufFileTruncateShared(BufFile *file, int fileno, off_t offset) +{ + int numFiles = file->numFiles; + int curFile = file->curFile; + off_t curOffset = file->curOffset; + char segment_name[MAXPGPATH]; + int i; + + /* Loop over all the files upto the fileno which we want to truncate. */ + for (i = file->numFiles - 1; i >= fileno; i--) + { + /* + * Except the fileno, we can directly delete other files. If the + * offset is 0 then we can delete the fileno file as well unless it is + * the first file. + */ + if ((i != fileno || offset == 0) && fileno != 0) + { + SharedSegmentName(segment_name, file->name, i); + FileClose(file->files[i]); + if (!SharedFileSetDelete(file->fileset, segment_name, true)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not delete shared fileset \"%s\": %m", + segment_name))); + numFiles--; + curFile--; + curOffset = MAX_PHYSICAL_FILESIZE; + } + else + { + if (FileTruncate(file->files[i], offset, + WAIT_EVENT_BUFFILE_TRUNCATE) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\": %m", + FilePathName(file->files[i])))); + curOffset = offset; + } + } + + /* Otherwise, must reposition buffer, so flush any dirty data */ + BufFileFlush(file); + + file->numFiles = numFiles; + file->curFile = curFile; + file->curOffset = curOffset; + file->pos = 0; + file->nbytes = 0; +} diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 5f6420e..f376a97 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1743,18 +1743,17 @@ PathNameCreateTemporaryFile(const char *path, bool error_on_failure) /* * Open a file that was created with PathNameCreateTemporaryFile, possibly in * another backend. Files opened this way don't count against the - * temp_file_limit of the caller, are read-only and are automatically closed - * at the end of the transaction but are not deleted on close. + * temp_file_limit of the caller, are automatically closed at the end of the + * transaction but are not deleted on close. */ File -PathNameOpenTemporaryFile(const char *path) +PathNameOpenTemporaryFile(const char *path, int mode) { File file; ResourceOwnerEnlargeFiles(CurrentResourceOwner); - /* We open the file read-only. */ - file = PathNameOpenFile(path, O_RDONLY | PG_BINARY); + file = PathNameOpenFile(path, mode | PG_BINARY); /* If no such file, then we don't raise an error. */ if (file <= 0 && errno != ENOENT) diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c index 16b7594..b183805 100644 --- a/src/backend/storage/file/sharedfileset.c +++ b/src/backend/storage/file/sharedfileset.c @@ -13,6 +13,10 @@ * files can be discovered by name, and a shared ownership semantics so that * shared files survive until the last user detaches. * + * SharedFileSets can be used by backends when the temporary files need to be + * opened/closed multiple times and the underlying files need to survive across + * transactions. + * *------------------------------------------------------------------------- */ @@ -25,25 +29,35 @@ #include "common/hashfn.h" #include "miscadmin.h" #include "storage/dsm.h" +#include "storage/ipc.h" #include "storage/sharedfileset.h" #include "utils/builtins.h" +static List *filesetlist = NIL; + static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum); +static void SharedFileSetDeleteOnProcExit(int status, Datum arg); static void SharedFileSetPath(char *path, SharedFileSet *fileset, Oid tablespace); static void SharedFilePath(char *path, SharedFileSet *fileset, const char *name); static Oid ChooseTablespace(const SharedFileSet *fileset, const char *name); /* - * Initialize a space for temporary files that can be opened for read-only - * access by other backends. Other backends must attach to it before - * accessing it. Associate this SharedFileSet with 'seg'. Any contained - * files will be deleted when the last backend detaches. + * Initialize a space for temporary files that can be opened by other backends. + * Other backends must attach to it before accessing it. Associate this + * SharedFileSet with 'seg'. Any contained files will be deleted when the + * last backend detaches. + * + * We can also use this interface if the temporary files are used only by + * single backend but the files need to be opened and closed multiple times + * and also the underlying files need to survive across transactions. For + * such cases, dsm segment 'seg' should be passed as NULL. We remove such + * files on proc exit. * * Files will be distributed over the tablespaces configured in * temp_tablespaces. * * Under the covers the set is one or more directories which will eventually - * be deleted when there are no backends attached. + * be deleted. */ void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg) @@ -84,7 +98,25 @@ SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg) } /* Register our cleanup callback. */ - on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); + if (seg) + on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); + else + { + static bool registered_cleanup = false; + + if (!registered_cleanup) + { + /* + * We must not have registered any fileset before registering the + * fileset clean up. + */ + Assert(filesetlist == NIL); + on_proc_exit(SharedFileSetDeleteOnProcExit, 0); + registered_cleanup = true; + } + + filesetlist = lcons((void *) fileset, filesetlist); + } } /* @@ -147,13 +179,13 @@ SharedFileSetCreate(SharedFileSet *fileset, const char *name) * another backend. */ File -SharedFileSetOpen(SharedFileSet *fileset, const char *name) +SharedFileSetOpen(SharedFileSet *fileset, const char *name, int mode) { char path[MAXPGPATH]; File file; SharedFilePath(path, fileset, name); - file = PathNameOpenTemporaryFile(path); + file = PathNameOpenTemporaryFile(path, mode); return file; } @@ -223,6 +255,61 @@ SharedFileSetOnDetach(dsm_segment *segment, Datum datum) } /* + * Callback function that will be invoked on the process exit. This will + * process the list of all the registered sharedfilesets and delete the + * underlying files. + */ +static void +SharedFileSetDeleteOnProcExit(int status, Datum arg) +{ + ListCell *l; + + /* Loop over all the pending shared fileset entry */ + foreach (l, filesetlist) + { + SharedFileSet *fileset = (SharedFileSet *) lfirst(l); + SharedFileSetDeleteAll(fileset); + } + + filesetlist = NIL; +} + +/* + * Unregister the shared fileset entry registered for cleanup on proc exit. + */ +void +SharedFileSetUnregister(SharedFileSet *input_fileset) +{ + bool found = false; + ListCell *l; + + /* + * If the caller is following the dsm based cleanup then we don't + * maintain the filesetlist so return. + */ + if (filesetlist == NIL) + return; + + foreach (l, filesetlist) + { + SharedFileSet *fileset = (SharedFileSet *) lfirst(l); + + /* Remove the entry from the list */ + if (input_fileset == fileset) + { + filesetlist = list_delete_cell(filesetlist, l); + found = true; + break; + } + } + + Assert(found); + + /* Delete all files in the set */ + SharedFileSetDeleteAll(input_fileset); +} + +/* * Build the path for the directory holding the files backing a SharedFileSet * in a given tablespace. */ diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index 5517e59..788815c 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -78,6 +78,8 @@ #include "postgres.h" +#include + #include "storage/buffile.h" #include "utils/builtins.h" #include "utils/logtape.h" @@ -551,7 +553,7 @@ ltsConcatWorkerTapes(LogicalTapeSet *lts, TapeShare *shared, lt = <s->tapes[i]; pg_itoa(i, filename); - file = BufFileOpenShared(fileset, filename); + file = BufFileOpenShared(fileset, filename, O_RDONLY); filesize = BufFileSize(file); /* diff --git a/src/backend/utils/sort/sharedtuplestore.c b/src/backend/utils/sort/sharedtuplestore.c index 6537a43..b83fb50 100644 --- a/src/backend/utils/sort/sharedtuplestore.c +++ b/src/backend/utils/sort/sharedtuplestore.c @@ -559,7 +559,7 @@ sts_parallel_scan_next(SharedTuplestoreAccessor *accessor, void *meta_data) sts_filename(name, accessor, accessor->read_participant); accessor->read_file = - BufFileOpenShared(accessor->fileset, name); + BufFileOpenShared(accessor->fileset, name, O_RDONLY); } /* Seek and load the chunk header. */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 1387201..807a9c1 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -916,6 +916,7 @@ typedef enum WAIT_EVENT_BASEBACKUP_READ = PG_WAIT_IO, WAIT_EVENT_BUFFILE_READ, WAIT_EVENT_BUFFILE_WRITE, + WAIT_EVENT_BUFFILE_TRUNCATE, WAIT_EVENT_CONTROL_FILE_READ, WAIT_EVENT_CONTROL_FILE_SYNC, WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE, diff --git a/src/include/storage/buffile.h b/src/include/storage/buffile.h index f4752ba..fc34c49 100644 --- a/src/include/storage/buffile.h +++ b/src/include/storage/buffile.h @@ -48,7 +48,9 @@ extern long BufFileAppend(BufFile *target, BufFile *source); extern BufFile *BufFileCreateShared(SharedFileSet *fileset, const char *name); extern void BufFileExportShared(BufFile *file); -extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name); +extern BufFile *BufFileOpenShared(SharedFileSet *fileset, const char *name, + int mode); extern void BufFileDeleteShared(SharedFileSet *fileset, const char *name); +extern void BufFileTruncateShared(BufFile *file, int fileno, off_t offset); #endif /* BUFFILE_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 8cd125d..e209f04 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -94,7 +94,7 @@ extern mode_t FileGetRawMode(File file); /* Operations used for sharing named temporary files */ extern File PathNameCreateTemporaryFile(const char *name, bool error_on_failure); -extern File PathNameOpenTemporaryFile(const char *name); +extern File PathNameOpenTemporaryFile(const char *path, int mode); extern bool PathNameDeleteTemporaryFile(const char *name, bool error_on_failure); extern void PathNameCreateTemporaryDir(const char *base, const char *name); extern void PathNameDeleteTemporaryDir(const char *name); diff --git a/src/include/storage/sharedfileset.h b/src/include/storage/sharedfileset.h index 2d6cf07..d5edb60 100644 --- a/src/include/storage/sharedfileset.h +++ b/src/include/storage/sharedfileset.h @@ -37,9 +37,11 @@ typedef struct SharedFileSet extern void SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg); extern void SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg); extern File SharedFileSetCreate(SharedFileSet *fileset, const char *name); -extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name); +extern File SharedFileSetOpen(SharedFileSet *fileset, const char *name, + int mode); extern bool SharedFileSetDelete(SharedFileSet *fileset, const char *name, bool error_on_failure); extern void SharedFileSetDeleteAll(SharedFileSet *fileset); +extern void SharedFileSetUnregister(SharedFileSet *input_fileset); #endif -- 1.8.3.1