Thread: Display Pg buffer cache (WIP)
I found it useful to be able to look at what relations are occupying the cache (mainly for debugging and comparing with page estimates). I am wondering if it might be a useful addition generally? How it works a SRF called dump_cache() is created, which is then exposed as system view pg_dump_cache. So stuff like the following can be performed (cache of 2000 immediately after 'make installcheck'): regression=# SELECT c.relname, count(*) AS buffers regression-# FROM pg_class c, pg_dump_cache d regression-# WHERE d.relfilenode = c.relfilenode regression-# GROUP BY c.relname regression-# ORDER BY 2 DESC LIMIT 10; relname | buffers ---------------------------------+--------- tenk1 | 345 tenk2 | 345 onek | 138 pg_attribute | 102 road | 81 pg_attribute_relid_attnam_index | 74 onek2 | 69 pg_proc | 59 pg_proc_proname_args_nsp_index | 56 hash_f8_heap | 49 (10 rows) regression=# As of now the patch breaks 1 regression test (rules - that new view...) and does not peek into the contents of the buffers themselves (I was mainly interested in which relations were there), and does not worry too much about a consistent view of the buffer contents (as I didn't want it to block all other activity!). If it seems like a worthwhile addition I will amend the regression expected and resubmit. Any comments? Mark diff -Naur pgsql.orig/src/backend/catalog/system_views.sql pgsql/src/backend/catalog/system_views.sql --- pgsql.orig/src/backend/catalog/system_views.sql Thu Mar 3 11:29:55 2005 +++ pgsql/src/backend/catalog/system_views.sql Thu Mar 3 11:41:24 2005 @@ -277,3 +277,8 @@ DO INSTEAD NOTHING; GRANT SELECT, UPDATE ON pg_settings TO PUBLIC; + +CREATE VIEW pg_dump_cache AS + SELECT D.* FROM pg_dump_cache() AS D + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid); + diff -Naur pgsql.orig/src/backend/utils/adt/Makefile pgsql/src/backend/utils/adt/Makefile --- pgsql.orig/src/backend/utils/adt/Makefile Thu Mar 3 11:29:53 2005 +++ pgsql/src/backend/utils/adt/Makefile Thu Mar 3 11:32:10 2005 @@ -24,7 +24,7 @@ tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ - ascii.o quote.o pgstatfuncs.o encode.o + ascii.o quote.o pgstatfuncs.o encode.o dumpcache.o like.o: like.c like_match.c diff -Naur pgsql.orig/src/backend/utils/adt/dumpcache.c pgsql/src/backend/utils/adt/dumpcache.c --- pgsql.orig/src/backend/utils/adt/dumpcache.c Thu Jan 1 12:00:00 1970 +++ pgsql/src/backend/utils/adt/dumpcache.c Thu Mar 3 11:51:53 2005 @@ -0,0 +1,119 @@ +/*------------------------------------------------------------------------- + * + * dumpcache.c + * display some contents for the buffer cache + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "funcapi.h" +#include "catalog/pg_type.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "utils/relcache.h" + + +extern Datum dump_cache(PG_FUNCTION_ARGS); + + +/* + * Function context for data persisting over repeated calls. + */ +typedef struct { + int buffer; + AttInMetadata *attinmeta; + BufferDesc *bufhdr; + RelFileNode rnode; + char *values[3]; +} dumpcache_fctx; + + +/* + * Return a tuple that has bufferid, relfilenoide, reltablespace and + * reldatabase OIDs. + */ +Datum +dump_cache(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + MemoryContext oldcontext; + dumpcache_fctx *fctx; /* User function context. */ + TupleDesc tupledesc; + HeapTuple tuple; + + if (SRF_IS_FIRSTCALL()) + { + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* construct a tuple to return */ + tupledesc = CreateTemplateTupleDesc(4, false); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", + OIDOID, -1, 0); + + /* Generate attribute metadata needed later to produce tuples */ + funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc); + + /* + * Create a function context for cross-call persistence + * and initialize the buffer counters. + */ + fctx = (dumpcache_fctx *) palloc(sizeof(dumpcache_fctx)); + + fctx->buffer = 0; + fctx->bufhdr = BufferDescriptors; + + funcctx->user_fctx = fctx; + funcctx->max_calls = NBuffers; + + /* allocate the strings for tuple formation */ + fctx->values[0] = (char *) palloc(NAMEDATALEN + 1); + fctx->values[1] = (char *) palloc(NAMEDATALEN + 1); + fctx->values[2] = (char *) palloc(NAMEDATALEN + 1); + fctx->values[3] = (char *) palloc(NAMEDATALEN + 1); + + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + } + /* <user defined code> */ + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state, and set up the result */ + fctx = funcctx->user_fctx; + + + if (funcctx->call_cntr < funcctx->max_calls) + { + /* Get the relation node */ + fctx->rnode = (fctx->bufhdr)->tag.rnode; + + /* setup cstrings and create a tuple from them */ + sprintf(fctx->values[0], "%u", fctx->buffer); + sprintf(fctx->values[1], "%u", fctx->rnode.relNode); + sprintf(fctx->values[2], "%u", fctx->rnode.spcNode); + sprintf(fctx->values[3], "%u", fctx->rnode.dbNode); + + tuple = BuildTupleFromCStrings(funcctx->attinmeta, fctx->values); + result = HeapTupleGetDatum(tuple); + + + /* increment the buffer count and the buffer it points to */ + fctx->buffer++; + fctx->bufhdr++; + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); +} + diff -Naur pgsql.orig/src/include/catalog/pg_proc.h pgsql/src/include/catalog/pg_proc.h --- pgsql.orig/src/include/catalog/pg_proc.h Thu Mar 3 11:30:02 2005 +++ pgsql/src/include/catalog/pg_proc.h Thu Mar 3 11:39:27 2005 @@ -3615,6 +3615,8 @@ DATA(insert OID = 2558 ( int4 PGNSP PGUID 12 f f t f i 1 23 "16" _null_ bool_int4 - _null_ )); DESCR("convert boolean to int4"); +/* dump cache */ +DATA(insert OID = 2510 ( pg_dump_cache PGNSP PGUID 12 f f t t v 0 2249 "" _null_ dump_cache - _null_ )); /* * Symbolic values for provolatile column: these indicate whether the result
Mark Kirkwood wrote: > does not worry too much about a consistent view of the buffer > contents (as I didn't want it to block all other activity!). I don't like accessing shared data structures without acquiring the appropriate locks; even if it doesn't break anything, it seems like just asking for trouble. In order to be able to inspect a buffer's tag in Tom's new locking scheme (not yet in HEAD, but will be in 8.1), you need only hold a per-buffer lock. That will mean acquiring and releasing a spinlock for each buffer, which isn't _too_ bad... That means the data reported by the function might still be inconsistent; not sure how big a problem that is. It might be nice for the patch to indicate whether the buffers are dirty, and what their shared reference count is. > +extern Datum dump_cache(PG_FUNCTION_ARGS); This declaration belongs in a header file (such as include/utils/builtins.h). > +typedef struct { > + int buffer; > + AttInMetadata *attinmeta; > + BufferDesc *bufhdr; > + RelFileNode rnode; > + char *values[3]; > +} dumpcache_fctx; This should be values[4], no? This is trivial, but I think most type names use camel case (NamesLikeThis). Why does `rnode' need to be in the struct? You can also fetch the buffer number from the buffer desc, so you needn't store another copy of it. > + /* allocate the strings for tuple formation */ > + fctx->values[0] = (char *) palloc(NAMEDATALEN + 1); > + fctx->values[1] = (char *) palloc(NAMEDATALEN + 1); > + fctx->values[2] = (char *) palloc(NAMEDATALEN + 1); > + fctx->values[3] = (char *) palloc(NAMEDATALEN + 1); Is there a reason for choosing NAMEDATALEN + 1 as the size of these buffers? (There is no relation between NAMEDATALEN and the number of characters an OID will consume when converted via sprintf("%u") ) The patch doesn't treat unassigned buffers specially (i.e. those buffers whose tag contains of InvalidOids). Perhaps it should return NULL for their OID fields, rather than InvalidOid (which will be 0)? (An alternative would be to not include those rows in the result set, although perhaps administrators might want this information.) -Neil
Neil Conway wrote: > I don't like accessing shared data structures without acquiring the > appropriate locks; even if it doesn't break anything, it seems like just > asking for trouble. In order to be able to inspect a buffer's tag in > Tom's new locking scheme (not yet in HEAD, but will be in 8.1), you need > only hold a per-buffer lock. That will mean acquiring and releasing a > spinlock for each buffer, which isn't _too_ bad... > > That means the data reported by the function might still be > inconsistent; not sure how big a problem that is. > > It might be nice for the patch to indicate whether the buffers are > dirty, and what their shared reference count is. > Yeah - sounds good, will do. >> +extern Datum dump_cache(PG_FUNCTION_ARGS); > > Yep. > This declaration belongs in a header file (such as > include/utils/builtins.h). > >> +typedef struct { >> + int buffer; >> + AttInMetadata *attinmeta; >> + BufferDesc *bufhdr; >> + RelFileNode rnode; >> + char *values[3]; >> +} dumpcache_fctx; > > > This should be values[4], no? > Arrg... surprised there wasn't crashes everywhere.... > This is trivial, but I think most type names use camel case > (NamesLikeThis). > > Why does `rnode' need to be in the struct? You can also fetch the buffer > number from the buffer desc, so you needn't store another copy of it. > I thought it made readability a bit better, but yeah, could take it out. >> + /* allocate the strings for tuple formation */ >> + fctx->values[0] = (char *) palloc(NAMEDATALEN + 1); >> + fctx->values[1] = (char *) palloc(NAMEDATALEN + 1); >> + fctx->values[2] = (char *) palloc(NAMEDATALEN + 1); >> + fctx->values[3] = (char *) palloc(NAMEDATALEN + 1); > > > Is there a reason for choosing NAMEDATALEN + 1 as the size of these > buffers? (There is no relation between NAMEDATALEN and the number of > characters an OID will consume when converted via sprintf("%u") ) Hmmm... good spot, originally I was returning TEXTOID and relation names etc...and then it was "big enough" after converting to oid during testing, so err, yes - I will change that (as well) ! > > The patch doesn't treat unassigned buffers specially (i.e. those buffers > whose tag contains of InvalidOids). Perhaps it should return NULL for > their OID fields, rather than InvalidOid (which will be 0)? (An > alternative would be to not include those rows in the result set, > although perhaps administrators might want this information.) > I thought it might be handy to explicitly see unused (or belonging to another db) buffers. Clearly joining to pg_class will project 'em out! Finally, thanks for the excellent feedback (fast too) regards Mark
Neil Conway wrote: > > I don't like accessing shared data structures without acquiring the > appropriate locks; even if it doesn't break anything, it seems like just > asking for trouble. In order to be able to inspect a buffer's tag in > Tom's new locking scheme (not yet in HEAD, but will be in 8.1), you need > only hold a per-buffer lock. That will mean acquiring and releasing a > spinlock for each buffer, which isn't _too_ bad... > > I am looking at this bit now. I take the first part to mean that don't need to wrap LWLockAcquire(BufMgrLock, LW_EXCLUSIVE) ... LWLockRelease(BufMgrLock) around the inspection of the buffers (?) This per buffer lock, are we talking about LockBuffer(buf, BUFFER_LOCK_SHARE) ... releaseBuffer(buf) ? If so, it looks like I need to do some stuff with ResourceOwners, otherwise ReleaseBuffer will fail (or am I wandering up the wrong track here?). I am using anoncvs from yesterday, so if Tom's new scheme is *very* new I may be missing it. Thanks Mark
Mark Kirkwood wrote: > If so, it looks like I need to do some stuff with ResourceOwners, > otherwise ReleaseBuffer will fail (or am I wandering up the wrong track > here?). I am using anoncvs from yesterday, so if Tom's new scheme is > *very* new I may be missing it. It's so new, in fact, it's not in CVS yet :) I believe the latest revision of the patch is here: http://archives.postgresql.org/pgsql-patches/2005-02/msg00115.php The locking scheme is described here: http://archives.postgresql.org/pgsql-hackers/2005-02/msg00391.php Holding the per-buffer header spinlock should be enough to ensure the tag doesn't change. To get a globally consistent snapshot of the state of the bufmgr, I believe it should be sufficient to also share-lock the BufMappingLock for the duration of the scan. The latter will prevent new pages from being loaded in the buffer pool, so I'm not sure if it's worth doing. If you do decide to hold the BufMappingLock, it might make sense to: 1. allocate an array of NBuffers elements 2. acquire BufferMappingLock in share mode 3. sequentially scan through the buffer pool, copying data into the array 4. release the lock 5. on each subsequent call to the SRF, format and return an element of the array Which should reduce the time to lock is held. This will require allocating NBuffers * size_of_stats memory (where size_of_stats will be something like 16 bytes). -Neil
Mark Kirkwood <markir@coretech.co.nz> writes: > I am using anoncvs from yesterday, so if Tom's new scheme is > *very* new I may be missing it. It's not committed yet ;-) regards, tom lane
Tom Lane wrote: > Mark Kirkwood <markir@coretech.co.nz> writes: > >>I am using anoncvs from yesterday, so if Tom's new scheme is >>*very* new I may be missing it. > > > It's not committed yet ;-) > Yep - that's pretty new, apologies for slow grey matter... I have been following the discussion about the new buffer design, but didn't think to connect it to this context! cheers Mark
Neil Conway wrote: > If you do decide to hold the BufMappingLock, it might make > sense to: > > 1. allocate an array of NBuffers elements > 2. acquire BufferMappingLock in share mode > 3. sequentially scan through the buffer pool, copying data into the array > 4. release the lock > 5. on each subsequent call to the SRF, format and return an element of > the array > > Which should reduce the time to lock is held. This will require > allocating NBuffers * size_of_stats memory (where size_of_stats will be > something like 16 bytes). > That is a better approach, so I've used it in this new iteration. In addition to holding the BufMappingLock, each buffer header is (spin) locked before examining it, hopefully this is correct - BTW, I like the new buffer lock design. I'm still using BuildTupleFromCStrings, so there is considerable use of sprintf conversion and "temporary" char * stuff. I would like this to be a bit cleaner, so any suggestions welcome. regards Mark diff -Naur pgsql.orig/src/backend/catalog/system_views.sql pgsql/src/backend/catalog/system_views.sql --- pgsql.orig/src/backend/catalog/system_views.sql Fri Mar 4 14:23:09 2005 +++ pgsql/src/backend/catalog/system_views.sql Fri Mar 4 14:21:33 2005 @@ -277,3 +277,9 @@ DO INSTEAD NOTHING; GRANT SELECT, UPDATE ON pg_settings TO PUBLIC; + +CREATE VIEW pg_cache_dump AS + SELECT D.* FROM pg_cache_dump() AS D + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + isdirty bool, refcount int4); + diff -Naur pgsql.orig/src/backend/utils/adt/cachedump.c pgsql/src/backend/utils/adt/cachedump.c --- pgsql.orig/src/backend/utils/adt/cachedump.c Thu Jan 1 12:00:00 1970 +++ pgsql/src/backend/utils/adt/cachedump.c Sat Mar 5 20:21:45 2005 @@ -0,0 +1,221 @@ +/*------------------------------------------------------------------------- + * + * cachedump.c + * display some contents of the buffer cache + * + * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL$ + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "funcapi.h" +#include "catalog/pg_type.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "utils/relcache.h" +#include "utils/builtins.h" + + +#define NUM_CACHE_DUMP_ELEM 6 + +/* + * Record structure holding the to be exposed cache data. + */ +typedef struct +{ + uint32 bufferid; + Oid relfilenode; + Oid reltablespace; + Oid reldatabase; + bool isdirty; + uint32 refcount; + BlockNumber blocknum; + +} CacheDumpRec; + + +/* + * Function context for data persisting over repeated calls. + */ +typedef struct +{ + AttInMetadata *attinmeta; + CacheDumpRec *record; + char *values[NUM_CACHE_DUMP_ELEM]; +} CacheDumpContext; + + +/* + * Function returning data from the shared buffer cache - buffer number, + * relation node/tablespace/database, dirty indicator and refcount. + */ +Datum +cache_dump(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + Datum result; + MemoryContext oldcontext; + CacheDumpContext *fctx; /* User function context. */ + TupleDesc tupledesc; + HeapTuple tuple; + + if (SRF_IS_FIRSTCALL()) + { + RelFileNode rnode; + uint32 i; + BufferDesc *bufHdr; + + + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* construct a tuple to return */ + tupledesc = CreateTemplateTupleDesc(NUM_CACHE_DUMP_ELEM, false); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "isdirty", + BOOLOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "refcount", + INT4OID, -1, 0); + + /* Generate attribute metadata needed later to produce tuples */ + funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc); + + /* + * Create a function context for cross-call persistence + * and initialize the buffer counters. + */ + fctx = (CacheDumpContext *) palloc(sizeof(CacheDumpContext)); + funcctx->max_calls = NBuffers; + funcctx->user_fctx = fctx; + + + /* Allocate NBuffers worth of CacheDumpRec records. */ + fctx->record = (CacheDumpRec *) palloc(sizeof(CacheDumpRec) * NBuffers); + + /* allocate the strings for tuple formation */ + fctx->values[0] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[1] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[2] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[3] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[4] = (char *) palloc(10); + fctx->values[5] = (char *) palloc(3 * sizeof(uint32) + 1); + + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + + /* + * Lock Buffer map and scan though all the buffers, saving the + * relevant fields in the fctx->record structure. + */ + LWLockAcquire(BufMappingLock, LW_SHARED); + + for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) + { + /* Lock each buffer header before inspecting. */ + LockBufHdr(bufHdr); + + rnode = bufHdr->tag.rnode; + + fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); + fctx->record[i].relfilenode = rnode.relNode; + fctx->record[i].reltablespace = rnode.spcNode; + fctx->record[i].reldatabase = rnode.dbNode; + fctx->record[i].refcount = bufHdr->refcount; + fctx->record[i].blocknum = bufHdr->tag.blockNum; + if ( bufHdr->flags & BM_DIRTY) + { + fctx->record[i].isdirty = true; + } + else + { + fctx->record[i].isdirty = false; + } + + UnlockBufHdr(bufHdr); + + } + + /* Release Buffer map. */ + LWLockRelease(BufMappingLock); + } + + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state */ + fctx = funcctx->user_fctx; + + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + char *values[NUM_CACHE_DUMP_ELEM]; + int j; + + /* + * Use a temporary values array, initially pointing to + * fctx->values, so it can be reassigned w/o losing the storage + * for subsequent calls. + */ + for (j = 0; j < NUM_CACHE_DUMP_ELEM; j++) + { + values[j] = fctx->values[j]; + } + + + if (fctx->record[i].blocknum == InvalidBlockNumber) + { + + sprintf(values[0], "%u", fctx->record[i].bufferid); + values[1] = NULL; + values[2] = NULL; + values[3] = NULL; + values[4] = NULL; + values[5] = NULL; + + } + else + { + + sprintf(values[0], "%u", fctx->record[i].bufferid); + sprintf(values[1], "%u", fctx->record[i].relfilenode); + sprintf(values[2], "%u", fctx->record[i].reltablespace); + sprintf(values[3], "%u", fctx->record[i].reldatabase); + if (fctx->record[i].isdirty) + { + strcpy(values[4], "true"); + } + else + { + strcpy(values[4], "false"); + } + sprintf(values[5], "%u", fctx->record[i].refcount); + + } + + + /* Build and return the tuple. */ + tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); + result = HeapTupleGetDatum(tuple); + + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); +} + diff -Naur pgsql.orig/src/include/catalog/pg_proc.h pgsql/src/include/catalog/pg_proc.h --- pgsql.orig/src/include/catalog/pg_proc.h Fri Mar 4 14:24:20 2005 +++ pgsql/src/include/catalog/pg_proc.h Fri Mar 4 14:21:56 2005 @@ -3615,6 +3615,8 @@ DATA(insert OID = 2558 ( int4 PGNSP PGUID 12 f f t f i 1 23 "16" _null_ bool_int4 - _null_ )); DESCR("convert boolean to int4"); +/* cache dump */ +DATA(insert OID = 2510 ( pg_cache_dump PGNSP PGUID 12 f f t t v 0 2249 "" _null_ cache_dump - _null_ )); /* * Symbolic values for provolatile column: these indicate whether the result diff -Naur pgsql.orig/src/include/utils/builtins.h pgsql/src/include/utils/builtins.h --- pgsql.orig/src/include/utils/builtins.h Fri Mar 4 14:24:31 2005 +++ pgsql/src/include/utils/builtins.h Fri Mar 4 14:22:08 2005 @@ -823,4 +823,7 @@ /* catalog/pg_conversion.c */ extern Datum pg_convert_using(PG_FUNCTION_ARGS); +/* cache dump */ +extern Datum cache_dump(PG_FUNCTION_ARGS); + #endif /* BUILTINS_H */
Mark Kirkwood <markir@coretech.co.nz> writes: > In addition to holding the BufMappingLock, each buffer header is (spin) > locked before examining it, hopefully this is correct - BTW, I like the > new buffer lock design. It'd be possible to dispense with the per-buffer spinlocks so long as you look only at the tag (and perhaps the TAG_VALID flag bit). The tags can't be changing while you hold the BufMappingLock. I'm dubious that there's any point in recording information as transient as the refcounts and dirtybits, and reducing the amount of time you hold the lock would be a Good Thing. Also, you're still not dealing with the case of a not-valid buffer in a reasonable way. regards, tom lane
Tom Lane wrote: > It'd be possible to dispense with the per-buffer spinlocks so long as > you look only at the tag (and perhaps the TAG_VALID flag bit). The > tags can't be changing while you hold the BufMappingLock. That's what I had thought at first, but this comment in buf_internals.h dissuaded me: "buf_hdr_lock must be held to examine or change the tag, flags, usage_count, refcount, or wait_backend_id fields." The comment already notes this isn't true if you've got the buffer pinned; it would be worth adding another exception for holding the BufMappingLock, IMHO. > I'm dubious that there's any point in recording information as > transient as the refcounts and dirtybits I think it's worth recording dirty bits -- it provides an indication of the effectiveness of the bgwriter, for example. Reference counts could be done away with, although I doubt it would have a significant effect on the time spent holding the lock. -Neil
Only two things to add: you forgot to add `cachedump.o' to the list of OBJS in the utils/adt Makefile. Mark Kirkwood wrote: > +typedef struct > +{ > + uint32 bufferid; > + Oid relfilenode; > + Oid reltablespace; > + Oid reldatabase; > + bool isdirty; > + uint32 refcount; > + BlockNumber blocknum; > + > +} CacheDumpRec; You should probably make `isdirty' the last member of this struct, so as to reduce alignment/padding requirements (this won't actually save any space right now, but it might save some space if more members are added to the struct in the future). -Neil
Neil Conway wrote: > Tom Lane wrote: > >> It'd be possible to dispense with the per-buffer spinlocks so long as >> you look only at the tag (and perhaps the TAG_VALID flag bit). The >> tags can't be changing while you hold the BufMappingLock. > > > That's what I had thought at first, but this comment in buf_internals.h > dissuaded me: "buf_hdr_lock must be held to examine or change the tag, > flags, usage_count, refcount, or wait_backend_id fields." The comment > already notes this isn't true if you've got the buffer pinned; it would > be worth adding another exception for holding the BufMappingLock, IMHO. > >> I'm dubious that there's any point in recording information as >> transient as the refcounts and dirtybits > > > I think it's worth recording dirty bits -- it provides an indication of > the effectiveness of the bgwriter, for example. Reference counts could > be done away with, although I doubt it would have a significant effect > on the time spent holding the lock. > > Let's suppose refcount is eliminated. I will then be examining the tag, flags and buf_id elements of the buffer. Holding the BufMappingLock prevents the tag changing, but what about the flags? In addition Tom pointed out that I am not examining the BM_TAG_VALID or BM_VALID flag bits (I am only checking if tag.blockNum equals InvalidBlockNumber). My initial thought is to handle !BM_TAG_VALID or !BM_VALID similarly to InvalidBlockNumber i.e all non buf_id fields set to NULL. Mark
The latest iteration. I have added documentation and updated the expected output so that the regression tests pass. In addition, after looking at the various system view names, I decided that 'pg_cache_dump' does not fit in nicely - so chose an more Pg suitable name of 'pg_buffercache'. Some renaming of the backend functions happened too. Finally, since I was saving blocknum, it went into the view as well. Hopefully I am dealing with invalid buffer tags sensibly now. The per-buffer spin lock is still being held - altho it is obviously trivial to remove if not actually required. regards Mark P.s : remembered to use diff -c Mark Kirkwood wrote: > Neil Conway wrote: > >> Tom Lane wrote: >> >>> It'd be possible to dispense with the per-buffer spinlocks so long as >>> you look only at the tag (and perhaps the TAG_VALID flag bit). The >>> tags can't be changing while you hold the BufMappingLock. >> >> >> >> That's what I had thought at first, but this comment in >> buf_internals.h dissuaded me: "buf_hdr_lock must be held to examine or >> change the tag, flags, usage_count, refcount, or wait_backend_id >> fields." The comment already notes this isn't true if you've got the >> buffer pinned; it would be worth adding another exception for holding >> the BufMappingLock, IMHO. >> >>> I'm dubious that there's any point in recording information as >>> transient as the refcounts and dirtybits >> >> >> >> I think it's worth recording dirty bits -- it provides an indication >> of the effectiveness of the bgwriter, for example. Reference counts >> could be done away with, although I doubt it would have a significant >> effect on the time spent holding the lock. >> >> > Let's suppose refcount is eliminated. I will then be examining the tag, > flags and buf_id elements of the buffer. Holding the BufMappingLock > prevents the tag changing, but what about the flags? > > In addition Tom pointed out that I am not examining the BM_TAG_VALID or > BM_VALID flag bits (I am only checking if tag.blockNum equals > InvalidBlockNumber). My initial thought is to handle !BM_TAG_VALID or > !BM_VALID similarly to InvalidBlockNumber i.e all non buf_id fields set > to NULL. > > Mark > > > > ---------------------------(end of broadcast)--------------------------- > TIP 2: you can get off all lists at once with the unregister command > (send "unregister YourEmailAddressHere" to majordomo@postgresql.org) diff -Nacr pgsql.orig/doc/src/sgml/catalogs.sgml pgsql/doc/src/sgml/catalogs.sgml *** pgsql.orig/doc/src/sgml/catalogs.sgml Mon Mar 7 12:20:17 2005 --- pgsql/doc/src/sgml/catalogs.sgml Tue Mar 8 12:03:50 2005 *************** *** 3875,3880 **** --- 3875,3885 ---- <tbody> <row> + <entry><link linkend="view-pg-buffercache"><structname>pg_buffercache</structname></link></entry> + <entry>shared buffer cache</entry> + </row> + + <row> <entry><link linkend="view-pg-indexes"><structname>pg_indexes</structname></link></entry> <entry>indexes</entry> </row> *************** *** 3917,3922 **** --- 3922,4021 ---- </tbody> </tgroup> </table> + </sect1> + + <sect1 id="view-pg-buffercache"> + <title><structname>pg_buffercache</structname></title> + + <indexterm zone="view-pg-buffercache"> + <primary>pg_buffercache</primary> + </indexterm> + <para> + The view <structname>pg_buffercache</structname> provides access to + some information from the shared buffer cache. + </para> + + <para> + There is one row for each buffer in the shared cache. Unused buffers are + shown with all fields null except <structfield>bufferid</structfield>. + Because the cache is shared by all the databases, there are pages from + relations not belonging to the current database. + </para> + + + <table> + <title><structname>pg_buffercache</structname> Columns</title> + + <tgroup cols=4> + <thead> + <row> + <entry>Name</entry> + <entry>Type</entry> + <entry>References</entry> + <entry>Description</entry> + </row> + </thead> + <tbody> + <row> + <entry>bufferid</entry> + <entry><type>integer</type></entry> + <entry></entry> + <entry> + The buffer number. This is numbered 1 to <varname>shared_buffers</varname>. + </entry> + </row> + <row> + <entry>relfilenode</entry> + <entry><type>oid</type></entry> + <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.relfilenode</literal></entry> + <entry> + The on-disk file for the relation that this page came from. + </entry> + </row> + <row> + <entry>reltablespace</entry> + <entry><type>oid</type></entry> + <entry> + <literal><link linkend="catalog-pg-tablespace"><structname>pg_tablespace</structname></link>.oid</literal> + </entry> + <entry>Tablespace the corresponding relation is in.</entry> + </row> + <row> + <entry>reldatabase</entry> + <entry><type>oid</type></entry> + <entry><literal><link linkend="catalog-pg-database"><structname>pg_database</structname></link>.oid</literal></entry> + <entry> + Database the corresponding relation belongs to, or zero if the + relation is a globally-shared table</entry> + </row> + <row> + <entry>relblocknumber</entry> + <entry><type>numeric</type></entry> + <entry></entry> + <entry> + The page offset within the relation that this buffer page is for. + </entry> + </row> + <row> + <entry>isdirty</entry> + <entry><type>bool</type></entry> + <entry></entry> + <entry>True if the buffer is dirty.</entry> + </row> + </tbody> + </tgroup> + + </table> + + <para> + When the <structname>pg_buffercache</structname> view is accessed, + internal buffer manager locks are taken, and a copy of the buffer cache data + is made for the view to display. This ensures that the view produces a + consistent set of results, while not blocking normal buffer activity longer + than necessary. Nonetheless there could be some impact on database + performance if this view is read often. + </para> + </sect1> <sect1 id="view-pg-indexes"> diff -Nacr pgsql.orig/src/backend/catalog/system_views.sql pgsql/src/backend/catalog/system_views.sql *** pgsql.orig/src/backend/catalog/system_views.sql Fri Mar 4 14:23:09 2005 --- pgsql/src/backend/catalog/system_views.sql Tue Mar 8 11:45:39 2005 *************** *** 277,279 **** --- 277,285 ---- DO INSTEAD NOTHING; GRANT SELECT, UPDATE ON pg_settings TO PUBLIC; + + CREATE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages() AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relblocknumber numeric(10), isdirty bool); + diff -Nacr pgsql.orig/src/backend/utils/adt/Makefile pgsql/src/backend/utils/adt/Makefile *** pgsql.orig/src/backend/utils/adt/Makefile Mon Mar 7 10:16:24 2005 --- pgsql/src/backend/utils/adt/Makefile Tue Mar 8 11:44:10 2005 *************** *** 24,30 **** tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ ! ascii.o quote.o pgstatfuncs.o encode.o like.o: like.c like_match.c --- 24,30 ---- tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \ ! ascii.o quote.o pgstatfuncs.o encode.o buffercache.o like.o: like.c like_match.c diff -Nacr pgsql.orig/src/backend/utils/adt/buffercache.c pgsql/src/backend/utils/adt/buffercache.c *** pgsql.orig/src/backend/utils/adt/buffercache.c Thu Jan 1 12:00:00 1970 --- pgsql/src/backend/utils/adt/buffercache.c Tue Mar 8 11:48:56 2005 *************** *** 0 **** --- 1,237 ---- + /*------------------------------------------------------------------------- + * + * buffercache.c + * display some contents of the buffer cache + * + * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * $PostgreSQL$ + *------------------------------------------------------------------------- + */ + #include "postgres.h" + #include "funcapi.h" + #include "catalog/pg_type.h" + #include "storage/buf_internals.h" + #include "storage/bufmgr.h" + #include "utils/relcache.h" + #include "utils/builtins.h" + + + #define NUM_BUFFERCACHE_PAGES_ELEM 6 + + + /* + * Record structure holding the to be exposed cache data. + */ + typedef struct + { + uint32 bufferid; + Oid relfilenode; + Oid reltablespace; + Oid reldatabase; + BlockNumber blocknum; + bool isvalid; + bool isdirty; + + } BufferCachePagesRec; + + + /* + * Function context for data persisting over repeated calls. + */ + typedef struct + { + AttInMetadata *attinmeta; + BufferCachePagesRec *record; + char *values[NUM_BUFFERCACHE_PAGES_ELEM]; + } BufferCachePagesContext; + + + /* + * Function returning data from the shared buffer cache - buffer number, + * relation node/tablespace/database/blocknum and dirty indicator. + */ + Datum + pg_buffercache_pages(PG_FUNCTION_ARGS) + { + FuncCallContext *funcctx; + Datum result; + MemoryContext oldcontext; + BufferCachePagesContext *fctx; /* User function context. */ + TupleDesc tupledesc; + HeapTuple tuple; + + if (SRF_IS_FIRSTCALL()) + { + RelFileNode rnode; + uint32 i; + BufferDesc *bufHdr; + + + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* construct a tuple to return */ + tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber", + NUMERICOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "isdirty", + BOOLOID, -1, 0); + + /* Generate attribute metadata needed later to produce tuples */ + funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc); + + /* + * Create a function context for cross-call persistence + * and initialize the buffer counters. + */ + fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext)); + funcctx->max_calls = NBuffers; + funcctx->user_fctx = fctx; + + + /* Allocate NBuffers worth of BufferCachePagesRec records. */ + fctx->record = (BufferCachePagesRec *) palloc(sizeof(BufferCachePagesRec) * NBuffers); + + /* allocate the strings for tuple formation */ + fctx->values[0] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[1] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[2] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[3] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[4] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[5] = (char *) palloc(10); + + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + + /* + * Lock Buffer map and scan though all the buffers, saving the + * relevant fields in the fctx->record structure. + */ + LWLockAcquire(BufMappingLock, LW_SHARED); + + for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) + { + /* Lock each buffer header before inspecting. */ + LockBufHdr(bufHdr); + + rnode = bufHdr->tag.rnode; + + fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); + fctx->record[i].relfilenode = rnode.relNode; + fctx->record[i].reltablespace = rnode.spcNode; + fctx->record[i].reldatabase = rnode.dbNode; + fctx->record[i].blocknum = bufHdr->tag.blockNum; + + if ( bufHdr->flags & BM_DIRTY) + { + fctx->record[i].isdirty = true; + } + else + { + fctx->record[i].isdirty = false; + } + + /* Note if the buffer is valid, and has storage created */ + if ( (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID)) + { + fctx->record[i].isvalid = true; + } + else + { + fctx->record[i].isvalid = false; + } + + UnlockBufHdr(bufHdr); + + } + + /* Release Buffer map. */ + LWLockRelease(BufMappingLock); + } + + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state */ + fctx = funcctx->user_fctx; + + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + char *values[NUM_BUFFERCACHE_PAGES_ELEM]; + int j; + + /* + * Use a temporary values array, initially pointing to + * fctx->values, so it can be reassigned w/o losing the storage + * for subsequent calls. + */ + for (j = 0; j < NUM_BUFFERCACHE_PAGES_ELEM; j++) + { + values[j] = fctx->values[j]; + } + + + /* + * Set all fields except the bufferid to null if the buffer is + * unused or not valid. + */ + if (fctx->record[i].blocknum == InvalidBlockNumber || + fctx->record[i].isvalid == false ) + { + + sprintf(values[0], "%u", fctx->record[i].bufferid); + values[1] = NULL; + values[2] = NULL; + values[3] = NULL; + values[4] = NULL; + values[5] = NULL; + + } + else + { + + sprintf(values[0], "%u", fctx->record[i].bufferid); + sprintf(values[1], "%u", fctx->record[i].relfilenode); + sprintf(values[2], "%u", fctx->record[i].reltablespace); + sprintf(values[3], "%u", fctx->record[i].reldatabase); + sprintf(values[4], "%u", fctx->record[i].blocknum); + if (fctx->record[i].isdirty) + { + strcpy(values[5], "true"); + } + else + { + strcpy(values[5], "false"); + } + + } + + + /* Build and return the tuple. */ + tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); + result = HeapTupleGetDatum(tuple); + + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); + } + diff -Nacr pgsql.orig/src/include/catalog/pg_proc.h pgsql/src/include/catalog/pg_proc.h *** pgsql.orig/src/include/catalog/pg_proc.h Fri Mar 4 14:24:20 2005 --- pgsql/src/include/catalog/pg_proc.h Tue Mar 8 11:46:06 2005 *************** *** 3615,3620 **** --- 3615,3622 ---- DATA(insert OID = 2558 ( int4 PGNSP PGUID 12 f f t f i 1 23 "16" _null_ bool_int4 - _null_ )); DESCR("convert boolean to int4"); + /* builtin for cache internals view */ + DATA(insert OID = 2510 ( pg_buffercache_pages PGNSP PGUID 12 f f t t v 0 2249 "" _null_ pg_buffercache_pages - _null_ )); /* * Symbolic values for provolatile column: these indicate whether the result diff -Nacr pgsql.orig/src/include/utils/builtins.h pgsql/src/include/utils/builtins.h *** pgsql.orig/src/include/utils/builtins.h Fri Mar 4 14:24:31 2005 --- pgsql/src/include/utils/builtins.h Tue Mar 8 11:46:26 2005 *************** *** 823,826 **** --- 823,829 ---- /* catalog/pg_conversion.c */ extern Datum pg_convert_using(PG_FUNCTION_ARGS); + /* cache dump */ + extern Datum pg_buffercache_pages(PG_FUNCTION_ARGS); + #endif /* BUILTINS_H */ diff -Nacr pgsql.orig/src/test/regress/expected/rules.out pgsql/src/test/regress/expected/rules.out *** pgsql.orig/src/test/regress/expected/rules.out Tue Mar 8 13:00:00 2005 --- pgsql/src/test/regress/expected/rules.out Tue Mar 8 12:59:24 2005 *************** *** 1275,1280 **** --- 1275,1281 ---- viewname | definition --------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- iexit | SELECT ih.name, ih.thepath, interpt_pp(ih.thepath, r.thepath) AS exit FROM ihighway ih, rampr WHERE (ih.thepath ## r.thepath); + pg_buffercache | SELECT p.bufferid, p.relfilenode, p.reltablespace, p.reldatabase, p.relblocknumber, p.isdirtyFROM pg_buffercache_pages() p(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, relblocknumbernumeric(10,0),isdirty boolean); pg_indexes | SELECT n.nspname AS schemaname, c.relname AS tablename, i.relname AS indexname, t.spcname AS"tablespace", pg_get_indexdef(i.oid) AS indexdef FROM ((((pg_index x JOIN pg_class c ON ((c.oid = x.indrelid))) JOIN pg_classi ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) LEFT JOIN pg_tablespace tON ((t.oid = i.reltablespace))) WHERE ((c.relkind = 'r'::"char") AND (i.relkind = 'i'::"char")); pg_locks | SELECT l.relation, l."database", l."transaction", l.pid, l."mode", l.granted FROM pg_lock_status()l(relation oid, "database" oid, "transaction" xid, pid integer, "mode" text, granted boolean); pg_rules | SELECT n.nspname AS schemaname, c.relname AS tablename, r.rulename, pg_get_ruledef(r.oid) ASdefinition FROM ((pg_rewrite r JOIN pg_class c ON ((c.oid = r.ev_class))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace)))WHERE (r.rulename <> '_RETURN'::name); *************** *** 1314,1320 **** shoelace_obsolete | SELECT shoelace.sl_name, shoelace.sl_avail, shoelace.sl_color, shoelace.sl_len, shoelace.sl_unit,shoelace.sl_len_cm FROM shoelace WHERE (NOT (EXISTS (SELECT shoe.shoename FROM shoe WHERE (shoe.slcolor= shoelace.sl_color)))); street | SELECT r.name, r.thepath, c.cname FROM ONLY road r, real_city c WHERE (c.outline ## r.thepath); toyemp | SELECT emp.name, emp.age, emp."location", (12 * emp.salary) AS annualsal FROM emp; ! (40 rows) SELECT tablename, rulename, definition FROM pg_rules ORDER BY tablename, rulename; --- 1315,1321 ---- shoelace_obsolete | SELECT shoelace.sl_name, shoelace.sl_avail, shoelace.sl_color, shoelace.sl_len, shoelace.sl_unit,shoelace.sl_len_cm FROM shoelace WHERE (NOT (EXISTS (SELECT shoe.shoename FROM shoe WHERE (shoe.slcolor= shoelace.sl_color)))); street | SELECT r.name, r.thepath, c.cname FROM ONLY road r, real_city c WHERE (c.outline ## r.thepath); toyemp | SELECT emp.name, emp.age, emp."location", (12 * emp.salary) AS annualsal FROM emp; ! (41 rows) SELECT tablename, rulename, definition FROM pg_rules ORDER BY tablename, rulename;
Mark Kirkwood wrote: > + tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false); > + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", > + INT4OID, -1, 0); > + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", > + OIDOID, -1, 0); > + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", > + OIDOID, -1, 0); > + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", > + OIDOID, -1, 0); > + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber", > + NUMERICOID, -1, 0); I think this should be an int4, not numeric. Otherwise, looks good to me. Barring any objections, I'll apply this with a few stylistic tweaks and the numeric -> int4 change tomorrow. -Neil
Neil Conway <neilc@samurai.com> writes: > Mark Kirkwood wrote: >> + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber", >> + NUMERICOID, -1, 0); > I think this should be an int4, not numeric. needs spell check too ;-) More generally, BlockNumber is unsigned and so int4 is not at all an accurate conversion. Perhaps OID would be a good choice even though it's horribly wrong on one level. > Otherwise, looks good to me. Barring any objections, I'll apply this > with a few stylistic tweaks and the numeric -> int4 change tomorrow. I would rather see this as a contrib module. There has been *zero* consensus that we need this in core, nor any discussion about whether it might be a security hole. regards, tom lane
Tom Lane wrote: > Perhaps OID would be a good choice even though it's horribly wrong on > one level. Either that or add unsigned numeric types to PG :-) > I would rather see this as a contrib module. I don't really have an opinion either way. Does anyone else have thoughts on this? > There has been *zero* consensus that we need this in core, nor any > discussion about whether it might be a security hole. ISTM if it can only be invoked by the superuser, it should be safe enough. -Neil
Neil Conway wrote: > Mark Kirkwood wrote: >> + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber", >> + NUMERICOID, -1, 0); > > I think this should be an int4, not numeric. I was looking for an UINT4OID :-), but numeric seemed the best compromise (safer than overflowing int4). I guess I could add a type 'blocknumber' that is actually the same as 'oid', but it seems liks a lot of effort for one column. cheers Mark
Tom Lane wrote: > I would rather see this as a contrib module. There has been *zero* > consensus that we need this in core, nor any discussion about whether > it might be a security hole. Hmmm, I guess my motivation for thinking it might be useful in core was that it was similar to 'pg_locks' and 'pg_stats', i.e. exposing some internals in a convenient queryable manner (useful for problem solving). On the security front, I should have added a 'REVOKE ALL ...FROM PUBLIC' on the view to at least control access (fortunately easy to add). cheers Mark
Mark Kirkwood <markir@coretech.co.nz> writes: > Tom Lane wrote: >> I would rather see this as a contrib module. There has been *zero* >> consensus that we need this in core, nor any discussion about whether >> it might be a security hole. > Hmmm, I guess my motivation for thinking it might be useful in core was > that it was similar to 'pg_locks' and 'pg_stats', i.e. exposing some > internals in a convenient queryable manner (useful for problem solving). One reason for making it contrib is that I don't think you've got it entirely right yet, and there will be several iterations before it settles down. In a contrib module that is no problem, in core it's a forced initdb each time. regards, tom lane
Tom Lane wrote: > One reason for making it contrib is that I don't think you've got it > entirely right yet, and there will be several iterations before it > settles down. In a contrib module that is no problem, in core it's a > forced initdb each time. Yeah - certainly less intrusive as a contrib if amendments are required! Barring a huge groundswell of support for it in core :-) I will resubmit a patch for it as a contrib module. cheers Mark
Mark Kirkwood wrote: > Tom Lane wrote: > >> One reason for making it contrib is that I don't think you've got it >> entirely right yet, and there will be several iterations before it >> settles down. In a contrib module that is no problem, in core it's a >> forced initdb each time. > > > Yeah - certainly less intrusive as a contrib if amendments are required! > > Barring a huge groundswell of support for it in core :-) I will resubmit > a patch for it as a contrib module. > And here it is... Mark diff -Nacr pgsql.orig/contrib/Makefile pgsql/contrib/Makefile *** pgsql.orig/contrib/Makefile Thu Mar 3 11:29:53 2005 --- pgsql/contrib/Makefile Wed Mar 9 10:10:41 2005 *************** *** 26,31 **** --- 26,32 ---- noupdate \ oid2name \ pg_autovacuum \ + pg_buffercache \ pg_dumplo \ pg_trgm \ pgbench \ diff -Nacr pgsql.orig/contrib/pg_buffercache/Makefile pgsql/contrib/pg_buffercache/Makefile *** pgsql.orig/contrib/pg_buffercache/Makefile Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/Makefile Wed Mar 9 11:42:05 2005 *************** *** 0 **** --- 1,21 ---- + # $PostgreSQL$ + + MODULE_big = pg_buffercache + PG_CPPFLAGS = -I$(libpq_srcdir) + OBJS = pg_buffercache_pages.o + SHLIB_LINK = $(libpq) + + DATA_built = pg_buffercache.sql + DOCS = README.pg_buffercache + REGRESS = pg_buffercache + + + ifdef USE_PGXS + PGXS = $(shell pg_config --pgxs) + include $(PGXS) + else + subdir = contrib/pg_buffercache + top_builddir = ../.. + include $(top_builddir)/src/Makefile.global + include $(top_srcdir)/contrib/contrib-global.mk + endif diff -Nacr pgsql.orig/contrib/pg_buffercache/README.pg_buffercache pgsql/contrib/pg_buffercache/README.pg_buffercache *** pgsql.orig/contrib/pg_buffercache/README.pg_buffercache Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/README.pg_buffercache Wed Mar 9 12:26:56 2005 *************** *** 0 **** --- 1,112 ---- + Pg_buffercache - Real-time queries on the shared buffer cache. + -------------- + + This module consists of a C function 'pg_buffercache_pages()' that returns + a set of records, plus a view 'pg_buffercache' to wrapper the function. + + The intent is to do for the buffercache what pg_locks does for locks, i.e - + ability to examine what is happening at any given time without having to + restart or rebuild the server with debugging code added. + + By default public access is REVOKED from both of these, just in case there + are security issues lurking. + + + Installation + ------------ + + Build and install the main Postgresql source, then this contrib module: + + $ cd contrib/pg_buffercache + $ gmake + $ gmake install + + + To register the functions: + + $ psql -d <database> -f pg_buffercache.sql + + + Notes + ----- + + The definition of the columns exposed in the view is: + + Column | references | Description + ----------------+----------------------+------------------------------------ + bufferid | | Id, 1->shared_buffers. + relfilenode | pg_class.relfilenode | Refilenode of the relation. + reltablespace | pg_tablespace.oid | Tablespace oid of the relation. + reldatabase | pg_database.oid | Database for the relation. + relblocknumber | | Offset of the page in the relation. + isdirty | | Is the page dirty? + + + There is one row for each buffer in the shared cache. Unused buffers are + shown with all fields null except bufferid. + + Because the cache is shared by all the databases, there are pages from + relations not belonging to the current database. + + When the pg_buffercache view is accessed, internal buffer manager locks are + taken, and a copy of the buffer cache data is made for the view to display. + This ensures that the view produces a consistent set of results, while not + blocking normal buffer activity longer than necessary. Nonetheless there + could be some impact on database performance if this view is read often. + + + Sample output + ------------- + + regression=# \d pg_buffercache; + View "public.pg_buffercache" + Column | Type | Modifiers + ----------------+---------+----------- + bufferid | integer | + relfilenode | oid | + reltablespace | oid | + reldatabase | oid | + relblocknumber | numeric | + isdirty | boolean | + View definition: + SELECT p.bufferid, p.relfilenode, p.reltablespace, p.reldatabase, + p.relblocknumber, p.isdirty + FROM pg_buffercache_pages() p(bufferid integer, relfilenode oid, + reltablespace oid, reldatabase oid, relblocknumber numeric(10,0), + isdirty boolean); + + regression=# SELECT c.relname, count(*) AS buffers + FROM pg_class c, pg_buffercache b + WHERE b.relfilenode = c.relfilenode + GROUP BY c.relname + ORDER BY 2 DESC LIMIT 10; + relname | buffers + ---------------------------------+--------- + tenk2 | 345 + tenk1 | 141 + pg_proc | 46 + pg_class | 45 + pg_attribute | 43 + pg_class_relname_nsp_index | 30 + pg_proc_proname_args_nsp_index | 28 + pg_attribute_relid_attnam_index | 26 + pg_depend | 22 + pg_depend_reference_index | 20 + (10 rows) + + regression=# + + + Author + ------ + + * Mark Kirkwood <markir@coretech.co.nz *or* markir@paradise.net.nz> + + + Help + ---- + + * Design suggestions : Neil Conway <neilc@samurai.com> + * Debugging advice : Tom Lane <tgl@sss.pgh.pa.us> + + Thanks guys! diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache.sql.in pgsql/contrib/pg_buffercache/pg_buffercache.sql.in *** pgsql.orig/contrib/pg_buffercache/pg_buffercache.sql.in Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/pg_buffercache.sql.in Wed Mar 9 11:45:32 2005 *************** *** 0 **** --- 1,18 ---- + -- Adjust this setting to control where the objects get created. + SET search_path = public; + + -- Register the function. + CREATE OR REPLACE FUNCTION pg_buffercache_pages() + RETURNS SETOF RECORD + AS 'MODULE_PATHNAME', 'pg_buffercache_pages' + LANGUAGE 'C'; + + -- Create a view for convenient access. + CREATE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages() AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relblocknumber numeric(10), isdirty bool); + + -- Don't want these to be available at public. + REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC; + REVOKE ALL ON pg_buffercache FROM PUBLIC; diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.c pgsql/contrib/pg_buffercache/pg_buffercache_pages.c *** pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.c Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/pg_buffercache_pages.c Wed Mar 9 13:18:30 2005 *************** *** 0 **** --- 1,239 ---- + /*------------------------------------------------------------------------- + * + * pg_buffercache_pages.c + * display some contents of the buffer cache + * + * $PostgreSQL$ + *------------------------------------------------------------------------- + */ + #include "postgres.h" + #include "libpq-fe.h" + #include "funcapi.h" + #include "catalog/pg_type.h" + #include "storage/buf_internals.h" + #include "storage/bufmgr.h" + #include "utils/relcache.h" + #include "pg_buffercache_pages.h" + + + #define NUM_BUFFERCACHE_PAGES_ELEM 6 + + + /* + * Record structure holding the to be exposed cache data. + */ + typedef struct + { + + uint32 bufferid; + Oid relfilenode; + Oid reltablespace; + Oid reldatabase; + BlockNumber blocknum; + bool isvalid; + bool isdirty; + + } BufferCachePagesRec; + + + /* + * Function context for data persisting over repeated calls. + */ + typedef struct + { + + AttInMetadata *attinmeta; + BufferCachePagesRec *record; + char *values[NUM_BUFFERCACHE_PAGES_ELEM]; + + } BufferCachePagesContext; + + + /* + * Function returning data from the shared buffer cache - buffer number, + * relation node/tablespace/database/blocknum and dirty indicator. + */ + PG_FUNCTION_INFO_V1(pg_buffercache_pages); + Datum + pg_buffercache_pages(PG_FUNCTION_ARGS) + { + + FuncCallContext *funcctx; + Datum result; + MemoryContext oldcontext; + BufferCachePagesContext *fctx; /* User function context. */ + TupleDesc tupledesc; + HeapTuple tuple; + + if (SRF_IS_FIRSTCALL()) + { + RelFileNode rnode; + uint32 i; + BufferDesc *bufHdr; + + + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* Construct a tuple to return. */ + tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber", + NUMERICOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "isdirty", + BOOLOID, -1, 0); + + /* Generate attribute metadata needed later to produce tuples */ + funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc); + + /* + * Create a function context for cross-call persistence + * and initialize the buffer counters. + */ + fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext)); + funcctx->max_calls = NBuffers; + funcctx->user_fctx = fctx; + + + /* Allocate NBuffers worth of BufferCachePagesRec records. */ + fctx->record = (BufferCachePagesRec *) palloc(sizeof(BufferCachePagesRec) * NBuffers); + + /* allocate the strings for tuple formation */ + fctx->values[0] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[1] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[2] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[3] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[4] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[5] = (char *) palloc(2); + + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + + /* + * Lock Buffer map and scan though all the buffers, saving the + * relevant fields in the fctx->record structure. + */ + LWLockAcquire(BufMappingLock, LW_SHARED); + + for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) + { + /* Lock each buffer header before inspecting. */ + LockBufHdr(bufHdr); + + rnode = bufHdr->tag.rnode; + + fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); + fctx->record[i].relfilenode = rnode.relNode; + fctx->record[i].reltablespace = rnode.spcNode; + fctx->record[i].reldatabase = rnode.dbNode; + fctx->record[i].blocknum = bufHdr->tag.blockNum; + + if ( bufHdr->flags & BM_DIRTY) + { + fctx->record[i].isdirty = true; + } + else + { + fctx->record[i].isdirty = false; + } + + /* Note if the buffer is valid, and has storage created */ + if ( (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID)) + { + fctx->record[i].isvalid = true; + } + else + { + fctx->record[i].isvalid = false; + } + + UnlockBufHdr(bufHdr); + + } + + /* Release Buffer map. */ + LWLockRelease(BufMappingLock); + } + + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state */ + fctx = funcctx->user_fctx; + + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + char *values[NUM_BUFFERCACHE_PAGES_ELEM]; + int j; + + /* + * Use a temporary values array, initially pointing to + * fctx->values, so it can be reassigned w/o losing the storage + * for subsequent calls. + */ + for (j = 0; j < NUM_BUFFERCACHE_PAGES_ELEM; j++) + { + values[j] = fctx->values[j]; + } + + + /* + * Set all fields except the bufferid to null if the buffer is + * unused or not valid. + */ + if (fctx->record[i].blocknum == InvalidBlockNumber || + fctx->record[i].isvalid == false ) + { + + sprintf(values[0], "%u", fctx->record[i].bufferid); + values[1] = NULL; + values[2] = NULL; + values[3] = NULL; + values[4] = NULL; + values[5] = NULL; + + } + else + { + + sprintf(values[0], "%u", fctx->record[i].bufferid); + sprintf(values[1], "%u", fctx->record[i].relfilenode); + sprintf(values[2], "%u", fctx->record[i].reltablespace); + sprintf(values[3], "%u", fctx->record[i].reldatabase); + sprintf(values[4], "%u", fctx->record[i].blocknum); + if (fctx->record[i].isdirty) + { + strcpy(values[5], "t"); + } + else + { + strcpy(values[5], "f"); + } + + } + + + /* Build and return the tuple. */ + tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); + result = HeapTupleGetDatum(tuple); + + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); + + } + diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.h pgsql/contrib/pg_buffercache/pg_buffercache_pages.h *** pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.h Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/pg_buffercache_pages.h Wed Mar 9 11:42:20 2005 *************** *** 0 **** --- 1,18 ---- + /*------------------------------------------------------------------------- + * + * pg_buffercache_pages.h + * Prototypes for pg_buffercache_pages + * + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + + + #ifndef PG_BUFFERCACHE_PAGES_H + #define PG_BUFFERCACHE_PAGES_H + + extern Datum pg_buffercache_pages(PG_FUNCTION_ARGS); + + #endif /* PG_BUFFERCACHE_PAGES_H */
A couple of minor amendments here: - remove link to libpq (from cut+past of dblnk's Makefile) - add comment for pg_buffercache module in contrib/README - change my listed email to this one (I have resigned....) regards Mark diff -Nacr pgsql.orig/contrib/Makefile pgsql/contrib/Makefile *** pgsql.orig/contrib/Makefile Thu Mar 3 11:29:53 2005 --- pgsql/contrib/Makefile Wed Mar 9 10:10:41 2005 *************** *** 26,31 **** --- 26,32 ---- noupdate \ oid2name \ pg_autovacuum \ + pg_buffercache \ pg_dumplo \ pg_trgm \ pgbench \ diff -Nacr pgsql.orig/contrib/README pgsql/contrib/README *** pgsql.orig/contrib/README Thu Mar 3 11:29:53 2005 --- pgsql/contrib/README Fri Mar 11 12:11:18 2005 *************** *** 136,141 **** --- 136,145 ---- Automatically performs vacuum by Matthew T. O'Connor <matthew@zeut.net> + pg_buffercace - + Real time queries on the shared buffer cache. + by Mark Kirkwood <markir@paradise.net.nz> + pg_dumplo - Dump large objects by Karel Zak <zakkr@zf.jcu.cz> diff -Nacr pgsql.orig/contrib/pg_buffercache/Makefile pgsql/contrib/pg_buffercache/Makefile *** pgsql.orig/contrib/pg_buffercache/Makefile Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/Makefile Thu Mar 10 08:19:20 2005 *************** *** 0 **** --- 1,19 ---- + # $PostgreSQL$ + + MODULE_big = pg_buffercache + OBJS = pg_buffercache_pages.o + + DATA_built = pg_buffercache.sql + DOCS = README.pg_buffercache + REGRESS = pg_buffercache + + + ifdef USE_PGXS + PGXS = $(shell pg_config --pgxs) + include $(PGXS) + else + subdir = contrib/pg_buffercache + top_builddir = ../.. + include $(top_builddir)/src/Makefile.global + include $(top_srcdir)/contrib/contrib-global.mk + endif diff -Nacr pgsql.orig/contrib/pg_buffercache/README.pg_buffercache pgsql/contrib/pg_buffercache/README.pg_buffercache *** pgsql.orig/contrib/pg_buffercache/README.pg_buffercache Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/README.pg_buffercache Fri Mar 11 12:11:03 2005 *************** *** 0 **** --- 1,112 ---- + Pg_buffercache - Real time queries on the shared buffer cache. + -------------- + + This module consists of a C function 'pg_buffercache_pages()' that returns + a set of records, plus a view 'pg_buffercache' to wrapper the function. + + The intent is to do for the buffercache what pg_locks does for locks, i.e - + ability to examine what is happening at any given time without having to + restart or rebuild the server with debugging code added. + + By default public access is REVOKED from both of these, just in case there + are security issues lurking. + + + Installation + ------------ + + Build and install the main Postgresql source, then this contrib module: + + $ cd contrib/pg_buffercache + $ gmake + $ gmake install + + + To register the functions: + + $ psql -d <database> -f pg_buffercache.sql + + + Notes + ----- + + The definition of the columns exposed in the view is: + + Column | references | Description + ----------------+----------------------+------------------------------------ + bufferid | | Id, 1->shared_buffers. + relfilenode | pg_class.relfilenode | Refilenode of the relation. + reltablespace | pg_tablespace.oid | Tablespace oid of the relation. + reldatabase | pg_database.oid | Database for the relation. + relblocknumber | | Offset of the page in the relation. + isdirty | | Is the page dirty? + + + There is one row for each buffer in the shared cache. Unused buffers are + shown with all fields null except bufferid. + + Because the cache is shared by all the databases, there are pages from + relations not belonging to the current database. + + When the pg_buffercache view is accessed, internal buffer manager locks are + taken, and a copy of the buffer cache data is made for the view to display. + This ensures that the view produces a consistent set of results, while not + blocking normal buffer activity longer than necessary. Nonetheless there + could be some impact on database performance if this view is read often. + + + Sample output + ------------- + + regression=# \d pg_buffercache; + View "public.pg_buffercache" + Column | Type | Modifiers + ----------------+---------+----------- + bufferid | integer | + relfilenode | oid | + reltablespace | oid | + reldatabase | oid | + relblocknumber | numeric | + isdirty | boolean | + View definition: + SELECT p.bufferid, p.relfilenode, p.reltablespace, p.reldatabase, + p.relblocknumber, p.isdirty + FROM pg_buffercache_pages() p(bufferid integer, relfilenode oid, + reltablespace oid, reldatabase oid, relblocknumber numeric(10,0), + isdirty boolean); + + regression=# SELECT c.relname, count(*) AS buffers + FROM pg_class c, pg_buffercache b + WHERE b.relfilenode = c.relfilenode + GROUP BY c.relname + ORDER BY 2 DESC LIMIT 10; + relname | buffers + ---------------------------------+--------- + tenk2 | 345 + tenk1 | 141 + pg_proc | 46 + pg_class | 45 + pg_attribute | 43 + pg_class_relname_nsp_index | 30 + pg_proc_proname_args_nsp_index | 28 + pg_attribute_relid_attnam_index | 26 + pg_depend | 22 + pg_depend_reference_index | 20 + (10 rows) + + regression=# + + + Author + ------ + + * Mark Kirkwood <markir@paradise.net.nz> + + + Help + ---- + + * Design suggestions : Neil Conway <neilc@samurai.com> + * Debugging advice : Tom Lane <tgl@sss.pgh.pa.us> + + Thanks guys! diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache.sql.in pgsql/contrib/pg_buffercache/pg_buffercache.sql.in *** pgsql.orig/contrib/pg_buffercache/pg_buffercache.sql.in Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/pg_buffercache.sql.in Wed Mar 9 11:45:32 2005 *************** *** 0 **** --- 1,18 ---- + -- Adjust this setting to control where the objects get created. + SET search_path = public; + + -- Register the function. + CREATE OR REPLACE FUNCTION pg_buffercache_pages() + RETURNS SETOF RECORD + AS 'MODULE_PATHNAME', 'pg_buffercache_pages' + LANGUAGE 'C'; + + -- Create a view for convenient access. + CREATE VIEW pg_buffercache AS + SELECT P.* FROM pg_buffercache_pages() AS P + (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid, + relblocknumber numeric(10), isdirty bool); + + -- Don't want these to be available at public. + REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC; + REVOKE ALL ON pg_buffercache FROM PUBLIC; diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.c pgsql/contrib/pg_buffercache/pg_buffercache_pages.c *** pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.c Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/pg_buffercache_pages.c Thu Mar 10 08:19:13 2005 *************** *** 0 **** --- 1,238 ---- + /*------------------------------------------------------------------------- + * + * pg_buffercache_pages.c + * display some contents of the buffer cache + * + * $PostgreSQL$ + *------------------------------------------------------------------------- + */ + #include "postgres.h" + #include "funcapi.h" + #include "catalog/pg_type.h" + #include "storage/buf_internals.h" + #include "storage/bufmgr.h" + #include "utils/relcache.h" + #include "pg_buffercache_pages.h" + + + #define NUM_BUFFERCACHE_PAGES_ELEM 6 + + + /* + * Record structure holding the to be exposed cache data. + */ + typedef struct + { + + uint32 bufferid; + Oid relfilenode; + Oid reltablespace; + Oid reldatabase; + BlockNumber blocknum; + bool isvalid; + bool isdirty; + + } BufferCachePagesRec; + + + /* + * Function context for data persisting over repeated calls. + */ + typedef struct + { + + AttInMetadata *attinmeta; + BufferCachePagesRec *record; + char *values[NUM_BUFFERCACHE_PAGES_ELEM]; + + } BufferCachePagesContext; + + + /* + * Function returning data from the shared buffer cache - buffer number, + * relation node/tablespace/database/blocknum and dirty indicator. + */ + PG_FUNCTION_INFO_V1(pg_buffercache_pages); + Datum + pg_buffercache_pages(PG_FUNCTION_ARGS) + { + + FuncCallContext *funcctx; + Datum result; + MemoryContext oldcontext; + BufferCachePagesContext *fctx; /* User function context. */ + TupleDesc tupledesc; + HeapTuple tuple; + + if (SRF_IS_FIRSTCALL()) + { + RelFileNode rnode; + uint32 i; + BufferDesc *bufHdr; + + + funcctx = SRF_FIRSTCALL_INIT(); + + /* Switch context when allocating stuff to be used in later calls */ + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* Construct a tuple to return. */ + tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false); + TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid", + INT4OID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase", + OIDOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber", + NUMERICOID, -1, 0); + TupleDescInitEntry(tupledesc, (AttrNumber) 6, "isdirty", + BOOLOID, -1, 0); + + /* Generate attribute metadata needed later to produce tuples */ + funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc); + + /* + * Create a function context for cross-call persistence + * and initialize the buffer counters. + */ + fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext)); + funcctx->max_calls = NBuffers; + funcctx->user_fctx = fctx; + + + /* Allocate NBuffers worth of BufferCachePagesRec records. */ + fctx->record = (BufferCachePagesRec *) palloc(sizeof(BufferCachePagesRec) * NBuffers); + + /* allocate the strings for tuple formation */ + fctx->values[0] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[1] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[2] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[3] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[4] = (char *) palloc(3 * sizeof(uint32) + 1); + fctx->values[5] = (char *) palloc(2); + + + /* Return to original context when allocating transient memory */ + MemoryContextSwitchTo(oldcontext); + + + /* + * Lock Buffer map and scan though all the buffers, saving the + * relevant fields in the fctx->record structure. + */ + LWLockAcquire(BufMappingLock, LW_SHARED); + + for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++) + { + /* Lock each buffer header before inspecting. */ + LockBufHdr(bufHdr); + + rnode = bufHdr->tag.rnode; + + fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr); + fctx->record[i].relfilenode = rnode.relNode; + fctx->record[i].reltablespace = rnode.spcNode; + fctx->record[i].reldatabase = rnode.dbNode; + fctx->record[i].blocknum = bufHdr->tag.blockNum; + + if ( bufHdr->flags & BM_DIRTY) + { + fctx->record[i].isdirty = true; + } + else + { + fctx->record[i].isdirty = false; + } + + /* Note if the buffer is valid, and has storage created */ + if ( (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID)) + { + fctx->record[i].isvalid = true; + } + else + { + fctx->record[i].isvalid = false; + } + + UnlockBufHdr(bufHdr); + + } + + /* Release Buffer map. */ + LWLockRelease(BufMappingLock); + } + + funcctx = SRF_PERCALL_SETUP(); + + /* Get the saved state */ + fctx = funcctx->user_fctx; + + + if (funcctx->call_cntr < funcctx->max_calls) + { + uint32 i = funcctx->call_cntr; + char *values[NUM_BUFFERCACHE_PAGES_ELEM]; + int j; + + /* + * Use a temporary values array, initially pointing to + * fctx->values, so it can be reassigned w/o losing the storage + * for subsequent calls. + */ + for (j = 0; j < NUM_BUFFERCACHE_PAGES_ELEM; j++) + { + values[j] = fctx->values[j]; + } + + + /* + * Set all fields except the bufferid to null if the buffer is + * unused or not valid. + */ + if (fctx->record[i].blocknum == InvalidBlockNumber || + fctx->record[i].isvalid == false ) + { + + sprintf(values[0], "%u", fctx->record[i].bufferid); + values[1] = NULL; + values[2] = NULL; + values[3] = NULL; + values[4] = NULL; + values[5] = NULL; + + } + else + { + + sprintf(values[0], "%u", fctx->record[i].bufferid); + sprintf(values[1], "%u", fctx->record[i].relfilenode); + sprintf(values[2], "%u", fctx->record[i].reltablespace); + sprintf(values[3], "%u", fctx->record[i].reldatabase); + sprintf(values[4], "%u", fctx->record[i].blocknum); + if (fctx->record[i].isdirty) + { + strcpy(values[5], "t"); + } + else + { + strcpy(values[5], "f"); + } + + } + + + /* Build and return the tuple. */ + tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); + result = HeapTupleGetDatum(tuple); + + + SRF_RETURN_NEXT(funcctx, result); + } + else + SRF_RETURN_DONE(funcctx); + + } + diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.h pgsql/contrib/pg_buffercache/pg_buffercache_pages.h *** pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.h Thu Jan 1 12:00:00 1970 --- pgsql/contrib/pg_buffercache/pg_buffercache_pages.h Wed Mar 9 11:42:20 2005 *************** *** 0 **** --- 1,18 ---- + /*------------------------------------------------------------------------- + * + * pg_buffercache_pages.h + * Prototypes for pg_buffercache_pages + * + * + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + + + #ifndef PG_BUFFERCACHE_PAGES_H + #define PG_BUFFERCACHE_PAGES_H + + extern Datum pg_buffercache_pages(PG_FUNCTION_ARGS); + + #endif /* PG_BUFFERCACHE_PAGES_H */
Mark Kirkwood wrote: > A couple of minor amendments here [...] Barring any objections, I'll apply this tomorrow. -Neil
Mark Kirkwood wrote: > A couple of minor amendments here: > > - remove link to libpq (from cut+past of dblnk's Makefile) > - add comment for pg_buffercache module in contrib/README > - change my listed email to this one (I have resigned....) Applied, thanks for the patch. BTW, I removed the "REGRESS=..." line from the Makefile, as that caused `make installcheck' of contrib/ to fail on my machine. -Neil