Thread: Display Pg buffer cache (WIP)

Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
I found it useful to be able to look at what relations are occupying the
cache (mainly for debugging and comparing with page estimates).

I am wondering if it might be a useful addition generally?

How it works a SRF called dump_cache() is created, which is then exposed
as system view pg_dump_cache. So stuff like the following can be
performed (cache of 2000 immediately after 'make installcheck'):

regression=# SELECT c.relname, count(*) AS buffers
regression-# FROM pg_class c, pg_dump_cache d
regression-# WHERE d.relfilenode = c.relfilenode
regression-# GROUP BY c.relname
regression-# ORDER BY 2 DESC LIMIT 10;

                relname             | buffers
---------------------------------+---------
    tenk1                           |     345
    tenk2                           |     345
    onek                            |     138
    pg_attribute                    |     102
    road                            |      81
    pg_attribute_relid_attnam_index |      74
    onek2                           |      69
    pg_proc                         |      59
    pg_proc_proname_args_nsp_index  |      56
    hash_f8_heap                    |      49
(10 rows)

regression=#

As of now the patch breaks 1 regression test (rules - that new view...)
and does not peek into the contents of the buffers themselves (I was
mainly interested in which relations were there), and does not worry too
much about a consistent view of the buffer contents (as I didn't want it
to block all other activity!).


If it seems like a worthwhile addition I will amend the regression
expected and resubmit.

Any comments?

Mark

diff -Naur pgsql.orig/src/backend/catalog/system_views.sql pgsql/src/backend/catalog/system_views.sql
--- pgsql.orig/src/backend/catalog/system_views.sql    Thu Mar  3 11:29:55 2005
+++ pgsql/src/backend/catalog/system_views.sql    Thu Mar  3 11:41:24 2005
@@ -277,3 +277,8 @@
     DO INSTEAD NOTHING;

 GRANT SELECT, UPDATE ON pg_settings TO PUBLIC;
+
+CREATE VIEW pg_dump_cache AS
+    SELECT D.* FROM pg_dump_cache() AS D
+    (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid);
+
diff -Naur pgsql.orig/src/backend/utils/adt/Makefile pgsql/src/backend/utils/adt/Makefile
--- pgsql.orig/src/backend/utils/adt/Makefile    Thu Mar  3 11:29:53 2005
+++ pgsql/src/backend/utils/adt/Makefile    Thu Mar  3 11:32:10 2005
@@ -24,7 +24,7 @@
     tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \
     network.o mac.o inet_net_ntop.o inet_net_pton.o \
     ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \
-    ascii.o quote.o pgstatfuncs.o encode.o
+    ascii.o quote.o pgstatfuncs.o encode.o dumpcache.o

 like.o: like.c like_match.c

diff -Naur pgsql.orig/src/backend/utils/adt/dumpcache.c pgsql/src/backend/utils/adt/dumpcache.c
--- pgsql.orig/src/backend/utils/adt/dumpcache.c    Thu Jan  1 12:00:00 1970
+++ pgsql/src/backend/utils/adt/dumpcache.c    Thu Mar  3 11:51:53 2005
@@ -0,0 +1,119 @@
+/*-------------------------------------------------------------------------
+ *
+ * dumpcache.c
+ *    display some contents for the buffer cache
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "funcapi.h"
+#include "catalog/pg_type.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "utils/relcache.h"
+
+
+extern Datum dump_cache(PG_FUNCTION_ARGS);
+
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct {
+    int                buffer;
+    AttInMetadata    *attinmeta;
+    BufferDesc        *bufhdr;
+    RelFileNode        rnode;
+    char            *values[3];
+} dumpcache_fctx;
+
+
+/*
+ * Return a tuple that has bufferid, relfilenoide, reltablespace and
+ * reldatabase OIDs.
+ */
+Datum
+dump_cache(PG_FUNCTION_ARGS)
+{
+    FuncCallContext        *funcctx;
+    Datum                result;
+    MemoryContext        oldcontext;
+    dumpcache_fctx        *fctx;        /* User function context. */
+    TupleDesc            tupledesc;
+    HeapTuple            tuple;
+
+    if (SRF_IS_FIRSTCALL())
+    {
+        funcctx = SRF_FIRSTCALL_INIT();
+
+        /* Switch context when allocating stuff to be used in later calls */
+        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+        /* construct a tuple to return */
+        tupledesc = CreateTemplateTupleDesc(4, false);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+                                    INT4OID, -1, 0);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
+                                    OIDOID, -1, 0);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
+                                    OIDOID, -1, 0);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
+                                    OIDOID, -1, 0);
+
+        /* Generate attribute metadata needed later to produce tuples */
+        funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc);
+
+        /*
+         * Create a function context for cross-call persistence
+         * and initialize the buffer counters.
+         */
+        fctx = (dumpcache_fctx *) palloc(sizeof(dumpcache_fctx));
+
+        fctx->buffer = 0;
+        fctx->bufhdr = BufferDescriptors;
+
+        funcctx->user_fctx = fctx;
+        funcctx->max_calls = NBuffers;
+
+        /* allocate the strings for tuple formation */
+        fctx->values[0] = (char *) palloc(NAMEDATALEN + 1);
+        fctx->values[1] = (char *) palloc(NAMEDATALEN + 1);
+        fctx->values[2] = (char *) palloc(NAMEDATALEN + 1);
+        fctx->values[3] = (char *) palloc(NAMEDATALEN + 1);
+
+
+        /* Return to original context when allocating transient memory */
+        MemoryContextSwitchTo(oldcontext);
+    }
+    /* <user defined code> */
+    funcctx = SRF_PERCALL_SETUP();
+
+    /* Get the saved state, and set up the result */
+    fctx = funcctx->user_fctx;
+
+
+    if (funcctx->call_cntr < funcctx->max_calls)
+    {
+        /* Get the relation node */
+        fctx->rnode = (fctx->bufhdr)->tag.rnode;
+
+        /* setup cstrings and create a tuple from them */
+        sprintf(fctx->values[0], "%u", fctx->buffer);
+        sprintf(fctx->values[1], "%u", fctx->rnode.relNode);
+        sprintf(fctx->values[2], "%u", fctx->rnode.spcNode);
+        sprintf(fctx->values[3], "%u", fctx->rnode.dbNode);
+
+        tuple = BuildTupleFromCStrings(funcctx->attinmeta, fctx->values);
+        result = HeapTupleGetDatum(tuple);
+
+
+        /* increment the buffer count and the buffer it points to */
+        fctx->buffer++;
+        fctx->bufhdr++;
+
+        SRF_RETURN_NEXT(funcctx, result);
+    }
+    else
+        SRF_RETURN_DONE(funcctx);
+}
+
diff -Naur pgsql.orig/src/include/catalog/pg_proc.h pgsql/src/include/catalog/pg_proc.h
--- pgsql.orig/src/include/catalog/pg_proc.h    Thu Mar  3 11:30:02 2005
+++ pgsql/src/include/catalog/pg_proc.h    Thu Mar  3 11:39:27 2005
@@ -3615,6 +3615,8 @@
 DATA(insert OID = 2558 ( int4                   PGNSP PGUID 12 f f t f i 1  23 "16" _null_    bool_int4 - _null_ ));
 DESCR("convert boolean to int4");

+/* dump cache */
+DATA(insert OID = 2510 (  pg_dump_cache PGNSP PGUID 12 f f t t v 0 2249 "" _null_ dump_cache - _null_ ));

 /*
  * Symbolic values for provolatile column: these indicate whether the result



Re: Display Pg buffer cache (WIP)

From
Neil Conway
Date:
Mark Kirkwood wrote:
> does not worry too much about a consistent view of the buffer
> contents (as I didn't want it to block all other activity!).

I don't like accessing shared data structures without acquiring the
appropriate locks; even if it doesn't break anything, it seems like just
asking for trouble. In order to be able to inspect a buffer's tag in
Tom's new locking scheme (not yet in HEAD, but will be in 8.1), you need
only hold a per-buffer lock. That will mean acquiring and releasing a
spinlock for each buffer, which isn't _too_ bad...

That means the data reported by the function might still be
inconsistent; not sure how big a problem that is.

It might be nice for the patch to indicate whether the buffers are
dirty, and what their shared reference count is.

> +extern Datum dump_cache(PG_FUNCTION_ARGS);

This declaration belongs in a header file (such as
include/utils/builtins.h).

> +typedef struct {
> +    int                buffer;
> +    AttInMetadata    *attinmeta;
> +    BufferDesc        *bufhdr;
> +    RelFileNode        rnode;
> +    char            *values[3];
> +} dumpcache_fctx;

This should be values[4], no?

This is trivial, but I think most type names use camel case (NamesLikeThis).

Why does `rnode' need to be in the struct? You can also fetch the buffer
number from the buffer desc, so you needn't store another copy of it.

> +        /* allocate the strings for tuple formation */
> +        fctx->values[0] = (char *) palloc(NAMEDATALEN + 1);
> +        fctx->values[1] = (char *) palloc(NAMEDATALEN + 1);
> +        fctx->values[2] = (char *) palloc(NAMEDATALEN + 1);
> +        fctx->values[3] = (char *) palloc(NAMEDATALEN + 1);

Is there a reason for choosing NAMEDATALEN + 1 as the size of these
buffers? (There is no relation between NAMEDATALEN and the number of
characters an OID will consume when converted via sprintf("%u") )

The patch doesn't treat unassigned buffers specially (i.e. those buffers
whose tag contains of InvalidOids). Perhaps it should return NULL for
their OID fields, rather than InvalidOid (which will be 0)? (An
alternative would be to not include those rows in the result set,
although perhaps administrators might want this information.)

-Neil

Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
Neil Conway wrote:
> I don't like accessing shared data structures without acquiring the
> appropriate locks; even if it doesn't break anything, it seems like just
> asking for trouble. In order to be able to inspect a buffer's tag in
> Tom's new locking scheme (not yet in HEAD, but will be in 8.1), you need
> only hold a per-buffer lock. That will mean acquiring and releasing a
> spinlock for each buffer, which isn't _too_ bad...
>
> That means the data reported by the function might still be
> inconsistent; not sure how big a problem that is.
>
> It might be nice for the patch to indicate whether the buffers are
> dirty, and what their shared reference count is.
>

Yeah - sounds good, will do.

>> +extern Datum dump_cache(PG_FUNCTION_ARGS);
>
>

Yep.

> This declaration belongs in a header file (such as
> include/utils/builtins.h).
>
>> +typedef struct {
>> +    int                buffer;
>> +    AttInMetadata    *attinmeta;
>> +    BufferDesc        *bufhdr;
>> +    RelFileNode        rnode;
>> +    char            *values[3];
>> +} dumpcache_fctx;
>
>
> This should be values[4], no?
>

Arrg... surprised there wasn't crashes everywhere....

> This is trivial, but I think most type names use camel case
> (NamesLikeThis).
>
> Why does `rnode' need to be in the struct? You can also fetch the buffer
> number from the buffer desc, so you needn't store another copy of it.
>
I thought it made readability a bit better, but yeah, could take it out.

>> +        /* allocate the strings for tuple formation */
>> +        fctx->values[0] = (char *) palloc(NAMEDATALEN + 1);
>> +        fctx->values[1] = (char *) palloc(NAMEDATALEN + 1);
>> +        fctx->values[2] = (char *) palloc(NAMEDATALEN + 1);
>> +        fctx->values[3] = (char *) palloc(NAMEDATALEN + 1);
>
>
> Is there a reason for choosing NAMEDATALEN + 1 as the size of these
> buffers? (There is no relation between NAMEDATALEN and the number of
> characters an OID will consume when converted via sprintf("%u") )

Hmmm... good spot, originally I was returning TEXTOID and relation names
etc...and then it was "big enough" after converting to oid during
testing, so err, yes - I will change that (as well) !
>
> The patch doesn't treat unassigned buffers specially (i.e. those buffers
> whose tag contains of InvalidOids). Perhaps it should return NULL for
> their OID fields, rather than InvalidOid (which will be 0)? (An
> alternative would be to not include those rows in the result set,
> although perhaps administrators might want this information.)
>
I thought it might be handy to explicitly see unused (or belonging to
another db) buffers. Clearly joining to pg_class will project 'em out!

Finally, thanks for the excellent feedback (fast too)

regards

Mark


Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
Neil Conway wrote:
>
> I don't like accessing shared data structures without acquiring the
> appropriate locks; even if it doesn't break anything, it seems like just
> asking for trouble. In order to be able to inspect a buffer's tag in
> Tom's new locking scheme (not yet in HEAD, but will be in 8.1), you need
> only hold a per-buffer lock. That will mean acquiring and releasing a
> spinlock for each buffer, which isn't _too_ bad...
>
>

I am looking at this bit now.

I take the first part to mean that don't need to wrap
LWLockAcquire(BufMgrLock, LW_EXCLUSIVE) ... LWLockRelease(BufMgrLock)
around the inspection of the buffers (?)

This per buffer lock, are we talking about LockBuffer(buf,
BUFFER_LOCK_SHARE) ... releaseBuffer(buf) ?

If so, it looks like I need to do some stuff with ResourceOwners,
otherwise ReleaseBuffer will fail (or am I wandering up the wrong track
here?). I am using anoncvs from yesterday, so if Tom's new scheme is
*very* new I may be missing it.

Thanks

Mark


Re: Display Pg buffer cache (WIP)

From
Neil Conway
Date:
Mark Kirkwood wrote:
> If so, it looks like I need to do some stuff with ResourceOwners,
> otherwise ReleaseBuffer will fail (or am I wandering up the wrong track
> here?). I am using anoncvs from yesterday, so if Tom's new scheme is
> *very* new I may be missing it.

It's so new, in fact, it's not in CVS yet :) I believe the latest
revision of the patch is here:

http://archives.postgresql.org/pgsql-patches/2005-02/msg00115.php

The locking scheme is described here:

http://archives.postgresql.org/pgsql-hackers/2005-02/msg00391.php

Holding the per-buffer header spinlock should be enough to ensure the
tag doesn't change. To get a globally consistent snapshot of the state
of the bufmgr, I believe it should be sufficient to also share-lock the
BufMappingLock for the duration of the scan. The latter will prevent new
pages from being loaded in the buffer pool, so I'm not sure if it's
worth doing. If you do decide to hold the BufMappingLock, it might make
sense to:

1. allocate an array of NBuffers elements
2. acquire BufferMappingLock in share mode
3. sequentially scan through the buffer pool, copying data into the array
4. release the lock
5. on each subsequent call to the SRF, format and return an element of
the array

Which should reduce the time to lock is held. This will require
allocating NBuffers * size_of_stats memory (where size_of_stats will be
something like 16 bytes).

-Neil

Re: Display Pg buffer cache (WIP)

From
Tom Lane
Date:
Mark Kirkwood <markir@coretech.co.nz> writes:
> I am using anoncvs from yesterday, so if Tom's new scheme is
> *very* new I may be missing it.

It's not committed yet ;-)

            regards, tom lane

Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
Tom Lane wrote:
> Mark Kirkwood <markir@coretech.co.nz> writes:
>
>>I am using anoncvs from yesterday, so if Tom's new scheme is
>>*very* new I may be missing it.
>
>
> It's not committed yet ;-)
>

Yep - that's pretty new, apologies for slow grey matter... I have been
following the discussion about the new buffer design, but didn't think
to connect it to this context!

cheers

Mark


Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
Neil Conway wrote:
> If you do decide to hold the BufMappingLock, it might make
> sense to:
>
> 1. allocate an array of NBuffers elements
> 2. acquire BufferMappingLock in share mode
> 3. sequentially scan through the buffer pool, copying data into the array
> 4. release the lock
> 5. on each subsequent call to the SRF, format and return an element of
> the array
>
> Which should reduce the time to lock is held. This will require
> allocating NBuffers * size_of_stats memory (where size_of_stats will be
> something like 16 bytes).
>

That is a better approach, so I've used it in this new iteration.

In addition to holding the BufMappingLock, each buffer header is (spin)
locked before examining it, hopefully this is correct - BTW, I like the
new buffer lock design.

I'm still using BuildTupleFromCStrings, so there is considerable use of
sprintf conversion and "temporary" char * stuff. I would like this to be
a bit cleaner, so any suggestions welcome.

regards

Mark


diff -Naur pgsql.orig/src/backend/catalog/system_views.sql pgsql/src/backend/catalog/system_views.sql
--- pgsql.orig/src/backend/catalog/system_views.sql    Fri Mar  4 14:23:09 2005
+++ pgsql/src/backend/catalog/system_views.sql    Fri Mar  4 14:21:33 2005
@@ -277,3 +277,9 @@
     DO INSTEAD NOTHING;

 GRANT SELECT, UPDATE ON pg_settings TO PUBLIC;
+
+CREATE VIEW pg_cache_dump AS
+    SELECT D.* FROM pg_cache_dump() AS D
+    (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+     isdirty bool, refcount int4);
+
diff -Naur pgsql.orig/src/backend/utils/adt/cachedump.c pgsql/src/backend/utils/adt/cachedump.c
--- pgsql.orig/src/backend/utils/adt/cachedump.c    Thu Jan  1 12:00:00 1970
+++ pgsql/src/backend/utils/adt/cachedump.c    Sat Mar  5 20:21:45 2005
@@ -0,0 +1,221 @@
+/*-------------------------------------------------------------------------
+ *
+ * cachedump.c
+ *    display some contents of the buffer cache
+ *
+ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *      $PostgreSQL$
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "funcapi.h"
+#include "catalog/pg_type.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "utils/relcache.h"
+#include "utils/builtins.h"
+
+
+#define NUM_CACHE_DUMP_ELEM    6
+
+/*
+ * Record structure holding the to be exposed cache data.
+ */
+typedef struct
+{
+    uint32        bufferid;
+    Oid            relfilenode;
+    Oid            reltablespace;
+    Oid            reldatabase;
+    bool        isdirty;
+    uint32        refcount;
+    BlockNumber    blocknum;
+
+} CacheDumpRec;
+
+
+/*
+ * Function context for data persisting over repeated calls.
+ */
+typedef struct
+{
+    AttInMetadata    *attinmeta;
+    CacheDumpRec    *record;
+    char            *values[NUM_CACHE_DUMP_ELEM];
+} CacheDumpContext;
+
+
+/*
+ * Function returning data from the shared buffer cache - buffer number,
+ * relation node/tablespace/database, dirty indicator and refcount.
+ */
+Datum
+cache_dump(PG_FUNCTION_ARGS)
+{
+    FuncCallContext        *funcctx;
+    Datum                result;
+    MemoryContext        oldcontext;
+    CacheDumpContext    *fctx;        /* User function context. */
+    TupleDesc            tupledesc;
+    HeapTuple            tuple;
+
+    if (SRF_IS_FIRSTCALL())
+    {
+        RelFileNode    rnode;
+        uint32        i;
+        BufferDesc    *bufHdr;
+
+
+        funcctx = SRF_FIRSTCALL_INIT();
+
+        /* Switch context when allocating stuff to be used in later calls */
+        oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+        /* construct a tuple to return */
+        tupledesc = CreateTemplateTupleDesc(NUM_CACHE_DUMP_ELEM, false);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+                                    INT4OID, -1, 0);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
+                                    OIDOID, -1, 0);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
+                                    OIDOID, -1, 0);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
+                                    OIDOID, -1, 0);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 5, "isdirty",
+                                    BOOLOID, -1, 0);
+        TupleDescInitEntry(tupledesc, (AttrNumber) 6, "refcount",
+                                    INT4OID, -1, 0);
+
+        /* Generate attribute metadata needed later to produce tuples */
+        funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc);
+
+        /*
+         * Create a function context for cross-call persistence
+         * and initialize the buffer counters.
+         */
+        fctx = (CacheDumpContext *) palloc(sizeof(CacheDumpContext));
+        funcctx->max_calls = NBuffers;
+        funcctx->user_fctx = fctx;
+
+
+        /* Allocate NBuffers worth of CacheDumpRec records. */
+        fctx->record = (CacheDumpRec *) palloc(sizeof(CacheDumpRec) * NBuffers);
+
+        /* allocate the strings for tuple formation */
+        fctx->values[0] = (char *) palloc(3 * sizeof(uint32) + 1);
+        fctx->values[1] = (char *) palloc(3 * sizeof(uint32) + 1);
+        fctx->values[2] = (char *) palloc(3 * sizeof(uint32) + 1);
+        fctx->values[3] = (char *) palloc(3 * sizeof(uint32) + 1);
+        fctx->values[4] = (char *) palloc(10);
+        fctx->values[5] = (char *) palloc(3 * sizeof(uint32) + 1);
+
+
+        /* Return to original context when allocating transient memory */
+        MemoryContextSwitchTo(oldcontext);
+
+
+        /*
+         * Lock Buffer map and scan though all the buffers, saving the
+         * relevant fields in the fctx->record structure.
+         */
+        LWLockAcquire(BufMappingLock, LW_SHARED);
+
+        for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
+        {
+            /* Lock each buffer header before inspecting. */
+            LockBufHdr(bufHdr);
+
+            rnode = bufHdr->tag.rnode;
+
+            fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
+            fctx->record[i].relfilenode = rnode.relNode;
+            fctx->record[i].reltablespace = rnode.spcNode;
+            fctx->record[i].reldatabase = rnode.dbNode;
+            fctx->record[i].refcount = bufHdr->refcount;
+            fctx->record[i].blocknum = bufHdr->tag.blockNum;
+            if ( bufHdr->flags & BM_DIRTY)
+            {
+                fctx->record[i].isdirty = true;
+            }
+            else
+            {
+                fctx->record[i].isdirty = false;
+            }
+
+            UnlockBufHdr(bufHdr);
+
+        }
+
+        /* Release Buffer map. */
+        LWLockRelease(BufMappingLock);
+    }
+
+    funcctx = SRF_PERCALL_SETUP();
+
+    /* Get the saved state */
+    fctx = funcctx->user_fctx;
+
+
+    if (funcctx->call_cntr < funcctx->max_calls)
+    {
+        uint32         i = funcctx->call_cntr;
+        char        *values[NUM_CACHE_DUMP_ELEM];
+        int            j;
+
+        /*
+         * Use a temporary values array, initially pointing to
+         * fctx->values, so it can be reassigned w/o losing the storage
+         * for subsequent calls.
+         */
+        for (j = 0; j < NUM_CACHE_DUMP_ELEM; j++)
+        {
+            values[j] = fctx->values[j];
+        }
+
+
+        if (fctx->record[i].blocknum == InvalidBlockNumber)
+        {
+
+            sprintf(values[0], "%u", fctx->record[i].bufferid);
+            values[1] = NULL;
+            values[2] = NULL;
+            values[3] = NULL;
+            values[4] = NULL;
+            values[5] = NULL;
+
+        }
+        else
+        {
+
+            sprintf(values[0], "%u", fctx->record[i].bufferid);
+            sprintf(values[1], "%u", fctx->record[i].relfilenode);
+            sprintf(values[2], "%u", fctx->record[i].reltablespace);
+            sprintf(values[3], "%u", fctx->record[i].reldatabase);
+            if (fctx->record[i].isdirty)
+            {
+                strcpy(values[4], "true");
+            }
+            else
+            {
+                strcpy(values[4], "false");
+            }
+            sprintf(values[5], "%u", fctx->record[i].refcount);
+
+        }
+
+
+        /* Build and return the tuple. */
+        tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+        result = HeapTupleGetDatum(tuple);
+
+
+        SRF_RETURN_NEXT(funcctx, result);
+    }
+    else
+        SRF_RETURN_DONE(funcctx);
+}
+
diff -Naur pgsql.orig/src/include/catalog/pg_proc.h pgsql/src/include/catalog/pg_proc.h
--- pgsql.orig/src/include/catalog/pg_proc.h    Fri Mar  4 14:24:20 2005
+++ pgsql/src/include/catalog/pg_proc.h    Fri Mar  4 14:21:56 2005
@@ -3615,6 +3615,8 @@
 DATA(insert OID = 2558 ( int4                   PGNSP PGUID 12 f f t f i 1  23 "16" _null_    bool_int4 - _null_ ));
 DESCR("convert boolean to int4");

+/* cache dump */
+DATA(insert OID = 2510 (  pg_cache_dump PGNSP PGUID 12 f f t t v 0 2249 "" _null_ cache_dump - _null_ ));

 /*
  * Symbolic values for provolatile column: these indicate whether the result
diff -Naur pgsql.orig/src/include/utils/builtins.h pgsql/src/include/utils/builtins.h
--- pgsql.orig/src/include/utils/builtins.h    Fri Mar  4 14:24:31 2005
+++ pgsql/src/include/utils/builtins.h    Fri Mar  4 14:22:08 2005
@@ -823,4 +823,7 @@
 /* catalog/pg_conversion.c */
 extern Datum pg_convert_using(PG_FUNCTION_ARGS);

+/* cache dump */
+extern Datum cache_dump(PG_FUNCTION_ARGS);
+
 #endif   /* BUILTINS_H */



Re: Display Pg buffer cache (WIP)

From
Tom Lane
Date:
Mark Kirkwood <markir@coretech.co.nz> writes:
> In addition to holding the BufMappingLock, each buffer header is (spin)
> locked before examining it, hopefully this is correct - BTW, I like the
> new buffer lock design.

It'd be possible to dispense with the per-buffer spinlocks so long as
you look only at the tag (and perhaps the TAG_VALID flag bit).  The
tags can't be changing while you hold the BufMappingLock.  I'm dubious
that there's any point in recording information as transient as the
refcounts and dirtybits, and reducing the amount of time you hold the
lock would be a Good Thing.

Also, you're still not dealing with the case of a not-valid buffer in a
reasonable way.

            regards, tom lane

Re: Display Pg buffer cache (WIP)

From
Neil Conway
Date:
Tom Lane wrote:
> It'd be possible to dispense with the per-buffer spinlocks so long as
> you look only at the tag (and perhaps the TAG_VALID flag bit).  The
> tags can't be changing while you hold the BufMappingLock.

That's what I had thought at first, but this comment in buf_internals.h
dissuaded me: "buf_hdr_lock must be held to examine or change the tag,
flags, usage_count, refcount, or wait_backend_id fields." The comment
already notes this isn't true if you've got the buffer pinned; it would
be worth adding another exception for holding the BufMappingLock, IMHO.

> I'm dubious that there's any point in recording information as
> transient as the refcounts and dirtybits

I think it's worth recording dirty bits -- it provides an indication of
the effectiveness of the bgwriter, for example. Reference counts could
be done away with, although I doubt it would have a significant effect
on the time spent holding the lock.

-Neil

Re: Display Pg buffer cache (WIP)

From
Neil Conway
Date:
Only two things to add: you forgot to add `cachedump.o' to the list of
OBJS in the utils/adt Makefile.

Mark Kirkwood wrote:
> +typedef struct
> +{
> +    uint32        bufferid;
> +    Oid            relfilenode;
> +    Oid            reltablespace;
> +    Oid            reldatabase;
> +    bool        isdirty;
> +    uint32        refcount;
> +    BlockNumber    blocknum;
> +
> +} CacheDumpRec;

You should probably make `isdirty' the last member of this struct, so as
to reduce alignment/padding requirements (this won't actually save any
space right now, but it might save some space if more members are added
to the struct in the future).

-Neil

Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
Neil Conway wrote:
> Tom Lane wrote:
>
>> It'd be possible to dispense with the per-buffer spinlocks so long as
>> you look only at the tag (and perhaps the TAG_VALID flag bit).  The
>> tags can't be changing while you hold the BufMappingLock.
>
>
> That's what I had thought at first, but this comment in buf_internals.h
> dissuaded me: "buf_hdr_lock must be held to examine or change the tag,
> flags, usage_count, refcount, or wait_backend_id fields." The comment
> already notes this isn't true if you've got the buffer pinned; it would
> be worth adding another exception for holding the BufMappingLock, IMHO.
>
>> I'm dubious that there's any point in recording information as
>> transient as the refcounts and dirtybits
>
>
> I think it's worth recording dirty bits -- it provides an indication of
> the effectiveness of the bgwriter, for example. Reference counts could
> be done away with, although I doubt it would have a significant effect
> on the time spent holding the lock.
>
>
Let's suppose refcount is eliminated. I will then be examining the tag,
flags and buf_id elements of the buffer. Holding the BufMappingLock
prevents the tag changing, but what about the flags?

In addition Tom pointed out that I am not examining the BM_TAG_VALID or
BM_VALID flag bits (I am only checking if tag.blockNum equals
InvalidBlockNumber). My initial thought is to handle !BM_TAG_VALID or
!BM_VALID similarly to InvalidBlockNumber i.e all non buf_id fields set
to NULL.

Mark



Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
The latest iteration.

I have added documentation and updated the expected output so that the
regression tests pass.

In addition, after looking at the various system view names, I decided
that 'pg_cache_dump' does not fit in nicely - so chose an more Pg
suitable name of 'pg_buffercache'. Some renaming of the backend
functions happened too. Finally, since I was saving blocknum, it went
into the view as well.

Hopefully I am dealing with invalid buffer tags sensibly now. The
per-buffer spin lock is still being held - altho it is obviously trivial
to remove if not actually required.

regards

Mark

P.s : remembered to use diff -c


Mark Kirkwood wrote:
> Neil Conway wrote:
>
>> Tom Lane wrote:
>>
>>> It'd be possible to dispense with the per-buffer spinlocks so long as
>>> you look only at the tag (and perhaps the TAG_VALID flag bit).  The
>>> tags can't be changing while you hold the BufMappingLock.
>>
>>
>>
>> That's what I had thought at first, but this comment in
>> buf_internals.h dissuaded me: "buf_hdr_lock must be held to examine or
>> change the tag, flags, usage_count, refcount, or wait_backend_id
>> fields." The comment already notes this isn't true if you've got the
>> buffer pinned; it would be worth adding another exception for holding
>> the BufMappingLock, IMHO.
>>
>>> I'm dubious that there's any point in recording information as
>>> transient as the refcounts and dirtybits
>>
>>
>>
>> I think it's worth recording dirty bits -- it provides an indication
>> of the effectiveness of the bgwriter, for example. Reference counts
>> could be done away with, although I doubt it would have a significant
>> effect on the time spent holding the lock.
>>
>>
> Let's suppose refcount is eliminated. I will then be examining the tag,
> flags and buf_id elements of the buffer. Holding the BufMappingLock
> prevents the tag changing, but what about the flags?
>
> In addition Tom pointed out that I am not examining the BM_TAG_VALID or
> BM_VALID flag bits (I am only checking if tag.blockNum equals
> InvalidBlockNumber). My initial thought is to handle !BM_TAG_VALID or
> !BM_VALID similarly to InvalidBlockNumber i.e all non buf_id fields set
> to NULL.
>
> Mark
>
>
>
> ---------------------------(end of broadcast)---------------------------
> TIP 2: you can get off all lists at once with the unregister command
>    (send "unregister YourEmailAddressHere" to majordomo@postgresql.org)



diff -Nacr pgsql.orig/doc/src/sgml/catalogs.sgml pgsql/doc/src/sgml/catalogs.sgml
*** pgsql.orig/doc/src/sgml/catalogs.sgml    Mon Mar  7 12:20:17 2005
--- pgsql/doc/src/sgml/catalogs.sgml    Tue Mar  8 12:03:50 2005
***************
*** 3875,3880 ****
--- 3875,3885 ----

      <tbody>
       <row>
+       <entry><link linkend="view-pg-buffercache"><structname>pg_buffercache</structname></link></entry>
+       <entry>shared buffer cache</entry>
+      </row>
+
+      <row>
        <entry><link linkend="view-pg-indexes"><structname>pg_indexes</structname></link></entry>
        <entry>indexes</entry>
       </row>
***************
*** 3917,3922 ****
--- 3922,4021 ----
      </tbody>
     </tgroup>
    </table>
+  </sect1>
+
+  <sect1 id="view-pg-buffercache">
+   <title><structname>pg_buffercache</structname></title>
+
+   <indexterm zone="view-pg-buffercache">
+    <primary>pg_buffercache</primary>
+   </indexterm>
+   <para>
+    The view <structname>pg_buffercache</structname> provides access to
+    some information from the shared buffer cache.
+   </para>
+
+   <para>
+    There is one row for each buffer in the shared cache. Unused buffers are
+    shown with all fields null except <structfield>bufferid</structfield>.
+    Because the cache is shared by all the databases, there are pages from
+    relations not belonging to the current database.
+   </para>
+
+
+   <table>
+    <title><structname>pg_buffercache</structname> Columns</title>
+
+    <tgroup cols=4>
+     <thead>
+      <row>
+       <entry>Name</entry>
+       <entry>Type</entry>
+       <entry>References</entry>
+       <entry>Description</entry>
+      </row>
+     </thead>
+     <tbody>
+      <row>
+       <entry>bufferid</entry>
+       <entry><type>integer</type></entry>
+       <entry></entry>
+       <entry>
+        The buffer number. This is numbered 1 to <varname>shared_buffers</varname>.
+       </entry>
+      </row>
+      <row>
+       <entry>relfilenode</entry>
+       <entry><type>oid</type></entry>
+       <entry><literal><link
linkend="catalog-pg-class"><structname>pg_class</structname></link>.relfilenode</literal></entry>
+       <entry>
+       The on-disk file for the relation that this page came from.
+       </entry>
+      </row>
+      <row>
+       <entry>reltablespace</entry>
+       <entry><type>oid</type></entry>
+       <entry>
+       <literal><link linkend="catalog-pg-tablespace"><structname>pg_tablespace</structname></link>.oid</literal>
+       </entry>
+       <entry>Tablespace the corresponding relation is in.</entry>
+      </row>
+      <row>
+       <entry>reldatabase</entry>
+       <entry><type>oid</type></entry>
+       <entry><literal><link
linkend="catalog-pg-database"><structname>pg_database</structname></link>.oid</literal></entry>
+       <entry>
+        Database the corresponding relation belongs to, or zero if the
+        relation is a globally-shared table</entry>
+      </row>
+      <row>
+       <entry>relblocknumber</entry>
+       <entry><type>numeric</type></entry>
+       <entry></entry>
+       <entry>
+        The page offset within the relation that this buffer page is for.
+       </entry>
+      </row>
+      <row>
+       <entry>isdirty</entry>
+       <entry><type>bool</type></entry>
+       <entry></entry>
+       <entry>True if the buffer is dirty.</entry>
+      </row>
+     </tbody>
+    </tgroup>
+
+   </table>
+
+   <para>
+    When the <structname>pg_buffercache</structname> view is accessed,
+    internal buffer manager locks are taken, and a copy of the buffer cache data
+    is made for the view to display. This ensures that the view produces a
+    consistent set of results, while not blocking normal buffer activity longer
+    than necessary.  Nonetheless there could be some impact on database
+    performance if this view is read often.
+   </para>
+
   </sect1>

   <sect1 id="view-pg-indexes">
diff -Nacr pgsql.orig/src/backend/catalog/system_views.sql pgsql/src/backend/catalog/system_views.sql
*** pgsql.orig/src/backend/catalog/system_views.sql    Fri Mar  4 14:23:09 2005
--- pgsql/src/backend/catalog/system_views.sql    Tue Mar  8 11:45:39 2005
***************
*** 277,279 ****
--- 277,285 ----
      DO INSTEAD NOTHING;

  GRANT SELECT, UPDATE ON pg_settings TO PUBLIC;
+
+ CREATE VIEW pg_buffercache AS
+     SELECT P.* FROM pg_buffercache_pages() AS P
+     (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+      relblocknumber numeric(10), isdirty bool);
+
diff -Nacr pgsql.orig/src/backend/utils/adt/Makefile pgsql/src/backend/utils/adt/Makefile
*** pgsql.orig/src/backend/utils/adt/Makefile    Mon Mar  7 10:16:24 2005
--- pgsql/src/backend/utils/adt/Makefile    Tue Mar  8 11:44:10 2005
***************
*** 24,30 ****
      tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \
      network.o mac.o inet_net_ntop.o inet_net_pton.o \
      ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \
!     ascii.o quote.o pgstatfuncs.o encode.o

  like.o: like.c like_match.c

--- 24,30 ----
      tid.o timestamp.o varbit.o varchar.o varlena.o version.o xid.o \
      network.o mac.o inet_net_ntop.o inet_net_pton.o \
      ri_triggers.o pg_lzcompress.o pg_locale.o formatting.o \
!     ascii.o quote.o pgstatfuncs.o encode.o buffercache.o

  like.o: like.c like_match.c

diff -Nacr pgsql.orig/src/backend/utils/adt/buffercache.c pgsql/src/backend/utils/adt/buffercache.c
*** pgsql.orig/src/backend/utils/adt/buffercache.c    Thu Jan  1 12:00:00 1970
--- pgsql/src/backend/utils/adt/buffercache.c    Tue Mar  8 11:48:56 2005
***************
*** 0 ****
--- 1,237 ----
+ /*-------------------------------------------------------------------------
+  *
+  * buffercache.c
+  *    display some contents of the buffer cache
+  *
+  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
+  * Portions Copyright (c) 1994, Regents of the University of California
+  *
+  *
+  * IDENTIFICATION
+  *      $PostgreSQL$
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ #include "funcapi.h"
+ #include "catalog/pg_type.h"
+ #include "storage/buf_internals.h"
+ #include "storage/bufmgr.h"
+ #include "utils/relcache.h"
+ #include "utils/builtins.h"
+
+
+ #define NUM_BUFFERCACHE_PAGES_ELEM    6
+
+
+ /*
+  * Record structure holding the to be exposed cache data.
+  */
+ typedef struct
+ {
+     uint32        bufferid;
+     Oid            relfilenode;
+     Oid            reltablespace;
+     Oid            reldatabase;
+     BlockNumber    blocknum;
+     bool        isvalid;
+     bool        isdirty;
+
+ } BufferCachePagesRec;
+
+
+ /*
+  * Function context for data persisting over repeated calls.
+  */
+ typedef struct
+ {
+     AttInMetadata    *attinmeta;
+     BufferCachePagesRec    *record;
+     char            *values[NUM_BUFFERCACHE_PAGES_ELEM];
+ } BufferCachePagesContext;
+
+
+ /*
+  * Function returning data from the shared buffer cache - buffer number,
+  * relation node/tablespace/database/blocknum and dirty indicator.
+  */
+ Datum
+ pg_buffercache_pages(PG_FUNCTION_ARGS)
+ {
+     FuncCallContext        *funcctx;
+     Datum                result;
+     MemoryContext        oldcontext;
+     BufferCachePagesContext    *fctx;        /* User function context. */
+     TupleDesc            tupledesc;
+     HeapTuple            tuple;
+
+     if (SRF_IS_FIRSTCALL())
+     {
+         RelFileNode    rnode;
+         uint32        i;
+         BufferDesc    *bufHdr;
+
+
+         funcctx = SRF_FIRSTCALL_INIT();
+
+         /* Switch context when allocating stuff to be used in later calls */
+         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+         /* construct a tuple to return */
+         tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+                                     INT4OID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
+                                     OIDOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
+                                     OIDOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
+                                     OIDOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber",
+                                     NUMERICOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 6, "isdirty",
+                                     BOOLOID, -1, 0);
+
+         /* Generate attribute metadata needed later to produce tuples */
+         funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc);
+
+         /*
+          * Create a function context for cross-call persistence
+          * and initialize the buffer counters.
+          */
+         fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));
+         funcctx->max_calls = NBuffers;
+         funcctx->user_fctx = fctx;
+
+
+         /* Allocate NBuffers worth of BufferCachePagesRec records. */
+         fctx->record = (BufferCachePagesRec *) palloc(sizeof(BufferCachePagesRec) * NBuffers);
+
+         /* allocate the strings for tuple formation */
+         fctx->values[0] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[1] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[2] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[3] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[4] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[5] = (char *) palloc(10);
+
+
+         /* Return to original context when allocating transient memory */
+         MemoryContextSwitchTo(oldcontext);
+
+
+         /*
+          * Lock Buffer map and scan though all the buffers, saving the
+          * relevant fields in the fctx->record structure.
+          */
+         LWLockAcquire(BufMappingLock, LW_SHARED);
+
+         for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
+         {
+             /* Lock each buffer header before inspecting. */
+             LockBufHdr(bufHdr);
+
+             rnode = bufHdr->tag.rnode;
+
+             fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
+             fctx->record[i].relfilenode = rnode.relNode;
+             fctx->record[i].reltablespace = rnode.spcNode;
+             fctx->record[i].reldatabase = rnode.dbNode;
+             fctx->record[i].blocknum = bufHdr->tag.blockNum;
+
+             if ( bufHdr->flags & BM_DIRTY)
+             {
+                 fctx->record[i].isdirty = true;
+             }
+             else
+             {
+                 fctx->record[i].isdirty = false;
+             }
+
+             /* Note if the buffer is valid, and has storage created */
+             if ( (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
+             {
+                 fctx->record[i].isvalid = true;
+             }
+             else
+             {
+                 fctx->record[i].isvalid = false;
+             }
+
+             UnlockBufHdr(bufHdr);
+
+         }
+
+         /* Release Buffer map. */
+         LWLockRelease(BufMappingLock);
+     }
+
+     funcctx = SRF_PERCALL_SETUP();
+
+     /* Get the saved state */
+     fctx = funcctx->user_fctx;
+
+
+     if (funcctx->call_cntr < funcctx->max_calls)
+     {
+         uint32         i = funcctx->call_cntr;
+         char        *values[NUM_BUFFERCACHE_PAGES_ELEM];
+         int            j;
+
+         /*
+          * Use a temporary values array, initially pointing to
+          * fctx->values, so it can be reassigned w/o losing the storage
+          * for subsequent calls.
+          */
+         for (j = 0; j < NUM_BUFFERCACHE_PAGES_ELEM; j++)
+         {
+             values[j] = fctx->values[j];
+         }
+
+
+         /*
+          * Set all fields except the bufferid to null if the buffer is
+          * unused or not valid.
+          */
+         if (fctx->record[i].blocknum == InvalidBlockNumber ||
+             fctx->record[i].isvalid == false )
+         {
+
+             sprintf(values[0], "%u", fctx->record[i].bufferid);
+             values[1] = NULL;
+             values[2] = NULL;
+             values[3] = NULL;
+             values[4] = NULL;
+             values[5] = NULL;
+
+         }
+         else
+         {
+
+             sprintf(values[0], "%u", fctx->record[i].bufferid);
+             sprintf(values[1], "%u", fctx->record[i].relfilenode);
+             sprintf(values[2], "%u", fctx->record[i].reltablespace);
+             sprintf(values[3], "%u", fctx->record[i].reldatabase);
+             sprintf(values[4], "%u", fctx->record[i].blocknum);
+             if (fctx->record[i].isdirty)
+             {
+                 strcpy(values[5], "true");
+             }
+             else
+             {
+                 strcpy(values[5], "false");
+             }
+
+         }
+
+
+         /* Build and return the tuple. */
+         tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+         result = HeapTupleGetDatum(tuple);
+
+
+         SRF_RETURN_NEXT(funcctx, result);
+     }
+     else
+         SRF_RETURN_DONE(funcctx);
+ }
+
diff -Nacr pgsql.orig/src/include/catalog/pg_proc.h pgsql/src/include/catalog/pg_proc.h
*** pgsql.orig/src/include/catalog/pg_proc.h    Fri Mar  4 14:24:20 2005
--- pgsql/src/include/catalog/pg_proc.h    Tue Mar  8 11:46:06 2005
***************
*** 3615,3620 ****
--- 3615,3622 ----
  DATA(insert OID = 2558 ( int4                   PGNSP PGUID 12 f f t f i 1  23 "16" _null_    bool_int4 - _null_ ));
  DESCR("convert boolean to int4");

+ /* builtin for cache internals view */
+ DATA(insert OID = 2510 ( pg_buffercache_pages PGNSP PGUID 12 f f t t v 0 2249 "" _null_ pg_buffercache_pages - _null_
));

  /*
   * Symbolic values for provolatile column: these indicate whether the result
diff -Nacr pgsql.orig/src/include/utils/builtins.h pgsql/src/include/utils/builtins.h
*** pgsql.orig/src/include/utils/builtins.h    Fri Mar  4 14:24:31 2005
--- pgsql/src/include/utils/builtins.h    Tue Mar  8 11:46:26 2005
***************
*** 823,826 ****
--- 823,829 ----
  /* catalog/pg_conversion.c */
  extern Datum pg_convert_using(PG_FUNCTION_ARGS);

+ /* cache dump */
+ extern Datum pg_buffercache_pages(PG_FUNCTION_ARGS);
+
  #endif   /* BUILTINS_H */
diff -Nacr pgsql.orig/src/test/regress/expected/rules.out pgsql/src/test/regress/expected/rules.out
*** pgsql.orig/src/test/regress/expected/rules.out    Tue Mar  8 13:00:00 2005
--- pgsql/src/test/regress/expected/rules.out    Tue Mar  8 12:59:24 2005
***************
*** 1275,1280 ****
--- 1275,1281 ----
           viewname         |




definition



                                                                                                        

--------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
   iexit                    | SELECT ih.name, ih.thepath, interpt_pp(ih.thepath, r.thepath) AS exit FROM ihighway ih,
rampr WHERE (ih.thepath ## r.thepath); 
+  pg_buffercache           | SELECT p.bufferid, p.relfilenode, p.reltablespace, p.reldatabase, p.relblocknumber,
p.isdirtyFROM pg_buffercache_pages() p(bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
relblocknumbernumeric(10,0),isdirty boolean); 
   pg_indexes               | SELECT n.nspname AS schemaname, c.relname AS tablename, i.relname AS indexname, t.spcname
AS"tablespace", pg_get_indexdef(i.oid) AS indexdef FROM ((((pg_index x JOIN pg_class c ON ((c.oid = x.indrelid))) JOIN
pg_classi ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) LEFT JOIN pg_tablespace
tON ((t.oid = i.reltablespace))) WHERE ((c.relkind = 'r'::"char") AND (i.relkind = 'i'::"char")); 
   pg_locks                 | SELECT l.relation, l."database", l."transaction", l.pid, l."mode", l.granted FROM
pg_lock_status()l(relation oid, "database" oid, "transaction" xid, pid integer, "mode" text, granted boolean); 
   pg_rules                 | SELECT n.nspname AS schemaname, c.relname AS tablename, r.rulename, pg_get_ruledef(r.oid)
ASdefinition FROM ((pg_rewrite r JOIN pg_class c ON ((c.oid = r.ev_class))) LEFT JOIN pg_namespace n ON ((n.oid =
c.relnamespace)))WHERE (r.rulename <> '_RETURN'::name); 
***************
*** 1314,1320 ****
   shoelace_obsolete        | SELECT shoelace.sl_name, shoelace.sl_avail, shoelace.sl_color, shoelace.sl_len,
shoelace.sl_unit,shoelace.sl_len_cm FROM shoelace WHERE (NOT (EXISTS (SELECT shoe.shoename FROM shoe WHERE
(shoe.slcolor= shoelace.sl_color)))); 
   street                   | SELECT r.name, r.thepath, c.cname FROM ONLY road r, real_city c WHERE (c.outline ##
r.thepath);
   toyemp                   | SELECT emp.name, emp.age, emp."location", (12 * emp.salary) AS annualsal FROM emp;
! (40 rows)

  SELECT tablename, rulename, definition FROM pg_rules
      ORDER BY tablename, rulename;
--- 1315,1321 ----
   shoelace_obsolete        | SELECT shoelace.sl_name, shoelace.sl_avail, shoelace.sl_color, shoelace.sl_len,
shoelace.sl_unit,shoelace.sl_len_cm FROM shoelace WHERE (NOT (EXISTS (SELECT shoe.shoename FROM shoe WHERE
(shoe.slcolor= shoelace.sl_color)))); 
   street                   | SELECT r.name, r.thepath, c.cname FROM ONLY road r, real_city c WHERE (c.outline ##
r.thepath);
   toyemp                   | SELECT emp.name, emp.age, emp."location", (12 * emp.salary) AS annualsal FROM emp;
! (41 rows)

  SELECT tablename, rulename, definition FROM pg_rules
      ORDER BY tablename, rulename;



Re: Display Pg buffer cache (WIP)

From
Neil Conway
Date:
Mark Kirkwood wrote:
> +         tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false);
> +         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
> +                                     INT4OID, -1, 0);
> +         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
> +                                     OIDOID, -1, 0);
> +         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
> +                                     OIDOID, -1, 0);
> +         TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
> +                                     OIDOID, -1, 0);
> +         TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber",
> +                                     NUMERICOID, -1, 0);

I think this should be an int4, not numeric.

Otherwise, looks good to me. Barring any objections, I'll apply this
with a few stylistic tweaks and the numeric -> int4 change tomorrow.

-Neil

Re: Display Pg buffer cache (WIP)

From
Tom Lane
Date:
Neil Conway <neilc@samurai.com> writes:
> Mark Kirkwood wrote:
>> +         TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber",
>> +                                     NUMERICOID, -1, 0);

> I think this should be an int4, not numeric.

needs spell check too ;-)

More generally, BlockNumber is unsigned and so int4 is not at all an
accurate conversion.  Perhaps OID would be a good choice even though
it's horribly wrong on one level.

> Otherwise, looks good to me. Barring any objections, I'll apply this
> with a few stylistic tweaks and the numeric -> int4 change tomorrow.

I would rather see this as a contrib module.  There has been *zero*
consensus that we need this in core, nor any discussion about whether
it might be a security hole.

            regards, tom lane

Re: Display Pg buffer cache (WIP)

From
Neil Conway
Date:
Tom Lane wrote:
> Perhaps OID would be a good choice even though it's horribly wrong on
> one level.

Either that or add unsigned numeric types to PG :-)

> I would rather see this as a contrib module.

I don't really have an opinion either way. Does anyone else have
thoughts on this?

> There has been *zero* consensus that we need this in core, nor any
> discussion about whether it might be a security hole.

ISTM if it can only be invoked by the superuser, it should be safe enough.

-Neil

Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
Neil Conway wrote:
> Mark Kirkwood wrote:

>> +         TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber",
>> +                                     NUMERICOID, -1, 0);
>
> I think this should be an int4, not numeric.

I was looking for an UINT4OID :-), but numeric seemed the best
compromise (safer than overflowing int4). I guess I could add a type
'blocknumber' that is actually the same as 'oid', but it seems liks a
lot of effort for one column.

cheers

Mark


Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
Tom Lane wrote:
> I would rather see this as a contrib module.  There has been *zero*
> consensus that we need this in core, nor any discussion about whether
> it might be a security hole.

Hmmm, I guess my motivation for thinking it might be useful in core was
that it was similar to 'pg_locks' and 'pg_stats', i.e. exposing some
internals in a convenient queryable manner (useful for problem solving).

On the security front, I should have added a 'REVOKE ALL ...FROM PUBLIC'
on the view to at least control access (fortunately easy to add).

cheers

Mark


Re: Display Pg buffer cache (WIP)

From
Tom Lane
Date:
Mark Kirkwood <markir@coretech.co.nz> writes:
> Tom Lane wrote:
>> I would rather see this as a contrib module.  There has been *zero*
>> consensus that we need this in core, nor any discussion about whether
>> it might be a security hole.

> Hmmm, I guess my motivation for thinking it might be useful in core was
> that it was similar to 'pg_locks' and 'pg_stats', i.e. exposing some
> internals in a convenient queryable manner (useful for problem solving).

One reason for making it contrib is that I don't think you've got it
entirely right yet, and there will be several iterations before it
settles down.  In a contrib module that is no problem, in core it's a
forced initdb each time.

            regards, tom lane

Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
Tom Lane wrote:

> One reason for making it contrib is that I don't think you've got it
> entirely right yet, and there will be several iterations before it
> settles down.  In a contrib module that is no problem, in core it's a
> forced initdb each time.

Yeah - certainly less intrusive as a contrib if amendments are required!

Barring a huge groundswell of support for it in core :-) I will resubmit
a patch for it as a contrib module.

cheers

Mark


Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
Mark Kirkwood wrote:
> Tom Lane wrote:
>
>> One reason for making it contrib is that I don't think you've got it
>> entirely right yet, and there will be several iterations before it
>> settles down.  In a contrib module that is no problem, in core it's a
>> forced initdb each time.
>
>
> Yeah - certainly less intrusive as a contrib if amendments are required!
>
> Barring a huge groundswell of support for it in core :-) I will resubmit
> a patch for it as a contrib module.
>

And here it is...

Mark



diff -Nacr pgsql.orig/contrib/Makefile pgsql/contrib/Makefile
*** pgsql.orig/contrib/Makefile    Thu Mar  3 11:29:53 2005
--- pgsql/contrib/Makefile    Wed Mar  9 10:10:41 2005
***************
*** 26,31 ****
--- 26,32 ----
          noupdate    \
          oid2name    \
          pg_autovacuum    \
+         pg_buffercache    \
          pg_dumplo    \
          pg_trgm        \
          pgbench        \
diff -Nacr pgsql.orig/contrib/pg_buffercache/Makefile pgsql/contrib/pg_buffercache/Makefile
*** pgsql.orig/contrib/pg_buffercache/Makefile    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/Makefile    Wed Mar  9 11:42:05 2005
***************
*** 0 ****
--- 1,21 ----
+ # $PostgreSQL$
+
+ MODULE_big = pg_buffercache
+ PG_CPPFLAGS = -I$(libpq_srcdir)
+ OBJS    = pg_buffercache_pages.o
+ SHLIB_LINK = $(libpq)
+
+ DATA_built = pg_buffercache.sql
+ DOCS = README.pg_buffercache
+ REGRESS = pg_buffercache
+
+
+ ifdef USE_PGXS
+ PGXS = $(shell pg_config --pgxs)
+ include $(PGXS)
+ else
+ subdir = contrib/pg_buffercache
+ top_builddir = ../..
+ include $(top_builddir)/src/Makefile.global
+ include $(top_srcdir)/contrib/contrib-global.mk
+ endif
diff -Nacr pgsql.orig/contrib/pg_buffercache/README.pg_buffercache pgsql/contrib/pg_buffercache/README.pg_buffercache
*** pgsql.orig/contrib/pg_buffercache/README.pg_buffercache    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/README.pg_buffercache    Wed Mar  9 12:26:56 2005
***************
*** 0 ****
--- 1,112 ----
+ Pg_buffercache - Real-time queries on the shared buffer cache.
+ --------------
+
+   This module consists of a C function 'pg_buffercache_pages()' that returns
+   a set of records, plus a view 'pg_buffercache' to wrapper the function.
+
+   The intent is to do for the buffercache what pg_locks does for locks, i.e -
+   ability to examine what is happening at any given time without having to
+   restart or rebuild the server with debugging code added.
+
+   By default public access is REVOKED from both of these, just in case there
+   are security issues lurking.
+
+
+ Installation
+ ------------
+
+   Build and install the main Postgresql source, then this contrib module:
+
+   $ cd contrib/pg_buffercache
+   $ gmake
+   $ gmake install
+
+
+   To register the functions:
+
+   $ psql -d <database> -f pg_buffercache.sql
+
+
+ Notes
+ -----
+
+   The definition of the columns exposed in the view is:
+
+        Column     |  references          | Description
+   ----------------+----------------------+------------------------------------
+    bufferid       |                      | Id, 1->shared_buffers.
+    relfilenode    | pg_class.relfilenode | Refilenode of the relation.
+    reltablespace  | pg_tablespace.oid    | Tablespace oid of the relation.
+    reldatabase    | pg_database.oid      | Database for the relation.
+    relblocknumber |                      | Offset of the page in the relation.
+    isdirty        |                      | Is the page dirty?
+
+
+   There is one row for each buffer in the shared cache. Unused buffers are
+   shown with all fields null except bufferid.
+
+   Because the cache is shared by all the databases, there are pages from
+   relations not belonging to the current database.
+
+   When the pg_buffercache view is accessed, internal buffer manager locks are
+   taken, and a copy of the buffer cache data is made for the view to display.
+   This ensures that the view produces a consistent set of results, while not
+   blocking normal buffer activity longer than necessary.  Nonetheless there
+   could be some impact on database performance if this view is read often.
+
+
+ Sample output
+ -------------
+
+   regression=# \d pg_buffercache;
+        View "public.pg_buffercache"
+        Column     |  Type   | Modifiers
+   ----------------+---------+-----------
+    bufferid       | integer |
+    relfilenode    | oid     |
+    reltablespace  | oid     |
+    reldatabase    | oid     |
+    relblocknumber | numeric |
+    isdirty        | boolean |
+   View definition:
+    SELECT p.bufferid, p.relfilenode, p.reltablespace, p.reldatabase,
+           p.relblocknumber, p.isdirty
+      FROM pg_buffercache_pages() p(bufferid integer, relfilenode oid,
+      reltablespace oid, reldatabase oid, relblocknumber numeric(10,0),
+      isdirty boolean);
+
+   regression=# SELECT c.relname, count(*) AS buffers
+                FROM pg_class c, pg_buffercache b
+                WHERE b.relfilenode = c.relfilenode
+                GROUP BY c.relname
+                ORDER BY 2 DESC LIMIT 10;
+                relname             | buffers
+   ---------------------------------+---------
+    tenk2                           |     345
+    tenk1                           |     141
+    pg_proc                         |      46
+    pg_class                        |      45
+    pg_attribute                    |      43
+    pg_class_relname_nsp_index      |      30
+    pg_proc_proname_args_nsp_index  |      28
+    pg_attribute_relid_attnam_index |      26
+    pg_depend                       |      22
+    pg_depend_reference_index       |      20
+   (10 rows)
+
+   regression=#
+
+
+ Author
+ ------
+
+   * Mark Kirkwood <markir@coretech.co.nz *or* markir@paradise.net.nz>
+
+
+ Help
+ ----
+
+   * Design suggestions : Neil Conway <neilc@samurai.com>
+   * Debugging advice : Tom Lane <tgl@sss.pgh.pa.us>
+
+   Thanks guys!
diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache.sql.in pgsql/contrib/pg_buffercache/pg_buffercache.sql.in
*** pgsql.orig/contrib/pg_buffercache/pg_buffercache.sql.in    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/pg_buffercache.sql.in    Wed Mar  9 11:45:32 2005
***************
*** 0 ****
--- 1,18 ----
+ -- Adjust this setting to control where the objects get created.
+ SET search_path = public;
+
+ -- Register the function.
+ CREATE OR REPLACE FUNCTION pg_buffercache_pages()
+ RETURNS SETOF RECORD
+ AS 'MODULE_PATHNAME', 'pg_buffercache_pages'
+ LANGUAGE 'C';
+
+ -- Create a view for convenient access.
+ CREATE VIEW pg_buffercache AS
+     SELECT P.* FROM pg_buffercache_pages() AS P
+      (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+       relblocknumber numeric(10), isdirty bool);
+
+ -- Don't want these to be available at public.
+ REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC;
+ REVOKE ALL ON pg_buffercache FROM PUBLIC;
diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.c pgsql/contrib/pg_buffercache/pg_buffercache_pages.c
*** pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.c    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/pg_buffercache_pages.c    Wed Mar  9 13:18:30 2005
***************
*** 0 ****
--- 1,239 ----
+ /*-------------------------------------------------------------------------
+  *
+  * pg_buffercache_pages.c
+  *    display some contents of the buffer cache
+  *
+  *      $PostgreSQL$
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ #include "libpq-fe.h"
+ #include "funcapi.h"
+ #include "catalog/pg_type.h"
+ #include "storage/buf_internals.h"
+ #include "storage/bufmgr.h"
+ #include "utils/relcache.h"
+ #include "pg_buffercache_pages.h"
+
+
+ #define NUM_BUFFERCACHE_PAGES_ELEM    6
+
+
+ /*
+  * Record structure holding the to be exposed cache data.
+  */
+ typedef struct
+ {
+
+     uint32        bufferid;
+     Oid            relfilenode;
+     Oid            reltablespace;
+     Oid            reldatabase;
+     BlockNumber    blocknum;
+     bool        isvalid;
+     bool        isdirty;
+
+ } BufferCachePagesRec;
+
+
+ /*
+  * Function context for data persisting over repeated calls.
+  */
+ typedef struct
+ {
+
+     AttInMetadata        *attinmeta;
+     BufferCachePagesRec    *record;
+     char                *values[NUM_BUFFERCACHE_PAGES_ELEM];
+
+ } BufferCachePagesContext;
+
+
+ /*
+  * Function returning data from the shared buffer cache - buffer number,
+  * relation node/tablespace/database/blocknum and dirty indicator.
+  */
+ PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+ Datum
+ pg_buffercache_pages(PG_FUNCTION_ARGS)
+ {
+
+     FuncCallContext        *funcctx;
+     Datum                result;
+     MemoryContext        oldcontext;
+     BufferCachePagesContext    *fctx;        /* User function context. */
+     TupleDesc            tupledesc;
+     HeapTuple            tuple;
+
+     if (SRF_IS_FIRSTCALL())
+     {
+         RelFileNode    rnode;
+         uint32        i;
+         BufferDesc    *bufHdr;
+
+
+         funcctx = SRF_FIRSTCALL_INIT();
+
+         /* Switch context when allocating stuff to be used in later calls */
+         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+         /* Construct a tuple to return. */
+         tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+                                     INT4OID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
+                                     OIDOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
+                                     OIDOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
+                                     OIDOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber",
+                                     NUMERICOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 6, "isdirty",
+                                     BOOLOID, -1, 0);
+
+         /* Generate attribute metadata needed later to produce tuples */
+         funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc);
+
+         /*
+          * Create a function context for cross-call persistence
+          * and initialize the buffer counters.
+          */
+         fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));
+         funcctx->max_calls = NBuffers;
+         funcctx->user_fctx = fctx;
+
+
+         /* Allocate NBuffers worth of BufferCachePagesRec records. */
+         fctx->record = (BufferCachePagesRec *) palloc(sizeof(BufferCachePagesRec) * NBuffers);
+
+         /* allocate the strings for tuple formation */
+         fctx->values[0] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[1] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[2] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[3] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[4] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[5] = (char *) palloc(2);
+
+
+         /* Return to original context when allocating transient memory */
+         MemoryContextSwitchTo(oldcontext);
+
+
+         /*
+          * Lock Buffer map and scan though all the buffers, saving the
+          * relevant fields in the fctx->record structure.
+          */
+         LWLockAcquire(BufMappingLock, LW_SHARED);
+
+         for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
+         {
+             /* Lock each buffer header before inspecting. */
+             LockBufHdr(bufHdr);
+
+             rnode = bufHdr->tag.rnode;
+
+             fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
+             fctx->record[i].relfilenode = rnode.relNode;
+             fctx->record[i].reltablespace = rnode.spcNode;
+             fctx->record[i].reldatabase = rnode.dbNode;
+             fctx->record[i].blocknum = bufHdr->tag.blockNum;
+
+             if ( bufHdr->flags & BM_DIRTY)
+             {
+                 fctx->record[i].isdirty = true;
+             }
+             else
+             {
+                 fctx->record[i].isdirty = false;
+             }
+
+             /* Note if the buffer is valid, and has storage created */
+             if ( (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
+             {
+                 fctx->record[i].isvalid = true;
+             }
+             else
+             {
+                 fctx->record[i].isvalid = false;
+             }
+
+             UnlockBufHdr(bufHdr);
+
+         }
+
+         /* Release Buffer map. */
+         LWLockRelease(BufMappingLock);
+     }
+
+     funcctx = SRF_PERCALL_SETUP();
+
+     /* Get the saved state */
+     fctx = funcctx->user_fctx;
+
+
+     if (funcctx->call_cntr < funcctx->max_calls)
+     {
+         uint32         i = funcctx->call_cntr;
+         char        *values[NUM_BUFFERCACHE_PAGES_ELEM];
+         int            j;
+
+         /*
+          * Use a temporary values array, initially pointing to
+          * fctx->values, so it can be reassigned w/o losing the storage
+          * for subsequent calls.
+          */
+         for (j = 0; j < NUM_BUFFERCACHE_PAGES_ELEM; j++)
+         {
+             values[j] = fctx->values[j];
+         }
+
+
+         /*
+          * Set all fields except the bufferid to null if the buffer is
+          * unused or not valid.
+          */
+         if (fctx->record[i].blocknum == InvalidBlockNumber ||
+             fctx->record[i].isvalid == false )
+         {
+
+             sprintf(values[0], "%u", fctx->record[i].bufferid);
+             values[1] = NULL;
+             values[2] = NULL;
+             values[3] = NULL;
+             values[4] = NULL;
+             values[5] = NULL;
+
+         }
+         else
+         {
+
+             sprintf(values[0], "%u", fctx->record[i].bufferid);
+             sprintf(values[1], "%u", fctx->record[i].relfilenode);
+             sprintf(values[2], "%u", fctx->record[i].reltablespace);
+             sprintf(values[3], "%u", fctx->record[i].reldatabase);
+             sprintf(values[4], "%u", fctx->record[i].blocknum);
+             if (fctx->record[i].isdirty)
+             {
+                 strcpy(values[5], "t");
+             }
+             else
+             {
+                 strcpy(values[5], "f");
+             }
+
+         }
+
+
+         /* Build and return the tuple. */
+         tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+         result = HeapTupleGetDatum(tuple);
+
+
+         SRF_RETURN_NEXT(funcctx, result);
+     }
+     else
+         SRF_RETURN_DONE(funcctx);
+
+ }
+
diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.h pgsql/contrib/pg_buffercache/pg_buffercache_pages.h
*** pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.h    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/pg_buffercache_pages.h    Wed Mar  9 11:42:20 2005
***************
*** 0 ****
--- 1,18 ----
+ /*-------------------------------------------------------------------------
+  *
+  * pg_buffercache_pages.h
+  *    Prototypes for pg_buffercache_pages
+  *
+  *
+  *    $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */
+
+
+ #ifndef PG_BUFFERCACHE_PAGES_H
+ #define PG_BUFFERCACHE_PAGES_H
+
+ extern Datum pg_buffercache_pages(PG_FUNCTION_ARGS);
+
+ #endif  /* PG_BUFFERCACHE_PAGES_H */



Re: Display Pg buffer cache (WIP)

From
Mark Kirkwood
Date:
A couple of minor amendments here:

- remove link to libpq (from cut+past of dblnk's Makefile)
- add comment for pg_buffercache module in contrib/README
- change my listed email to this one (I have resigned....)

regards

Mark
diff -Nacr pgsql.orig/contrib/Makefile pgsql/contrib/Makefile
*** pgsql.orig/contrib/Makefile    Thu Mar  3 11:29:53 2005
--- pgsql/contrib/Makefile    Wed Mar  9 10:10:41 2005
***************
*** 26,31 ****
--- 26,32 ----
          noupdate    \
          oid2name    \
          pg_autovacuum    \
+         pg_buffercache    \
          pg_dumplo    \
          pg_trgm        \
          pgbench        \
diff -Nacr pgsql.orig/contrib/README pgsql/contrib/README
*** pgsql.orig/contrib/README    Thu Mar  3 11:29:53 2005
--- pgsql/contrib/README    Fri Mar 11 12:11:18 2005
***************
*** 136,141 ****
--- 136,145 ----
      Automatically performs vacuum
      by Matthew T. O'Connor <matthew@zeut.net>

+ pg_buffercace -
+     Real time queries on the shared buffer cache.
+     by Mark Kirkwood <markir@paradise.net.nz>
+
  pg_dumplo -
      Dump large objects
      by Karel Zak <zakkr@zf.jcu.cz>
diff -Nacr pgsql.orig/contrib/pg_buffercache/Makefile pgsql/contrib/pg_buffercache/Makefile
*** pgsql.orig/contrib/pg_buffercache/Makefile    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/Makefile    Thu Mar 10 08:19:20 2005
***************
*** 0 ****
--- 1,19 ----
+ # $PostgreSQL$
+
+ MODULE_big = pg_buffercache
+ OBJS    = pg_buffercache_pages.o
+
+ DATA_built = pg_buffercache.sql
+ DOCS = README.pg_buffercache
+ REGRESS = pg_buffercache
+
+
+ ifdef USE_PGXS
+ PGXS = $(shell pg_config --pgxs)
+ include $(PGXS)
+ else
+ subdir = contrib/pg_buffercache
+ top_builddir = ../..
+ include $(top_builddir)/src/Makefile.global
+ include $(top_srcdir)/contrib/contrib-global.mk
+ endif
diff -Nacr pgsql.orig/contrib/pg_buffercache/README.pg_buffercache pgsql/contrib/pg_buffercache/README.pg_buffercache
*** pgsql.orig/contrib/pg_buffercache/README.pg_buffercache    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/README.pg_buffercache    Fri Mar 11 12:11:03 2005
***************
*** 0 ****
--- 1,112 ----
+ Pg_buffercache - Real time queries on the shared buffer cache.
+ --------------
+
+   This module consists of a C function 'pg_buffercache_pages()' that returns
+   a set of records, plus a view 'pg_buffercache' to wrapper the function.
+
+   The intent is to do for the buffercache what pg_locks does for locks, i.e -
+   ability to examine what is happening at any given time without having to
+   restart or rebuild the server with debugging code added.
+
+   By default public access is REVOKED from both of these, just in case there
+   are security issues lurking.
+
+
+ Installation
+ ------------
+
+   Build and install the main Postgresql source, then this contrib module:
+
+   $ cd contrib/pg_buffercache
+   $ gmake
+   $ gmake install
+
+
+   To register the functions:
+
+   $ psql -d <database> -f pg_buffercache.sql
+
+
+ Notes
+ -----
+
+   The definition of the columns exposed in the view is:
+
+        Column     |  references          | Description
+   ----------------+----------------------+------------------------------------
+    bufferid       |                      | Id, 1->shared_buffers.
+    relfilenode    | pg_class.relfilenode | Refilenode of the relation.
+    reltablespace  | pg_tablespace.oid    | Tablespace oid of the relation.
+    reldatabase    | pg_database.oid      | Database for the relation.
+    relblocknumber |                      | Offset of the page in the relation.
+    isdirty        |                      | Is the page dirty?
+
+
+   There is one row for each buffer in the shared cache. Unused buffers are
+   shown with all fields null except bufferid.
+
+   Because the cache is shared by all the databases, there are pages from
+   relations not belonging to the current database.
+
+   When the pg_buffercache view is accessed, internal buffer manager locks are
+   taken, and a copy of the buffer cache data is made for the view to display.
+   This ensures that the view produces a consistent set of results, while not
+   blocking normal buffer activity longer than necessary.  Nonetheless there
+   could be some impact on database performance if this view is read often.
+
+
+ Sample output
+ -------------
+
+   regression=# \d pg_buffercache;
+        View "public.pg_buffercache"
+        Column     |  Type   | Modifiers
+   ----------------+---------+-----------
+    bufferid       | integer |
+    relfilenode    | oid     |
+    reltablespace  | oid     |
+    reldatabase    | oid     |
+    relblocknumber | numeric |
+    isdirty        | boolean |
+   View definition:
+    SELECT p.bufferid, p.relfilenode, p.reltablespace, p.reldatabase,
+           p.relblocknumber, p.isdirty
+      FROM pg_buffercache_pages() p(bufferid integer, relfilenode oid,
+      reltablespace oid, reldatabase oid, relblocknumber numeric(10,0),
+      isdirty boolean);
+
+   regression=# SELECT c.relname, count(*) AS buffers
+                FROM pg_class c, pg_buffercache b
+                WHERE b.relfilenode = c.relfilenode
+                GROUP BY c.relname
+                ORDER BY 2 DESC LIMIT 10;
+                relname             | buffers
+   ---------------------------------+---------
+    tenk2                           |     345
+    tenk1                           |     141
+    pg_proc                         |      46
+    pg_class                        |      45
+    pg_attribute                    |      43
+    pg_class_relname_nsp_index      |      30
+    pg_proc_proname_args_nsp_index  |      28
+    pg_attribute_relid_attnam_index |      26
+    pg_depend                       |      22
+    pg_depend_reference_index       |      20
+   (10 rows)
+
+   regression=#
+
+
+ Author
+ ------
+
+   * Mark Kirkwood <markir@paradise.net.nz>
+
+
+ Help
+ ----
+
+   * Design suggestions : Neil Conway <neilc@samurai.com>
+   * Debugging advice : Tom Lane <tgl@sss.pgh.pa.us>
+
+   Thanks guys!
diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache.sql.in pgsql/contrib/pg_buffercache/pg_buffercache.sql.in
*** pgsql.orig/contrib/pg_buffercache/pg_buffercache.sql.in    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/pg_buffercache.sql.in    Wed Mar  9 11:45:32 2005
***************
*** 0 ****
--- 1,18 ----
+ -- Adjust this setting to control where the objects get created.
+ SET search_path = public;
+
+ -- Register the function.
+ CREATE OR REPLACE FUNCTION pg_buffercache_pages()
+ RETURNS SETOF RECORD
+ AS 'MODULE_PATHNAME', 'pg_buffercache_pages'
+ LANGUAGE 'C';
+
+ -- Create a view for convenient access.
+ CREATE VIEW pg_buffercache AS
+     SELECT P.* FROM pg_buffercache_pages() AS P
+      (bufferid integer, relfilenode oid, reltablespace oid, reldatabase oid,
+       relblocknumber numeric(10), isdirty bool);
+
+ -- Don't want these to be available at public.
+ REVOKE ALL ON FUNCTION pg_buffercache_pages() FROM PUBLIC;
+ REVOKE ALL ON pg_buffercache FROM PUBLIC;
diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.c pgsql/contrib/pg_buffercache/pg_buffercache_pages.c
*** pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.c    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/pg_buffercache_pages.c    Thu Mar 10 08:19:13 2005
***************
*** 0 ****
--- 1,238 ----
+ /*-------------------------------------------------------------------------
+  *
+  * pg_buffercache_pages.c
+  *    display some contents of the buffer cache
+  *
+  *      $PostgreSQL$
+  *-------------------------------------------------------------------------
+  */
+ #include "postgres.h"
+ #include "funcapi.h"
+ #include "catalog/pg_type.h"
+ #include "storage/buf_internals.h"
+ #include "storage/bufmgr.h"
+ #include "utils/relcache.h"
+ #include "pg_buffercache_pages.h"
+
+
+ #define NUM_BUFFERCACHE_PAGES_ELEM    6
+
+
+ /*
+  * Record structure holding the to be exposed cache data.
+  */
+ typedef struct
+ {
+
+     uint32        bufferid;
+     Oid            relfilenode;
+     Oid            reltablespace;
+     Oid            reldatabase;
+     BlockNumber    blocknum;
+     bool        isvalid;
+     bool        isdirty;
+
+ } BufferCachePagesRec;
+
+
+ /*
+  * Function context for data persisting over repeated calls.
+  */
+ typedef struct
+ {
+
+     AttInMetadata        *attinmeta;
+     BufferCachePagesRec    *record;
+     char                *values[NUM_BUFFERCACHE_PAGES_ELEM];
+
+ } BufferCachePagesContext;
+
+
+ /*
+  * Function returning data from the shared buffer cache - buffer number,
+  * relation node/tablespace/database/blocknum and dirty indicator.
+  */
+ PG_FUNCTION_INFO_V1(pg_buffercache_pages);
+ Datum
+ pg_buffercache_pages(PG_FUNCTION_ARGS)
+ {
+
+     FuncCallContext        *funcctx;
+     Datum                result;
+     MemoryContext        oldcontext;
+     BufferCachePagesContext    *fctx;        /* User function context. */
+     TupleDesc            tupledesc;
+     HeapTuple            tuple;
+
+     if (SRF_IS_FIRSTCALL())
+     {
+         RelFileNode    rnode;
+         uint32        i;
+         BufferDesc    *bufHdr;
+
+
+         funcctx = SRF_FIRSTCALL_INIT();
+
+         /* Switch context when allocating stuff to be used in later calls */
+         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
+
+         /* Construct a tuple to return. */
+         tupledesc = CreateTemplateTupleDesc(NUM_BUFFERCACHE_PAGES_ELEM, false);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 1, "bufferid",
+                                     INT4OID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 2, "relfilenode",
+                                     OIDOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 3, "reltablespace",
+                                     OIDOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 4, "reldatabase",
+                                     OIDOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 5, "relblockbumber",
+                                     NUMERICOID, -1, 0);
+         TupleDescInitEntry(tupledesc, (AttrNumber) 6, "isdirty",
+                                     BOOLOID, -1, 0);
+
+         /* Generate attribute metadata needed later to produce tuples */
+         funcctx->attinmeta = TupleDescGetAttInMetadata(tupledesc);
+
+         /*
+          * Create a function context for cross-call persistence
+          * and initialize the buffer counters.
+          */
+         fctx = (BufferCachePagesContext *) palloc(sizeof(BufferCachePagesContext));
+         funcctx->max_calls = NBuffers;
+         funcctx->user_fctx = fctx;
+
+
+         /* Allocate NBuffers worth of BufferCachePagesRec records. */
+         fctx->record = (BufferCachePagesRec *) palloc(sizeof(BufferCachePagesRec) * NBuffers);
+
+         /* allocate the strings for tuple formation */
+         fctx->values[0] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[1] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[2] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[3] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[4] = (char *) palloc(3 * sizeof(uint32) + 1);
+         fctx->values[5] = (char *) palloc(2);
+
+
+         /* Return to original context when allocating transient memory */
+         MemoryContextSwitchTo(oldcontext);
+
+
+         /*
+          * Lock Buffer map and scan though all the buffers, saving the
+          * relevant fields in the fctx->record structure.
+          */
+         LWLockAcquire(BufMappingLock, LW_SHARED);
+
+         for (i = 0, bufHdr = BufferDescriptors; i < NBuffers; i++, bufHdr++)
+         {
+             /* Lock each buffer header before inspecting. */
+             LockBufHdr(bufHdr);
+
+             rnode = bufHdr->tag.rnode;
+
+             fctx->record[i].bufferid = BufferDescriptorGetBuffer(bufHdr);
+             fctx->record[i].relfilenode = rnode.relNode;
+             fctx->record[i].reltablespace = rnode.spcNode;
+             fctx->record[i].reldatabase = rnode.dbNode;
+             fctx->record[i].blocknum = bufHdr->tag.blockNum;
+
+             if ( bufHdr->flags & BM_DIRTY)
+             {
+                 fctx->record[i].isdirty = true;
+             }
+             else
+             {
+                 fctx->record[i].isdirty = false;
+             }
+
+             /* Note if the buffer is valid, and has storage created */
+             if ( (bufHdr->flags & BM_VALID) && (bufHdr->flags & BM_TAG_VALID))
+             {
+                 fctx->record[i].isvalid = true;
+             }
+             else
+             {
+                 fctx->record[i].isvalid = false;
+             }
+
+             UnlockBufHdr(bufHdr);
+
+         }
+
+         /* Release Buffer map. */
+         LWLockRelease(BufMappingLock);
+     }
+
+     funcctx = SRF_PERCALL_SETUP();
+
+     /* Get the saved state */
+     fctx = funcctx->user_fctx;
+
+
+     if (funcctx->call_cntr < funcctx->max_calls)
+     {
+         uint32         i = funcctx->call_cntr;
+         char        *values[NUM_BUFFERCACHE_PAGES_ELEM];
+         int            j;
+
+         /*
+          * Use a temporary values array, initially pointing to
+          * fctx->values, so it can be reassigned w/o losing the storage
+          * for subsequent calls.
+          */
+         for (j = 0; j < NUM_BUFFERCACHE_PAGES_ELEM; j++)
+         {
+             values[j] = fctx->values[j];
+         }
+
+
+         /*
+          * Set all fields except the bufferid to null if the buffer is
+          * unused or not valid.
+          */
+         if (fctx->record[i].blocknum == InvalidBlockNumber ||
+             fctx->record[i].isvalid == false )
+         {
+
+             sprintf(values[0], "%u", fctx->record[i].bufferid);
+             values[1] = NULL;
+             values[2] = NULL;
+             values[3] = NULL;
+             values[4] = NULL;
+             values[5] = NULL;
+
+         }
+         else
+         {
+
+             sprintf(values[0], "%u", fctx->record[i].bufferid);
+             sprintf(values[1], "%u", fctx->record[i].relfilenode);
+             sprintf(values[2], "%u", fctx->record[i].reltablespace);
+             sprintf(values[3], "%u", fctx->record[i].reldatabase);
+             sprintf(values[4], "%u", fctx->record[i].blocknum);
+             if (fctx->record[i].isdirty)
+             {
+                 strcpy(values[5], "t");
+             }
+             else
+             {
+                 strcpy(values[5], "f");
+             }
+
+         }
+
+
+         /* Build and return the tuple. */
+         tuple = BuildTupleFromCStrings(funcctx->attinmeta, values);
+         result = HeapTupleGetDatum(tuple);
+
+
+         SRF_RETURN_NEXT(funcctx, result);
+     }
+     else
+         SRF_RETURN_DONE(funcctx);
+
+ }
+
diff -Nacr pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.h pgsql/contrib/pg_buffercache/pg_buffercache_pages.h
*** pgsql.orig/contrib/pg_buffercache/pg_buffercache_pages.h    Thu Jan  1 12:00:00 1970
--- pgsql/contrib/pg_buffercache/pg_buffercache_pages.h    Wed Mar  9 11:42:20 2005
***************
*** 0 ****
--- 1,18 ----
+ /*-------------------------------------------------------------------------
+  *
+  * pg_buffercache_pages.h
+  *    Prototypes for pg_buffercache_pages
+  *
+  *
+  *    $PostgreSQL$
+  *
+  *-------------------------------------------------------------------------
+  */
+
+
+ #ifndef PG_BUFFERCACHE_PAGES_H
+ #define PG_BUFFERCACHE_PAGES_H
+
+ extern Datum pg_buffercache_pages(PG_FUNCTION_ARGS);
+
+ #endif  /* PG_BUFFERCACHE_PAGES_H */

Re: Display Pg buffer cache (WIP)

From
Neil Conway
Date:
Mark Kirkwood wrote:
> A couple of minor amendments here [...]

Barring any objections, I'll apply this tomorrow.

-Neil

Re: Display Pg buffer cache (WIP)

From
Neil Conway
Date:
Mark Kirkwood wrote:
> A couple of minor amendments here:
>
> - remove link to libpq (from cut+past of dblnk's Makefile)
> - add comment for pg_buffercache module in contrib/README
> - change my listed email to this one (I have resigned....)

Applied, thanks for the patch.

BTW, I removed the "REGRESS=..." line from the Makefile, as that caused
`make installcheck' of contrib/ to fail on my machine.

-Neil