Thread: Additional DTrace Probes

Additional DTrace Probes

From
Robert Lor
Date:
The attached patch contains the probes (originally came from Theo
Schlossnagle) that were removed from the previous submitted patch.
Zdenek had some concerns  about the way the probes were implemented
http://archives.postgresql.org/pgsql-hackers/2008-07/msg01168.php. If
there are specific recommendations, I'd be more than happy to make the
changes and get them resubmitted.

-Robert

Index: src/backend/access/transam/slru.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/slru.c,v
retrieving revision 1.45
diff -u -3 -p -r1.45 slru.c
--- src/backend/access/transam/slru.c    1 Jan 2009 17:23:36 -0000    1.45
+++ src/backend/access/transam/slru.c    6 Mar 2009 04:01:56 -0000
@@ -57,6 +57,7 @@
 #include "storage/fd.h"
 #include "storage/shmem.h"
 #include "miscadmin.h"
+#include "pg_trace.h"


 /*
@@ -372,6 +373,7 @@ SimpleLruReadPage(SlruCtl ctl, int pagen
 {
     SlruShared    shared = ctl->shared;

+    TRACE_POSTGRESQL_SLRU_READPAGE_START((uintptr_t)ctl, pageno, write_ok, xid);
     /* Outer loop handles restart if we must wait for someone else's I/O */
     for (;;)
     {
@@ -399,6 +401,7 @@ SimpleLruReadPage(SlruCtl ctl, int pagen
             }
             /* Otherwise, it's ready to use */
             SlruRecentlyUsed(shared, slotno);
+            TRACE_POSTGRESQL_SLRU_READPAGE_DONE(slotno);
             return slotno;
         }

@@ -446,6 +449,7 @@ SimpleLruReadPage(SlruCtl ctl, int pagen
             SlruReportIOError(ctl, pageno, xid);

         SlruRecentlyUsed(shared, slotno);
+        TRACE_POSTGRESQL_SLRU_READPAGE_DONE(slotno);
         return slotno;
     }
 }
@@ -470,6 +474,8 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl,
     SlruShared    shared = ctl->shared;
     int            slotno;

+    TRACE_POSTGRESQL_SLRU_READPAGE_READONLY((uintptr_t)ctl, pageno, xid);
+
     /* Try to find the page while holding only shared lock */
     LWLockAcquire(shared->ControlLock, LW_SHARED);

@@ -511,6 +517,8 @@ SimpleLruWritePage(SlruCtl ctl, int slot
     int            pageno = shared->page_number[slotno];
     bool        ok;

+    TRACE_POSTGRESQL_SLRU_WRITEPAGE_START((uintptr_t)ctl, pageno, slotno);
+
     /* If a write is in progress, wait for it to finish */
     while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
            shared->page_number[slotno] == pageno)
@@ -525,7 +533,10 @@ SimpleLruWritePage(SlruCtl ctl, int slot
     if (!shared->page_dirty[slotno] ||
         shared->page_status[slotno] != SLRU_PAGE_VALID ||
         shared->page_number[slotno] != pageno)
+    {
+        TRACE_POSTGRESQL_SLRU_WRITEPAGE_DONE();
         return;
+    }

     /*
      * Mark the slot write-busy, and clear the dirtybit.  After this point, a
@@ -569,6 +580,8 @@ SimpleLruWritePage(SlruCtl ctl, int slot
     /* Now it's okay to ereport if we failed */
     if (!ok)
         SlruReportIOError(ctl, pageno, InvalidTransactionId);
+
+    TRACE_POSTGRESQL_SLRU_WRITEPAGE_DONE();
 }

 /*
@@ -593,6 +606,8 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa

     SlruFileName(ctl, path, segno);

+    TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_START((uintptr_t)ctl, path, pageno, slotno);
+
     /*
      * In a crash-and-restart situation, it's possible for us to receive
      * commands to set the commit status of transactions whose bits are in
@@ -607,6 +622,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
         {
             slru_errcause = SLRU_OPEN_FAILED;
             slru_errno = errno;
+            TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
             return false;
         }

@@ -614,6 +630,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
                 (errmsg("file \"%s\" doesn't exist, reading as zeroes",
                         path)));
         MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, slru_errcause, slru_errno);
         return true;
     }

@@ -622,6 +639,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
         slru_errcause = SLRU_SEEK_FAILED;
         slru_errno = errno;
         close(fd);
+        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

@@ -631,6 +649,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
         slru_errcause = SLRU_READ_FAILED;
         slru_errno = errno;
         close(fd);
+        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

@@ -638,9 +657,12 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
     {
         slru_errcause = SLRU_CLOSE_FAILED;
         slru_errno = errno;
+        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

+    TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, slru_errcause, slru_errno);
+
     return true;
 }

@@ -668,6 +690,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
     char        path[MAXPGPATH];
     int            fd = -1;

+    TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_START((uintptr_t)ctl, pageno, slotno);
+
     /*
      * Honor the write-WAL-before-data rule, if appropriate, so that we do not
      * write out data before associated WAL records.  This is the same action
@@ -753,6 +777,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
         {
             slru_errcause = SLRU_OPEN_FAILED;
             slru_errno = errno;
+            TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
             return false;
         }

@@ -781,6 +806,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
         slru_errno = errno;
         if (!fdata)
             close(fd);
+        TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

@@ -794,6 +820,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
         slru_errno = errno;
         if (!fdata)
             close(fd);
+        TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

@@ -808,6 +835,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
             slru_errcause = SLRU_FSYNC_FAILED;
             slru_errno = errno;
             close(fd);
+            TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
             return false;
         }

@@ -815,10 +843,12 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
         {
             slru_errcause = SLRU_CLOSE_FAILED;
             slru_errno = errno;
+            TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
             return false;
         }
     }

+    TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(true, slru_errcause,slru_errno);
     return true;
 }

Index: src/backend/executor/execScan.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/execScan.c,v
retrieving revision 1.44
diff -u -3 -p -r1.44 execScan.c
--- src/backend/executor/execScan.c    1 Jan 2009 17:23:41 -0000    1.44
+++ src/backend/executor/execScan.c    6 Mar 2009 04:01:56 -0000
@@ -21,6 +21,7 @@
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 static bool tlist_matches_tupdesc(PlanState *ps, List *tlist, Index varno, TupleDesc tupdesc);
@@ -60,6 +61,8 @@ ExecScan(ScanState *node,
     qual = node->ps.qual;
     projInfo = node->ps.ps_ProjInfo;

+    TRACE_POSTGRESQL_EXECUTOR_SCAN((uintptr_t)node, ((Scan *)node->ps.plan)->scanrelid, (uintptr_t)accessMtd);
+
     /*
      * If we have neither a qual to check nor a projection to do, just skip
      * all the overhead and return the raw scan tuple.
Index: src/backend/executor/nodeAgg.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeAgg.c,v
retrieving revision 1.164
diff -u -3 -p -r1.164 nodeAgg.c
--- src/backend/executor/nodeAgg.c    1 Jan 2009 17:23:41 -0000    1.164
+++ src/backend/executor/nodeAgg.c    6 Mar 2009 04:01:56 -0000
@@ -86,6 +86,7 @@
 #include "utils/syscache.h"
 #include "utils/tuplesort.h"
 #include "utils/datum.h"
+#include "pg_trace.h"


 /*
@@ -814,6 +815,8 @@ ExecAgg(AggState *node)
     if (node->agg_done)
         return NULL;

+    TRACE_POSTGRESQL_EXECUTOR_AGG((uintptr_t)node, ((Agg *) node->ss.ps.plan)->aggstrategy);
+
     /*
      * Check to see if we're still projecting out tuples from a previous agg
      * tuple (because there is a function-returning-set in the projection
Index: src/backend/executor/nodeGroup.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeGroup.c,v
retrieving revision 1.73
diff -u -3 -p -r1.73 nodeGroup.c
--- src/backend/executor/nodeGroup.c    1 Jan 2009 17:23:41 -0000    1.73
+++ src/backend/executor/nodeGroup.c    6 Mar 2009 04:01:56 -0000
@@ -24,6 +24,7 @@

 #include "executor/executor.h"
 #include "executor/nodeGroup.h"
+#include "pg_trace.h"


 /*
@@ -49,6 +50,8 @@ ExecGroup(GroupState *node)
     numCols = ((Group *) node->ss.ps.plan)->numCols;
     grpColIdx = ((Group *) node->ss.ps.plan)->grpColIdx;

+    TRACE_POSTGRESQL_EXECUTOR_GROUP((uintptr_t)node, numCols);
+
     /*
      * Check to see if we're still projecting out tuples from a previous group
      * tuple (because there is a function-returning-set in the projection
Index: src/backend/executor/nodeHash.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHash.c,v
retrieving revision 1.117
diff -u -3 -p -r1.117 nodeHash.c
--- src/backend/executor/nodeHash.c    1 Jan 2009 17:23:41 -0000    1.117
+++ src/backend/executor/nodeHash.c    6 Mar 2009 04:01:56 -0000
@@ -35,6 +35,7 @@
 #include "utils/dynahash.h"
 #include "utils/memutils.h"
 #include "utils/lsyscache.h"
+#include "pg_trace.h"


 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
@@ -70,6 +71,8 @@ MultiExecHash(HashState *node)
     ExprContext *econtext;
     uint32        hashvalue;

+    TRACE_POSTGRESQL_EXECUTOR_HASH_MULTI((uintptr_t)node);
+
     /* must provide our own instrumentation support */
     if (node->ps.instrument)
         InstrStartNode(node->ps.instrument);
Index: src/backend/executor/nodeHashjoin.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v
retrieving revision 1.97
diff -u -3 -p -r1.97 nodeHashjoin.c
--- src/backend/executor/nodeHashjoin.c    1 Jan 2009 17:23:41 -0000    1.97
+++ src/backend/executor/nodeHashjoin.c    6 Mar 2009 04:01:56 -0000
@@ -20,6 +20,7 @@
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 /* Returns true for JOIN_LEFT and JOIN_ANTI jointypes */
@@ -61,6 +62,8 @@ ExecHashJoin(HashJoinState *node)
     uint32        hashvalue;
     int            batchno;

+    TRACE_POSTGRESQL_EXECUTOR_HASHJOIN((uintptr_t)node);
+
     /*
      * get information from HashJoin node
      */
Index: src/backend/executor/nodeLimit.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeLimit.c,v
retrieving revision 1.36
diff -u -3 -p -r1.36 nodeLimit.c
--- src/backend/executor/nodeLimit.c    4 Mar 2009 10:55:00 -0000    1.36
+++ src/backend/executor/nodeLimit.c    6 Mar 2009 04:01:56 -0000
@@ -23,6 +23,7 @@

 #include "executor/executor.h"
 #include "executor/nodeLimit.h"
+#include "pg_trace.h"

 static void recompute_limits(LimitState *node);

@@ -41,6 +42,8 @@ ExecLimit(LimitState *node)
     TupleTableSlot *slot;
     PlanState  *outerPlan;

+    TRACE_POSTGRESQL_EXECUTOR_LIMIT((uintptr_t)node);
+
     /*
      * get information from the node
      */
Index: src/backend/executor/nodeMaterial.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeMaterial.c,v
retrieving revision 1.65
diff -u -3 -p -r1.65 nodeMaterial.c
--- src/backend/executor/nodeMaterial.c    1 Jan 2009 17:23:42 -0000    1.65
+++ src/backend/executor/nodeMaterial.c    6 Mar 2009 04:01:56 -0000
@@ -24,6 +24,7 @@
 #include "executor/executor.h"
 #include "executor/nodeMaterial.h"
 #include "miscadmin.h"
+#include "pg_trace.h"

 /* ----------------------------------------------------------------
  *        ExecMaterial
@@ -45,6 +46,8 @@ ExecMaterial(MaterialState *node)
     bool        eof_tuplestore;
     TupleTableSlot *slot;

+    TRACE_POSTGRESQL_EXECUTOR_MATERIAL((uintptr_t)node);
+
     /*
      * get state info from node
      */
Index: src/backend/executor/nodeMergejoin.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeMergejoin.c,v
retrieving revision 1.94
diff -u -3 -p -r1.94 nodeMergejoin.c
--- src/backend/executor/nodeMergejoin.c    1 Jan 2009 17:23:42 -0000    1.94
+++ src/backend/executor/nodeMergejoin.c    6 Mar 2009 04:01:56 -0000
@@ -102,6 +102,7 @@
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
+#include "pg_trace.h"


 /*
@@ -565,6 +566,8 @@ ExecMergeJoin(MergeJoinState *node)
     bool        doFillOuter;
     bool        doFillInner;

+    TRACE_POSTGRESQL_EXECUTOR_MERGEJOIN((uintptr_t)node);
+
     /*
      * get information from node
      */
Index: src/backend/executor/nodeNestloop.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeNestloop.c,v
retrieving revision 1.50
diff -u -3 -p -r1.50 nodeNestloop.c
--- src/backend/executor/nodeNestloop.c    1 Jan 2009 17:23:42 -0000    1.50
+++ src/backend/executor/nodeNestloop.c    6 Mar 2009 04:01:56 -0000
@@ -24,6 +24,7 @@
 #include "executor/execdebug.h"
 #include "executor/nodeNestloop.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 /* ----------------------------------------------------------------
@@ -67,6 +68,8 @@ ExecNestLoop(NestLoopState *node)
     List       *otherqual;
     ExprContext *econtext;

+    TRACE_POSTGRESQL_EXECUTOR_NESTLOOP((uintptr_t)node);
+
     /*
      * get information from the node
      */
Index: src/backend/executor/nodeSetOp.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSetOp.c,v
retrieving revision 1.28
diff -u -3 -p -r1.28 nodeSetOp.c
--- src/backend/executor/nodeSetOp.c    1 Jan 2009 17:23:42 -0000    1.28
+++ src/backend/executor/nodeSetOp.c    6 Mar 2009 04:01:56 -0000
@@ -47,6 +47,7 @@
 #include "executor/executor.h"
 #include "executor/nodeSetOp.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 /*
@@ -196,6 +197,8 @@ ExecSetOp(SetOpState *node)
     SetOp       *plannode = (SetOp *) node->ps.plan;
     TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;

+    TRACE_POSTGRESQL_EXECUTOR_SETOP((uintptr_t)node);
+
     /*
      * If the previously-returned tuple needs to be returned more than once,
      * keep returning it.
Index: src/backend/executor/nodeSort.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSort.c,v
retrieving revision 1.63
diff -u -3 -p -r1.63 nodeSort.c
--- src/backend/executor/nodeSort.c    1 Jan 2009 17:23:42 -0000    1.63
+++ src/backend/executor/nodeSort.c    6 Mar 2009 04:01:57 -0000
@@ -19,6 +19,7 @@
 #include "executor/nodeSort.h"
 #include "miscadmin.h"
 #include "utils/tuplesort.h"
+#include "pg_trace.h"


 /* ----------------------------------------------------------------
@@ -53,6 +54,8 @@ ExecSort(SortState *node)
     dir = estate->es_direction;
     tuplesortstate = (Tuplesortstate *) node->tuplesortstate;

+    TRACE_POSTGRESQL_EXECUTOR_SORT((uintptr_t)node, dir);
+
     /*
      * If first time through, read all tuples from outer plan and pass them to
      * tuplesort.c. Subsequent calls just fetch tuples from tuplesort.
Index: src/backend/executor/nodeSubplan.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSubplan.c,v
retrieving revision 1.96
diff -u -3 -p -r1.96 nodeSubplan.c
--- src/backend/executor/nodeSubplan.c    1 Jan 2009 17:23:42 -0000    1.96
+++ src/backend/executor/nodeSubplan.c    6 Mar 2009 04:01:57 -0000
@@ -27,6 +27,7 @@
 #include "utils/array.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 static Datum ExecSubPlan(SubPlanState *node,
@@ -92,6 +93,8 @@ ExecHashSubPlan(SubPlanState *node,
     ExprContext *innerecontext = node->innerecontext;
     TupleTableSlot *slot;

+    TRACE_POSTGRESQL_EXECUTOR_SUBPLAN_HASH((uintptr_t)node);
+
     /* Shouldn't have any direct correlation Vars */
     if (subplan->parParam != NIL || node->args != NIL)
         elog(ERROR, "hashed subplan with direct correlation not supported");
@@ -227,6 +230,8 @@ ExecScanSubPlan(SubPlanState *node,
     ListCell   *l;
     ArrayBuildState *astate = NULL;

+    TRACE_POSTGRESQL_EXECUTOR_SUBPLAN_SCAN((uintptr_t)node);
+
     /*
      * We are probably in a short-lived expression-evaluation context. Switch
      * to the per-query context for manipulating the child plan's chgParam,
Index: src/backend/executor/nodeUnique.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeUnique.c,v
retrieving revision 1.58
diff -u -3 -p -r1.58 nodeUnique.c
--- src/backend/executor/nodeUnique.c    1 Jan 2009 17:23:42 -0000    1.58
+++ src/backend/executor/nodeUnique.c    6 Mar 2009 04:01:57 -0000
@@ -36,6 +36,7 @@
 #include "executor/executor.h"
 #include "executor/nodeUnique.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 /* ----------------------------------------------------------------
@@ -50,6 +51,8 @@ ExecUnique(UniqueState *node)
     TupleTableSlot *slot;
     PlanState  *outerPlan;

+    TRACE_POSTGRESQL_EXECUTOR_UNIQUE((uintptr_t)node);
+
     /*
      * get information from the node
      */
Index: src/backend/utils/probes.d
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/probes.d,v
retrieving revision 1.6
diff -u -3 -p -r1.6 probes.d
--- src/backend/utils/probes.d    1 Jan 2009 17:23:48 -0000    1.6
+++ src/backend/utils/probes.d    6 Mar 2009 04:01:57 -0000
@@ -10,6 +10,7 @@

 /* typedefs used in PostgreSQL */
 #define LocalTransactionId unsigned int
+#define TransactionId unsigned int
 #define LWLockId int
 #define LWLockMode int
 #define LOCKMODE int
@@ -91,4 +92,29 @@ provider postgresql {
     probe xlog__switch();
     probe wal__buffer__write__dirty__start();
     probe wal__buffer__write__dirty__done();
+
+    probe slru__readpage__start(unsigned int, int, bool, TransactionId);
+    probe slru__readpage__done(int);
+    probe slru__readpage__readonly(unsigned int, int, TransactionId);
+    probe slru__writepage__start(unsigned int, int, int);
+    probe slru__writepage__done();
+    probe slru__readpage__physical__start(unsigned int, char *, int, int);
+    probe slru__readpage__physical__done(int, int, int);
+    probe slru__writepage__physical__start(unsigned int, int, int);
+    probe slru__writepage__physical__done(int, int, int);
+
+    probe executor__scan(unsigned int, unsigned int, unsigned int);
+    probe executor__agg(unsigned int, int);
+    probe executor__group(unsigned int, int);
+    probe executor__hash__multi(unsigned int);
+    probe executor__hashjoin(unsigned int);
+    probe executor__limit(unsigned int);
+    probe executor__material(unsigned int);
+    probe executor__mergejoin(unsigned int);
+    probe executor__nestloop(unsigned int);
+    probe executor__setop(unsigned int);
+    probe executor__sort(unsigned int, int);
+    probe executor__subplan__hash(unsigned int);
+    probe executor__subplan__scan(unsigned int);
+    probe executor__unique(unsigned int);
 };

Re: Additional DTrace Probes

From
ITAGAKI Takahiro
Date:
Hi,

Robert Lor <Robert.Lor@Sun.COM> wrote:

> The attached patch contains the probes (originally came from Theo 
> Schlossnagle) that were removed from the previous submitted patch. 
> Zdenek had some concerns  about the way the probes were implemented  
> http://archives.postgresql.org/pgsql-hackers/2008-07/msg01168.php. If 
> there are specific recommendations, I'd be more than happy to make the 
> changes and get them resubmitted.

This will introduce SLRU and Executor probes.
We already have Lock, LWLock, Buffer, I/O and XLogs probes.

I'd like to have the following probes, too. In my experience,
they could be bottlenecks or consume much time in some situations.
   - idle in transaction   - spinlock wait-and-retry   - CPU: Trigger execution   - CPU: Encoding conversion   -
Network:send() and recv()   - smgr: lseek() calls   - Tempfile reads and writes
 

Regards,
---
ITAGAKI Takahiro
NTT Open Source Software Center




Re: Additional DTrace Probes

From
Zdenek Kotala
Date:
Hi Robert,

I tested your patch and it looks good. however I have several
comments/questions:

1) probes contains pointer but in probe.h it is declared as int. Is it
correct?

2) Maybe

TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, -1, -1);

would be better. Because  slru_errcause, slru_errno can contains garbage
in situation when everything goes fine. Same for write. 
I think it is committable for 8.4.

Zdenek



Robert Lor píše v čt 05. 03. 2009 v 22:39 -0600:
> The attached patch contains the probes (originally came from Theo 
> Schlossnagle) that were removed from the previous submitted patch. 
> Zdenek had some concerns  about the way the probes were implemented  
> http://archives.postgresql.org/pgsql-hackers/2008-07/msg01168.php. If 
> there are specific recommendations, I'd be more than happy to make the 
> changes and get them resubmitted.
> 
> -Robert
> 
> Prostý textový dokument příloha (dtrace-probes.patch)
> Index: src/backend/access/transam/slru.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/slru.c,v
> retrieving revision 1.45
> diff -u -3 -p -r1.45 slru.c
> --- src/backend/access/transam/slru.c    1 Jan 2009 17:23:36 -0000    1.45
> +++ src/backend/access/transam/slru.c    6 Mar 2009 04:01:56 -0000
> @@ -57,6 +57,7 @@
>  #include "storage/fd.h"
>  #include "storage/shmem.h"
>  #include "miscadmin.h"
> +#include "pg_trace.h"
>  
> 
>  /*
> @@ -372,6 +373,7 @@ SimpleLruReadPage(SlruCtl ctl, int pagen
>  {
>      SlruShared    shared = ctl->shared;
>  
> +    TRACE_POSTGRESQL_SLRU_READPAGE_START((uintptr_t)ctl, pageno, write_ok, xid);
>      /* Outer loop handles restart if we must wait for someone else's I/O */
>      for (;;)
>      {
> @@ -399,6 +401,7 @@ SimpleLruReadPage(SlruCtl ctl, int pagen
>              }
>              /* Otherwise, it's ready to use */
>              SlruRecentlyUsed(shared, slotno);
> +            TRACE_POSTGRESQL_SLRU_READPAGE_DONE(slotno);
>              return slotno;
>          }
>  
> @@ -446,6 +449,7 @@ SimpleLruReadPage(SlruCtl ctl, int pagen
>              SlruReportIOError(ctl, pageno, xid);
>  
>          SlruRecentlyUsed(shared, slotno);
> +        TRACE_POSTGRESQL_SLRU_READPAGE_DONE(slotno);
>          return slotno;
>      }
>  }
> @@ -470,6 +474,8 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, 
>      SlruShared    shared = ctl->shared;
>      int            slotno;
>  
> +    TRACE_POSTGRESQL_SLRU_READPAGE_READONLY((uintptr_t)ctl, pageno, xid);
> +
>      /* Try to find the page while holding only shared lock */
>      LWLockAcquire(shared->ControlLock, LW_SHARED);
>  
> @@ -511,6 +517,8 @@ SimpleLruWritePage(SlruCtl ctl, int slot
>      int            pageno = shared->page_number[slotno];
>      bool        ok;
>  
> +    TRACE_POSTGRESQL_SLRU_WRITEPAGE_START((uintptr_t)ctl, pageno, slotno);
> +
>      /* If a write is in progress, wait for it to finish */
>      while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
>             shared->page_number[slotno] == pageno)
> @@ -525,7 +533,10 @@ SimpleLruWritePage(SlruCtl ctl, int slot
>      if (!shared->page_dirty[slotno] ||
>          shared->page_status[slotno] != SLRU_PAGE_VALID ||
>          shared->page_number[slotno] != pageno)
> +    {
> +        TRACE_POSTGRESQL_SLRU_WRITEPAGE_DONE();
>          return;
> +    }
>  
>      /*
>       * Mark the slot write-busy, and clear the dirtybit.  After this point, a
> @@ -569,6 +580,8 @@ SimpleLruWritePage(SlruCtl ctl, int slot
>      /* Now it's okay to ereport if we failed */
>      if (!ok)
>          SlruReportIOError(ctl, pageno, InvalidTransactionId);
> +
> +    TRACE_POSTGRESQL_SLRU_WRITEPAGE_DONE();
>  }
>  
>  /*
> @@ -593,6 +606,8 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
>  
>      SlruFileName(ctl, path, segno);
>  
> +    TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_START((uintptr_t)ctl, path, pageno, slotno);
> +
>      /*
>       * In a crash-and-restart situation, it's possible for us to receive
>       * commands to set the commit status of transactions whose bits are in
> @@ -607,6 +622,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
>          {
>              slru_errcause = SLRU_OPEN_FAILED;
>              slru_errno = errno;
> +            TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
>              return false;
>          }
>  
> @@ -614,6 +630,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
>                  (errmsg("file \"%s\" doesn't exist, reading as zeroes",
>                          path)));
>          MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
> +        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, slru_errcause, slru_errno);
>          return true;
>      }
>  
> @@ -622,6 +639,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
>          slru_errcause = SLRU_SEEK_FAILED;
>          slru_errno = errno;
>          close(fd);
> +        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
>          return false;
>      }
>  
> @@ -631,6 +649,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
>          slru_errcause = SLRU_READ_FAILED;
>          slru_errno = errno;
>          close(fd);
> +        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
>          return false;
>      }
>  
> @@ -638,9 +657,12 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
>      {
>          slru_errcause = SLRU_CLOSE_FAILED;
>          slru_errno = errno;
> +        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
>          return false;
>      }
>  
> +    TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, slru_errcause, slru_errno);
> +
>      return true;
>  }
>  
> @@ -668,6 +690,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
>      char        path[MAXPGPATH];
>      int            fd = -1;
>  
> +    TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_START((uintptr_t)ctl, pageno, slotno);
> +
>      /*
>       * Honor the write-WAL-before-data rule, if appropriate, so that we do not
>       * write out data before associated WAL records.  This is the same action
> @@ -753,6 +777,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
>          {
>              slru_errcause = SLRU_OPEN_FAILED;
>              slru_errno = errno;
> +            TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
>              return false;
>          }
>  
> @@ -781,6 +806,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
>          slru_errno = errno;
>          if (!fdata)
>              close(fd);
> +        TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
>          return false;
>      }
>  
> @@ -794,6 +820,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
>          slru_errno = errno;
>          if (!fdata)
>              close(fd);
> +        TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
>          return false;
>      }
>  
> @@ -808,6 +835,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
>              slru_errcause = SLRU_FSYNC_FAILED;
>              slru_errno = errno;
>              close(fd);
> +            TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
>              return false;
>          }
>  
> @@ -815,10 +843,12 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
>          {
>              slru_errcause = SLRU_CLOSE_FAILED;
>              slru_errno = errno;
> +            TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
>              return false;
>          }
>      }
>  
> +    TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(true, slru_errcause,slru_errno);
>      return true;
>  }
>  
> Index: src/backend/executor/execScan.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/execScan.c,v
> retrieving revision 1.44
> diff -u -3 -p -r1.44 execScan.c
> --- src/backend/executor/execScan.c    1 Jan 2009 17:23:41 -0000    1.44
> +++ src/backend/executor/execScan.c    6 Mar 2009 04:01:56 -0000
> @@ -21,6 +21,7 @@
>  #include "executor/executor.h"
>  #include "miscadmin.h"
>  #include "utils/memutils.h"
> +#include "pg_trace.h"
>  
> 
>  static bool tlist_matches_tupdesc(PlanState *ps, List *tlist, Index varno, TupleDesc tupdesc);
> @@ -60,6 +61,8 @@ ExecScan(ScanState *node,
>      qual = node->ps.qual;
>      projInfo = node->ps.ps_ProjInfo;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_SCAN((uintptr_t)node, ((Scan *)node->ps.plan)->scanrelid, (uintptr_t)accessMtd);
> +
>      /*
>       * If we have neither a qual to check nor a projection to do, just skip
>       * all the overhead and return the raw scan tuple.
> Index: src/backend/executor/nodeAgg.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeAgg.c,v
> retrieving revision 1.164
> diff -u -3 -p -r1.164 nodeAgg.c
> --- src/backend/executor/nodeAgg.c    1 Jan 2009 17:23:41 -0000    1.164
> +++ src/backend/executor/nodeAgg.c    6 Mar 2009 04:01:56 -0000
> @@ -86,6 +86,7 @@
>  #include "utils/syscache.h"
>  #include "utils/tuplesort.h"
>  #include "utils/datum.h"
> +#include "pg_trace.h"
>  
> 
>  /*
> @@ -814,6 +815,8 @@ ExecAgg(AggState *node)
>      if (node->agg_done)
>          return NULL;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_AGG((uintptr_t)node, ((Agg *) node->ss.ps.plan)->aggstrategy);
> +
>      /*
>       * Check to see if we're still projecting out tuples from a previous agg
>       * tuple (because there is a function-returning-set in the projection
> Index: src/backend/executor/nodeGroup.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeGroup.c,v
> retrieving revision 1.73
> diff -u -3 -p -r1.73 nodeGroup.c
> --- src/backend/executor/nodeGroup.c    1 Jan 2009 17:23:41 -0000    1.73
> +++ src/backend/executor/nodeGroup.c    6 Mar 2009 04:01:56 -0000
> @@ -24,6 +24,7 @@
>  
>  #include "executor/executor.h"
>  #include "executor/nodeGroup.h"
> +#include "pg_trace.h"
>  
> 
>  /*
> @@ -49,6 +50,8 @@ ExecGroup(GroupState *node)
>      numCols = ((Group *) node->ss.ps.plan)->numCols;
>      grpColIdx = ((Group *) node->ss.ps.plan)->grpColIdx;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_GROUP((uintptr_t)node, numCols);
> +
>      /*
>       * Check to see if we're still projecting out tuples from a previous group
>       * tuple (because there is a function-returning-set in the projection
> Index: src/backend/executor/nodeHash.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHash.c,v
> retrieving revision 1.117
> diff -u -3 -p -r1.117 nodeHash.c
> --- src/backend/executor/nodeHash.c    1 Jan 2009 17:23:41 -0000    1.117
> +++ src/backend/executor/nodeHash.c    6 Mar 2009 04:01:56 -0000
> @@ -35,6 +35,7 @@
>  #include "utils/dynahash.h"
>  #include "utils/memutils.h"
>  #include "utils/lsyscache.h"
> +#include "pg_trace.h"
>  
> 
>  static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
> @@ -70,6 +71,8 @@ MultiExecHash(HashState *node)
>      ExprContext *econtext;
>      uint32        hashvalue;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_HASH_MULTI((uintptr_t)node);
> +
>      /* must provide our own instrumentation support */
>      if (node->ps.instrument)
>          InstrStartNode(node->ps.instrument);
> Index: src/backend/executor/nodeHashjoin.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v
> retrieving revision 1.97
> diff -u -3 -p -r1.97 nodeHashjoin.c
> --- src/backend/executor/nodeHashjoin.c    1 Jan 2009 17:23:41 -0000    1.97
> +++ src/backend/executor/nodeHashjoin.c    6 Mar 2009 04:01:56 -0000
> @@ -20,6 +20,7 @@
>  #include "executor/nodeHash.h"
>  #include "executor/nodeHashjoin.h"
>  #include "utils/memutils.h"
> +#include "pg_trace.h"
>  
> 
>  /* Returns true for JOIN_LEFT and JOIN_ANTI jointypes */
> @@ -61,6 +62,8 @@ ExecHashJoin(HashJoinState *node)
>      uint32        hashvalue;
>      int            batchno;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_HASHJOIN((uintptr_t)node);
> +
>      /*
>       * get information from HashJoin node
>       */
> Index: src/backend/executor/nodeLimit.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeLimit.c,v
> retrieving revision 1.36
> diff -u -3 -p -r1.36 nodeLimit.c
> --- src/backend/executor/nodeLimit.c    4 Mar 2009 10:55:00 -0000    1.36
> +++ src/backend/executor/nodeLimit.c    6 Mar 2009 04:01:56 -0000
> @@ -23,6 +23,7 @@
>  
>  #include "executor/executor.h"
>  #include "executor/nodeLimit.h"
> +#include "pg_trace.h"
>  
>  static void recompute_limits(LimitState *node);
>  
> @@ -41,6 +42,8 @@ ExecLimit(LimitState *node)
>      TupleTableSlot *slot;
>      PlanState  *outerPlan;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_LIMIT((uintptr_t)node);
> +
>      /*
>       * get information from the node
>       */
> Index: src/backend/executor/nodeMaterial.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeMaterial.c,v
> retrieving revision 1.65
> diff -u -3 -p -r1.65 nodeMaterial.c
> --- src/backend/executor/nodeMaterial.c    1 Jan 2009 17:23:42 -0000    1.65
> +++ src/backend/executor/nodeMaterial.c    6 Mar 2009 04:01:56 -0000
> @@ -24,6 +24,7 @@
>  #include "executor/executor.h"
>  #include "executor/nodeMaterial.h"
>  #include "miscadmin.h"
> +#include "pg_trace.h"
>  
>  /* ----------------------------------------------------------------
>   *        ExecMaterial
> @@ -45,6 +46,8 @@ ExecMaterial(MaterialState *node)
>      bool        eof_tuplestore;
>      TupleTableSlot *slot;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_MATERIAL((uintptr_t)node);
> +
>      /*
>       * get state info from node
>       */
> Index: src/backend/executor/nodeMergejoin.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeMergejoin.c,v
> retrieving revision 1.94
> diff -u -3 -p -r1.94 nodeMergejoin.c
> --- src/backend/executor/nodeMergejoin.c    1 Jan 2009 17:23:42 -0000    1.94
> +++ src/backend/executor/nodeMergejoin.c    6 Mar 2009 04:01:56 -0000
> @@ -102,6 +102,7 @@
>  #include "utils/lsyscache.h"
>  #include "utils/memutils.h"
>  #include "utils/syscache.h"
> +#include "pg_trace.h"
>  
> 
>  /*
> @@ -565,6 +566,8 @@ ExecMergeJoin(MergeJoinState *node)
>      bool        doFillOuter;
>      bool        doFillInner;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_MERGEJOIN((uintptr_t)node);
> +
>      /*
>       * get information from node
>       */
> Index: src/backend/executor/nodeNestloop.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeNestloop.c,v
> retrieving revision 1.50
> diff -u -3 -p -r1.50 nodeNestloop.c
> --- src/backend/executor/nodeNestloop.c    1 Jan 2009 17:23:42 -0000    1.50
> +++ src/backend/executor/nodeNestloop.c    6 Mar 2009 04:01:56 -0000
> @@ -24,6 +24,7 @@
>  #include "executor/execdebug.h"
>  #include "executor/nodeNestloop.h"
>  #include "utils/memutils.h"
> +#include "pg_trace.h"
>  
> 
>  /* ----------------------------------------------------------------
> @@ -67,6 +68,8 @@ ExecNestLoop(NestLoopState *node)
>      List       *otherqual;
>      ExprContext *econtext;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_NESTLOOP((uintptr_t)node);
> +
>      /*
>       * get information from the node
>       */
> Index: src/backend/executor/nodeSetOp.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSetOp.c,v
> retrieving revision 1.28
> diff -u -3 -p -r1.28 nodeSetOp.c
> --- src/backend/executor/nodeSetOp.c    1 Jan 2009 17:23:42 -0000    1.28
> +++ src/backend/executor/nodeSetOp.c    6 Mar 2009 04:01:56 -0000
> @@ -47,6 +47,7 @@
>  #include "executor/executor.h"
>  #include "executor/nodeSetOp.h"
>  #include "utils/memutils.h"
> +#include "pg_trace.h"
>  
> 
>  /*
> @@ -196,6 +197,8 @@ ExecSetOp(SetOpState *node)
>      SetOp       *plannode = (SetOp *) node->ps.plan;
>      TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_SETOP((uintptr_t)node);
> +
>      /*
>       * If the previously-returned tuple needs to be returned more than once,
>       * keep returning it.
> Index: src/backend/executor/nodeSort.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSort.c,v
> retrieving revision 1.63
> diff -u -3 -p -r1.63 nodeSort.c
> --- src/backend/executor/nodeSort.c    1 Jan 2009 17:23:42 -0000    1.63
> +++ src/backend/executor/nodeSort.c    6 Mar 2009 04:01:57 -0000
> @@ -19,6 +19,7 @@
>  #include "executor/nodeSort.h"
>  #include "miscadmin.h"
>  #include "utils/tuplesort.h"
> +#include "pg_trace.h"
>  
> 
>  /* ----------------------------------------------------------------
> @@ -53,6 +54,8 @@ ExecSort(SortState *node)
>      dir = estate->es_direction;
>      tuplesortstate = (Tuplesortstate *) node->tuplesortstate;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_SORT((uintptr_t)node, dir);
> +
>      /*
>       * If first time through, read all tuples from outer plan and pass them to
>       * tuplesort.c. Subsequent calls just fetch tuples from tuplesort.
> Index: src/backend/executor/nodeSubplan.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSubplan.c,v
> retrieving revision 1.96
> diff -u -3 -p -r1.96 nodeSubplan.c
> --- src/backend/executor/nodeSubplan.c    1 Jan 2009 17:23:42 -0000    1.96
> +++ src/backend/executor/nodeSubplan.c    6 Mar 2009 04:01:57 -0000
> @@ -27,6 +27,7 @@
>  #include "utils/array.h"
>  #include "utils/lsyscache.h"
>  #include "utils/memutils.h"
> +#include "pg_trace.h"
>  
> 
>  static Datum ExecSubPlan(SubPlanState *node,
> @@ -92,6 +93,8 @@ ExecHashSubPlan(SubPlanState *node,
>      ExprContext *innerecontext = node->innerecontext;
>      TupleTableSlot *slot;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_SUBPLAN_HASH((uintptr_t)node);
> +
>      /* Shouldn't have any direct correlation Vars */
>      if (subplan->parParam != NIL || node->args != NIL)
>          elog(ERROR, "hashed subplan with direct correlation not supported");
> @@ -227,6 +230,8 @@ ExecScanSubPlan(SubPlanState *node,
>      ListCell   *l;
>      ArrayBuildState *astate = NULL;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_SUBPLAN_SCAN((uintptr_t)node);
> +
>      /*
>       * We are probably in a short-lived expression-evaluation context. Switch
>       * to the per-query context for manipulating the child plan's chgParam,
> Index: src/backend/executor/nodeUnique.c
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeUnique.c,v
> retrieving revision 1.58
> diff -u -3 -p -r1.58 nodeUnique.c
> --- src/backend/executor/nodeUnique.c    1 Jan 2009 17:23:42 -0000    1.58
> +++ src/backend/executor/nodeUnique.c    6 Mar 2009 04:01:57 -0000
> @@ -36,6 +36,7 @@
>  #include "executor/executor.h"
>  #include "executor/nodeUnique.h"
>  #include "utils/memutils.h"
> +#include "pg_trace.h"
>  
> 
>  /* ----------------------------------------------------------------
> @@ -50,6 +51,8 @@ ExecUnique(UniqueState *node)
>      TupleTableSlot *slot;
>      PlanState  *outerPlan;
>  
> +    TRACE_POSTGRESQL_EXECUTOR_UNIQUE((uintptr_t)node);
> +
>      /*
>       * get information from the node
>       */
> Index: src/backend/utils/probes.d
> ===================================================================
> RCS file: /projects/cvsroot/pgsql/src/backend/utils/probes.d,v
> retrieving revision 1.6
> diff -u -3 -p -r1.6 probes.d
> --- src/backend/utils/probes.d    1 Jan 2009 17:23:48 -0000    1.6
> +++ src/backend/utils/probes.d    6 Mar 2009 04:01:57 -0000
> @@ -10,6 +10,7 @@
>  
>  /* typedefs used in PostgreSQL */
>  #define LocalTransactionId unsigned int
> +#define TransactionId unsigned int
>  #define LWLockId int
>  #define LWLockMode int
>  #define LOCKMODE int
> @@ -91,4 +92,29 @@ provider postgresql {
>      probe xlog__switch();
>      probe wal__buffer__write__dirty__start();
>      probe wal__buffer__write__dirty__done();
> +
> +    probe slru__readpage__start(unsigned int, int, bool, TransactionId);
> +    probe slru__readpage__done(int);
> +    probe slru__readpage__readonly(unsigned int, int, TransactionId);
> +    probe slru__writepage__start(unsigned int, int, int);
> +    probe slru__writepage__done();
> +    probe slru__readpage__physical__start(unsigned int, char *, int, int);
> +    probe slru__readpage__physical__done(int, int, int);
> +    probe slru__writepage__physical__start(unsigned int, int, int);
> +    probe slru__writepage__physical__done(int, int, int);
> + 
> +    probe executor__scan(unsigned int, unsigned int, unsigned int);
> +    probe executor__agg(unsigned int, int);
> +    probe executor__group(unsigned int, int);
> +    probe executor__hash__multi(unsigned int);
> +    probe executor__hashjoin(unsigned int);
> +    probe executor__limit(unsigned int);
> +    probe executor__material(unsigned int);
> +    probe executor__mergejoin(unsigned int);
> +    probe executor__nestloop(unsigned int);
> +    probe executor__setop(unsigned int);
> +    probe executor__sort(unsigned int, int);
> +    probe executor__subplan__hash(unsigned int);
> +    probe executor__subplan__scan(unsigned int);
> +    probe executor__unique(unsigned int);
>  };



Re: Additional DTrace Probes

From
Zdenek Kotala
Date:
ITAGAKI Takahiro píše v pá 06. 03. 2009 v 17:03 +0900:
> Hi,
> 
> Robert Lor <Robert.Lor@Sun.COM> wrote:
> 
> > The attached patch contains the probes (originally came from Theo 
> > Schlossnagle) that were removed from the previous submitted patch. 
> > Zdenek had some concerns  about the way the probes were implemented  
> > http://archives.postgresql.org/pgsql-hackers/2008-07/msg01168.php. If 
> > there are specific recommendations, I'd be more than happy to make the 
> > changes and get them resubmitted.
> 
> This will introduce SLRU and Executor probes.
> We already have Lock, LWLock, Buffer, I/O and XLogs probes.
> 
> I'd like to have the following probes, too. In my experience,
> they could be bottlenecks or consume much time in some situations.

Of course, there are lot of places where probes can be added and it
needs to determine correct place regrading to data which we want to
collect.

>     - idle in transaction

This probes lost in a action. I remember that I had some comments about
them. It requires small code refactoring (some extra if or something
like this). 

>     - spinlock wait-and-retry

Can you specify a place in code or what you need to measure?

>     - CPU: Trigger execution
>     - CPU: Encoding conversion

Maybe function call start and done could cover both cases.

>     - Network: send() and recv()

I think there could be more probes on network like login/logout, login
handshake itself and so on.

>     - smgr: lseek() calls

Is any reason for it? You can monitoring syscall directly.

>     - Tempfile reads and writes

I think that it is possible now. last arg in buffer_read function
contains information, that it is local(temp) buffer.

Zdenek




Re: Additional DTrace Probes

From
Robert Lor
Date:
Zdenek Kotala wrote:
> I tested your patch and it looks good. however I have several
> comments/questions:
>
> 1) probes contains pointer but in probe.h it is declared as int. Is it
> correct?
>

The probes cast the pointer to uintptr_t, so the correct type that will
work for both ILP32 and LP64 models is unsigned long. I've made the
correction from unsigned int to unsigned long. The reason uintptr_t is
not used in the probe definition is because it causes compilation
problem on Mac OS X. This is a known problem in the OS X's DTrace
implementation.
> 2) Maybe
>
> TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, -1, -1);
>
> would be better. Because  slru_errcause, slru_errno can contains garbage
> in situation when everything goes fine. Same for write.
>

I've made the changes per your suggestion although one can argue that
the script can check arg0, and if it's true, avoid using arg1 and arg2
as they are meaningless.
>
> I think it is committable for 8.4.
>

That would be awesome!

-Robert

Index: src/backend/access/transam/slru.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/slru.c,v
retrieving revision 1.45
diff -u -3 -p -r1.45 slru.c
--- src/backend/access/transam/slru.c    1 Jan 2009 17:23:36 -0000    1.45
+++ src/backend/access/transam/slru.c    8 Mar 2009 20:39:01 -0000
@@ -57,6 +57,7 @@
 #include "storage/fd.h"
 #include "storage/shmem.h"
 #include "miscadmin.h"
+#include "pg_trace.h"


 /*
@@ -372,6 +373,7 @@ SimpleLruReadPage(SlruCtl ctl, int pagen
 {
     SlruShared    shared = ctl->shared;

+    TRACE_POSTGRESQL_SLRU_READPAGE_START((uintptr_t)ctl, pageno, write_ok, xid);
     /* Outer loop handles restart if we must wait for someone else's I/O */
     for (;;)
     {
@@ -399,6 +401,7 @@ SimpleLruReadPage(SlruCtl ctl, int pagen
             }
             /* Otherwise, it's ready to use */
             SlruRecentlyUsed(shared, slotno);
+            TRACE_POSTGRESQL_SLRU_READPAGE_DONE(slotno);
             return slotno;
         }

@@ -446,6 +449,7 @@ SimpleLruReadPage(SlruCtl ctl, int pagen
             SlruReportIOError(ctl, pageno, xid);

         SlruRecentlyUsed(shared, slotno);
+        TRACE_POSTGRESQL_SLRU_READPAGE_DONE(slotno);
         return slotno;
     }
 }
@@ -470,6 +474,8 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl,
     SlruShared    shared = ctl->shared;
     int            slotno;

+    TRACE_POSTGRESQL_SLRU_READPAGE_READONLY((uintptr_t)ctl, pageno, xid);
+
     /* Try to find the page while holding only shared lock */
     LWLockAcquire(shared->ControlLock, LW_SHARED);

@@ -511,6 +517,8 @@ SimpleLruWritePage(SlruCtl ctl, int slot
     int            pageno = shared->page_number[slotno];
     bool        ok;

+    TRACE_POSTGRESQL_SLRU_WRITEPAGE_START((uintptr_t)ctl, pageno, slotno);
+
     /* If a write is in progress, wait for it to finish */
     while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
            shared->page_number[slotno] == pageno)
@@ -525,7 +533,10 @@ SimpleLruWritePage(SlruCtl ctl, int slot
     if (!shared->page_dirty[slotno] ||
         shared->page_status[slotno] != SLRU_PAGE_VALID ||
         shared->page_number[slotno] != pageno)
+    {
+        TRACE_POSTGRESQL_SLRU_WRITEPAGE_DONE();
         return;
+    }

     /*
      * Mark the slot write-busy, and clear the dirtybit.  After this point, a
@@ -569,6 +580,8 @@ SimpleLruWritePage(SlruCtl ctl, int slot
     /* Now it's okay to ereport if we failed */
     if (!ok)
         SlruReportIOError(ctl, pageno, InvalidTransactionId);
+
+    TRACE_POSTGRESQL_SLRU_WRITEPAGE_DONE();
 }

 /*
@@ -593,6 +606,8 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa

     SlruFileName(ctl, path, segno);

+    TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_START((uintptr_t)ctl, path, pageno, slotno);
+
     /*
      * In a crash-and-restart situation, it's possible for us to receive
      * commands to set the commit status of transactions whose bits are in
@@ -607,6 +622,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
         {
             slru_errcause = SLRU_OPEN_FAILED;
             slru_errno = errno;
+            TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
             return false;
         }

@@ -614,6 +630,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
                 (errmsg("file \"%s\" doesn't exist, reading as zeroes",
                         path)));
         MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, -1, -1);
         return true;
     }

@@ -622,6 +639,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
         slru_errcause = SLRU_SEEK_FAILED;
         slru_errno = errno;
         close(fd);
+        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

@@ -631,6 +649,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
         slru_errcause = SLRU_READ_FAILED;
         slru_errno = errno;
         close(fd);
+        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

@@ -638,9 +657,12 @@ SlruPhysicalReadPage(SlruCtl ctl, int pa
     {
         slru_errcause = SLRU_CLOSE_FAILED;
         slru_errno = errno;
+        TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

+    TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, -1, -1);
+
     return true;
 }

@@ -668,6 +690,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
     char        path[MAXPGPATH];
     int            fd = -1;

+    TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_START((uintptr_t)ctl, pageno, slotno);
+
     /*
      * Honor the write-WAL-before-data rule, if appropriate, so that we do not
      * write out data before associated WAL records.  This is the same action
@@ -753,6 +777,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
         {
             slru_errcause = SLRU_OPEN_FAILED;
             slru_errno = errno;
+            TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
             return false;
         }

@@ -781,6 +806,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
         slru_errno = errno;
         if (!fdata)
             close(fd);
+        TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

@@ -794,6 +820,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
         slru_errno = errno;
         if (!fdata)
             close(fd);
+        TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
         return false;
     }

@@ -808,6 +835,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
             slru_errcause = SLRU_FSYNC_FAILED;
             slru_errno = errno;
             close(fd);
+            TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
             return false;
         }

@@ -815,10 +843,12 @@ SlruPhysicalWritePage(SlruCtl ctl, int p
         {
             slru_errcause = SLRU_CLOSE_FAILED;
             slru_errno = errno;
+            TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
             return false;
         }
     }

+    TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(true, -1, -1);
     return true;
 }

Index: src/backend/executor/execScan.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/execScan.c,v
retrieving revision 1.44
diff -u -3 -p -r1.44 execScan.c
--- src/backend/executor/execScan.c    1 Jan 2009 17:23:41 -0000    1.44
+++ src/backend/executor/execScan.c    8 Mar 2009 20:39:02 -0000
@@ -21,6 +21,7 @@
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 static bool tlist_matches_tupdesc(PlanState *ps, List *tlist, Index varno, TupleDesc tupdesc);
@@ -60,6 +61,8 @@ ExecScan(ScanState *node,
     qual = node->ps.qual;
     projInfo = node->ps.ps_ProjInfo;

+    TRACE_POSTGRESQL_EXECUTOR_SCAN((uintptr_t)node, ((Scan *)node->ps.plan)->scanrelid, (uintptr_t)accessMtd);
+
     /*
      * If we have neither a qual to check nor a projection to do, just skip
      * all the overhead and return the raw scan tuple.
Index: src/backend/executor/nodeAgg.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeAgg.c,v
retrieving revision 1.164
diff -u -3 -p -r1.164 nodeAgg.c
--- src/backend/executor/nodeAgg.c    1 Jan 2009 17:23:41 -0000    1.164
+++ src/backend/executor/nodeAgg.c    8 Mar 2009 20:39:02 -0000
@@ -86,6 +86,7 @@
 #include "utils/syscache.h"
 #include "utils/tuplesort.h"
 #include "utils/datum.h"
+#include "pg_trace.h"


 /*
@@ -814,6 +815,8 @@ ExecAgg(AggState *node)
     if (node->agg_done)
         return NULL;

+    TRACE_POSTGRESQL_EXECUTOR_AGG((uintptr_t)node, ((Agg *) node->ss.ps.plan)->aggstrategy);
+
     /*
      * Check to see if we're still projecting out tuples from a previous agg
      * tuple (because there is a function-returning-set in the projection
Index: src/backend/executor/nodeGroup.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeGroup.c,v
retrieving revision 1.73
diff -u -3 -p -r1.73 nodeGroup.c
--- src/backend/executor/nodeGroup.c    1 Jan 2009 17:23:41 -0000    1.73
+++ src/backend/executor/nodeGroup.c    8 Mar 2009 20:39:02 -0000
@@ -24,6 +24,7 @@

 #include "executor/executor.h"
 #include "executor/nodeGroup.h"
+#include "pg_trace.h"


 /*
@@ -49,6 +50,8 @@ ExecGroup(GroupState *node)
     numCols = ((Group *) node->ss.ps.plan)->numCols;
     grpColIdx = ((Group *) node->ss.ps.plan)->grpColIdx;

+    TRACE_POSTGRESQL_EXECUTOR_GROUP((uintptr_t)node, numCols);
+
     /*
      * Check to see if we're still projecting out tuples from a previous group
      * tuple (because there is a function-returning-set in the projection
Index: src/backend/executor/nodeHash.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHash.c,v
retrieving revision 1.117
diff -u -3 -p -r1.117 nodeHash.c
--- src/backend/executor/nodeHash.c    1 Jan 2009 17:23:41 -0000    1.117
+++ src/backend/executor/nodeHash.c    8 Mar 2009 20:39:02 -0000
@@ -35,6 +35,7 @@
 #include "utils/dynahash.h"
 #include "utils/memutils.h"
 #include "utils/lsyscache.h"
+#include "pg_trace.h"


 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
@@ -70,6 +71,8 @@ MultiExecHash(HashState *node)
     ExprContext *econtext;
     uint32        hashvalue;

+    TRACE_POSTGRESQL_EXECUTOR_HASH_MULTI((uintptr_t)node);
+
     /* must provide our own instrumentation support */
     if (node->ps.instrument)
         InstrStartNode(node->ps.instrument);
Index: src/backend/executor/nodeHashjoin.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v
retrieving revision 1.97
diff -u -3 -p -r1.97 nodeHashjoin.c
--- src/backend/executor/nodeHashjoin.c    1 Jan 2009 17:23:41 -0000    1.97
+++ src/backend/executor/nodeHashjoin.c    8 Mar 2009 20:39:02 -0000
@@ -20,6 +20,7 @@
 #include "executor/nodeHash.h"
 #include "executor/nodeHashjoin.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 /* Returns true for JOIN_LEFT and JOIN_ANTI jointypes */
@@ -61,6 +62,8 @@ ExecHashJoin(HashJoinState *node)
     uint32        hashvalue;
     int            batchno;

+    TRACE_POSTGRESQL_EXECUTOR_HASHJOIN((uintptr_t)node);
+
     /*
      * get information from HashJoin node
      */
Index: src/backend/executor/nodeLimit.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeLimit.c,v
retrieving revision 1.36
diff -u -3 -p -r1.36 nodeLimit.c
--- src/backend/executor/nodeLimit.c    4 Mar 2009 10:55:00 -0000    1.36
+++ src/backend/executor/nodeLimit.c    8 Mar 2009 20:39:02 -0000
@@ -23,6 +23,7 @@

 #include "executor/executor.h"
 #include "executor/nodeLimit.h"
+#include "pg_trace.h"

 static void recompute_limits(LimitState *node);

@@ -41,6 +42,8 @@ ExecLimit(LimitState *node)
     TupleTableSlot *slot;
     PlanState  *outerPlan;

+    TRACE_POSTGRESQL_EXECUTOR_LIMIT((uintptr_t)node);
+
     /*
      * get information from the node
      */
Index: src/backend/executor/nodeMaterial.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeMaterial.c,v
retrieving revision 1.65
diff -u -3 -p -r1.65 nodeMaterial.c
--- src/backend/executor/nodeMaterial.c    1 Jan 2009 17:23:42 -0000    1.65
+++ src/backend/executor/nodeMaterial.c    8 Mar 2009 20:39:02 -0000
@@ -24,6 +24,7 @@
 #include "executor/executor.h"
 #include "executor/nodeMaterial.h"
 #include "miscadmin.h"
+#include "pg_trace.h"

 /* ----------------------------------------------------------------
  *        ExecMaterial
@@ -45,6 +46,8 @@ ExecMaterial(MaterialState *node)
     bool        eof_tuplestore;
     TupleTableSlot *slot;

+    TRACE_POSTGRESQL_EXECUTOR_MATERIAL((uintptr_t)node);
+
     /*
      * get state info from node
      */
Index: src/backend/executor/nodeMergejoin.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeMergejoin.c,v
retrieving revision 1.94
diff -u -3 -p -r1.94 nodeMergejoin.c
--- src/backend/executor/nodeMergejoin.c    1 Jan 2009 17:23:42 -0000    1.94
+++ src/backend/executor/nodeMergejoin.c    8 Mar 2009 20:39:02 -0000
@@ -102,6 +102,7 @@
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
+#include "pg_trace.h"


 /*
@@ -565,6 +566,8 @@ ExecMergeJoin(MergeJoinState *node)
     bool        doFillOuter;
     bool        doFillInner;

+    TRACE_POSTGRESQL_EXECUTOR_MERGEJOIN((uintptr_t)node);
+
     /*
      * get information from node
      */
Index: src/backend/executor/nodeNestloop.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeNestloop.c,v
retrieving revision 1.50
diff -u -3 -p -r1.50 nodeNestloop.c
--- src/backend/executor/nodeNestloop.c    1 Jan 2009 17:23:42 -0000    1.50
+++ src/backend/executor/nodeNestloop.c    8 Mar 2009 20:39:02 -0000
@@ -24,6 +24,7 @@
 #include "executor/execdebug.h"
 #include "executor/nodeNestloop.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 /* ----------------------------------------------------------------
@@ -67,6 +68,8 @@ ExecNestLoop(NestLoopState *node)
     List       *otherqual;
     ExprContext *econtext;

+    TRACE_POSTGRESQL_EXECUTOR_NESTLOOP((uintptr_t)node);
+
     /*
      * get information from the node
      */
Index: src/backend/executor/nodeSetOp.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSetOp.c,v
retrieving revision 1.28
diff -u -3 -p -r1.28 nodeSetOp.c
--- src/backend/executor/nodeSetOp.c    1 Jan 2009 17:23:42 -0000    1.28
+++ src/backend/executor/nodeSetOp.c    8 Mar 2009 20:39:02 -0000
@@ -47,6 +47,7 @@
 #include "executor/executor.h"
 #include "executor/nodeSetOp.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 /*
@@ -196,6 +197,8 @@ ExecSetOp(SetOpState *node)
     SetOp       *plannode = (SetOp *) node->ps.plan;
     TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;

+    TRACE_POSTGRESQL_EXECUTOR_SETOP((uintptr_t)node);
+
     /*
      * If the previously-returned tuple needs to be returned more than once,
      * keep returning it.
Index: src/backend/executor/nodeSort.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSort.c,v
retrieving revision 1.63
diff -u -3 -p -r1.63 nodeSort.c
--- src/backend/executor/nodeSort.c    1 Jan 2009 17:23:42 -0000    1.63
+++ src/backend/executor/nodeSort.c    8 Mar 2009 20:39:02 -0000
@@ -19,6 +19,7 @@
 #include "executor/nodeSort.h"
 #include "miscadmin.h"
 #include "utils/tuplesort.h"
+#include "pg_trace.h"


 /* ----------------------------------------------------------------
@@ -53,6 +54,8 @@ ExecSort(SortState *node)
     dir = estate->es_direction;
     tuplesortstate = (Tuplesortstate *) node->tuplesortstate;

+    TRACE_POSTGRESQL_EXECUTOR_SORT((uintptr_t)node, dir);
+
     /*
      * If first time through, read all tuples from outer plan and pass them to
      * tuplesort.c. Subsequent calls just fetch tuples from tuplesort.
Index: src/backend/executor/nodeSubplan.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeSubplan.c,v
retrieving revision 1.96
diff -u -3 -p -r1.96 nodeSubplan.c
--- src/backend/executor/nodeSubplan.c    1 Jan 2009 17:23:42 -0000    1.96
+++ src/backend/executor/nodeSubplan.c    8 Mar 2009 20:39:02 -0000
@@ -27,6 +27,7 @@
 #include "utils/array.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 static Datum ExecSubPlan(SubPlanState *node,
@@ -92,6 +93,8 @@ ExecHashSubPlan(SubPlanState *node,
     ExprContext *innerecontext = node->innerecontext;
     TupleTableSlot *slot;

+    TRACE_POSTGRESQL_EXECUTOR_SUBPLAN_HASH((uintptr_t)node);
+
     /* Shouldn't have any direct correlation Vars */
     if (subplan->parParam != NIL || node->args != NIL)
         elog(ERROR, "hashed subplan with direct correlation not supported");
@@ -227,6 +230,8 @@ ExecScanSubPlan(SubPlanState *node,
     ListCell   *l;
     ArrayBuildState *astate = NULL;

+    TRACE_POSTGRESQL_EXECUTOR_SUBPLAN_SCAN((uintptr_t)node);
+
     /*
      * We are probably in a short-lived expression-evaluation context. Switch
      * to the per-query context for manipulating the child plan's chgParam,
Index: src/backend/executor/nodeUnique.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/executor/nodeUnique.c,v
retrieving revision 1.58
diff -u -3 -p -r1.58 nodeUnique.c
--- src/backend/executor/nodeUnique.c    1 Jan 2009 17:23:42 -0000    1.58
+++ src/backend/executor/nodeUnique.c    8 Mar 2009 20:39:02 -0000
@@ -36,6 +36,7 @@
 #include "executor/executor.h"
 #include "executor/nodeUnique.h"
 #include "utils/memutils.h"
+#include "pg_trace.h"


 /* ----------------------------------------------------------------
@@ -50,6 +51,8 @@ ExecUnique(UniqueState *node)
     TupleTableSlot *slot;
     PlanState  *outerPlan;

+    TRACE_POSTGRESQL_EXECUTOR_UNIQUE((uintptr_t)node);
+
     /*
      * get information from the node
      */
Index: src/backend/utils/probes.d
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/probes.d,v
retrieving revision 1.6
diff -u -3 -p -r1.6 probes.d
--- src/backend/utils/probes.d    1 Jan 2009 17:23:48 -0000    1.6
+++ src/backend/utils/probes.d    8 Mar 2009 20:39:07 -0000
@@ -10,6 +10,7 @@

 /* typedefs used in PostgreSQL */
 #define LocalTransactionId unsigned int
+#define TransactionId unsigned int
 #define LWLockId int
 #define LWLockMode int
 #define LOCKMODE int
@@ -91,4 +92,29 @@ provider postgresql {
     probe xlog__switch();
     probe wal__buffer__write__dirty__start();
     probe wal__buffer__write__dirty__done();
+
+    probe slru__readpage__start(unsigned long, int, bool, TransactionId);
+    probe slru__readpage__done(int);
+    probe slru__readpage__readonly(unsigned long, int, TransactionId);
+    probe slru__writepage__start(unsigned long, int, int);
+    probe slru__writepage__done();
+    probe slru__readpage__physical__start(unsigned long, char *, int, int);
+    probe slru__readpage__physical__done(int, int, int);
+    probe slru__writepage__physical__start(unsigned long, int, int);
+    probe slru__writepage__physical__done(int, int, int);
+
+    probe executor__scan(unsigned long, unsigned int, unsigned long);
+    probe executor__agg(unsigned long, int);
+    probe executor__group(unsigned long, int);
+    probe executor__hash__multi(unsigned long);
+    probe executor__hashjoin(unsigned long);
+    probe executor__limit(unsigned long);
+    probe executor__material(unsigned long);
+    probe executor__mergejoin(unsigned long);
+    probe executor__nestloop(unsigned long);
+    probe executor__setop(unsigned long);
+    probe executor__sort(unsigned long, int);
+    probe executor__subplan__hash(unsigned long);
+    probe executor__subplan__scan(unsigned long);
+    probe executor__unique(unsigned long);
 };

Re: Additional DTrace Probes

From
Robert Lor
Date:
ITAGAKI Takahiro wrote:
> This will introduce SLRU and Executor probes.
> We already have Lock, LWLock, Buffer, I/O and XLogs probes.
>
> I'd like to have the following probes, too. In my experience,
> they could be bottlenecks or consume much time in some situations.
>
>     - idle in transaction
>     - spinlock wait-and-retry
>     - CPU: Trigger execution
>     - CPU: Encoding conversion
>     - Network: send() and recv()
>     - smgr: lseek() calls
>     - Tempfile reads and writes
>
>   
Great suggestions. If you have particular use cases in mind, please send 
them to the list. This will help determine what kind of data to be made 
available via the probes. It'll also be helpful if you know the 
locations for the probes.

-Robert


Re: Additional DTrace Probes

From
Zdenek Kotala
Date:
It looks OK now.
Zdenek

Dne  8.03.09 21:58, Robert Lor napsal(a):

>> 2) Maybe
>>
>> TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, -1, -1);
>>
>> would be better. Because  slru_errcause, slru_errno can contains garbage
>> in situation when everything goes fine. Same for write.   
> 
> I've made the changes per your suggestion although one can argue that 
> the script can check arg0, and if it's true, avoid using arg1 and arg2 
> as they are meaningless.

Thanks.

>>  
>> I think it is committable for 8.4.
>>   
> 
> That would be awesome!

yeah, but it depends on commiters :-)
    Zdenek


Re: Additional DTrace Probes

From
Bruce Momjian
Date:
OK, updated patch attached and applied;  thanks.

---------------------------------------------------------------------------

Zdenek Kotala wrote:
> It looks OK now.
>
>     Zdenek
>
> Dne  8.03.09 21:58, Robert Lor napsal(a):
>
> >> 2) Maybe
> >>
> >> TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, -1, -1);
> >>
> >> would be better. Because  slru_errcause, slru_errno can contains garbage
> >> in situation when everything goes fine. Same for write.
> >
> > I've made the changes per your suggestion although one can argue that
> > the script can check arg0, and if it's true, avoid using arg1 and arg2
> > as they are meaningless.
>
> Thanks.
>
> >>
> >> I think it is committable for 8.4.
> >>
> >
> > That would be awesome!
>
> yeah, but it depends on commiters :-)
>
>         Zdenek
>
> --
> Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
> To make changes to your subscription:
> http://www.postgresql.org/mailpref/pgsql-hackers

--
  Bruce Momjian  <bruce@momjian.us>        http://momjian.us
  EnterpriseDB                             http://enterprisedb.com

  + If your life is a hard drive, Christ can be your backup. +
Index: src/backend/access/transam/slru.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/transam/slru.c,v
retrieving revision 1.45
diff -c -c -r1.45 slru.c
*** src/backend/access/transam/slru.c    1 Jan 2009 17:23:36 -0000    1.45
--- src/backend/access/transam/slru.c    2 Apr 2009 19:01:55 -0000
***************
*** 57,62 ****
--- 57,63 ----
  #include "storage/fd.h"
  #include "storage/shmem.h"
  #include "miscadmin.h"
+ #include "pg_trace.h"


  /*
***************
*** 372,377 ****
--- 373,379 ----
  {
      SlruShared    shared = ctl->shared;

+     TRACE_POSTGRESQL_SLRU_READPAGE_START((uintptr_t)ctl, pageno, write_ok, xid);
      /* Outer loop handles restart if we must wait for someone else's I/O */
      for (;;)
      {
***************
*** 399,404 ****
--- 401,407 ----
              }
              /* Otherwise, it's ready to use */
              SlruRecentlyUsed(shared, slotno);
+             TRACE_POSTGRESQL_SLRU_READPAGE_DONE(slotno);
              return slotno;
          }

***************
*** 446,451 ****
--- 449,455 ----
              SlruReportIOError(ctl, pageno, xid);

          SlruRecentlyUsed(shared, slotno);
+         TRACE_POSTGRESQL_SLRU_READPAGE_DONE(slotno);
          return slotno;
      }
  }
***************
*** 470,475 ****
--- 474,481 ----
      SlruShared    shared = ctl->shared;
      int            slotno;

+     TRACE_POSTGRESQL_SLRU_READPAGE_READONLY((uintptr_t)ctl, pageno, xid);
+
      /* Try to find the page while holding only shared lock */
      LWLockAcquire(shared->ControlLock, LW_SHARED);

***************
*** 511,516 ****
--- 517,524 ----
      int            pageno = shared->page_number[slotno];
      bool        ok;

+     TRACE_POSTGRESQL_SLRU_WRITEPAGE_START((uintptr_t)ctl, pageno, slotno);
+
      /* If a write is in progress, wait for it to finish */
      while (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS &&
             shared->page_number[slotno] == pageno)
***************
*** 525,531 ****
--- 533,542 ----
      if (!shared->page_dirty[slotno] ||
          shared->page_status[slotno] != SLRU_PAGE_VALID ||
          shared->page_number[slotno] != pageno)
+     {
+         TRACE_POSTGRESQL_SLRU_WRITEPAGE_DONE();
          return;
+     }

      /*
       * Mark the slot write-busy, and clear the dirtybit.  After this point, a
***************
*** 569,574 ****
--- 580,587 ----
      /* Now it's okay to ereport if we failed */
      if (!ok)
          SlruReportIOError(ctl, pageno, InvalidTransactionId);
+
+     TRACE_POSTGRESQL_SLRU_WRITEPAGE_DONE();
  }

  /*
***************
*** 593,598 ****
--- 606,613 ----

      SlruFileName(ctl, path, segno);

+     TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_START((uintptr_t)ctl, path, pageno, slotno);
+
      /*
       * In a crash-and-restart situation, it's possible for us to receive
       * commands to set the commit status of transactions whose bits are in
***************
*** 607,612 ****
--- 622,628 ----
          {
              slru_errcause = SLRU_OPEN_FAILED;
              slru_errno = errno;
+             TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
              return false;
          }

***************
*** 614,619 ****
--- 630,636 ----
                  (errmsg("file \"%s\" doesn't exist, reading as zeroes",
                          path)));
          MemSet(shared->page_buffer[slotno], 0, BLCKSZ);
+         TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, -1, -1);
          return true;
      }

***************
*** 622,627 ****
--- 639,645 ----
          slru_errcause = SLRU_SEEK_FAILED;
          slru_errno = errno;
          close(fd);
+         TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
          return false;
      }

***************
*** 631,636 ****
--- 649,655 ----
          slru_errcause = SLRU_READ_FAILED;
          slru_errno = errno;
          close(fd);
+         TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
          return false;
      }

***************
*** 638,646 ****
--- 657,668 ----
      {
          slru_errcause = SLRU_CLOSE_FAILED;
          slru_errno = errno;
+         TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
          return false;
      }

+     TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, -1, -1);
+
      return true;
  }

***************
*** 668,673 ****
--- 690,697 ----
      char        path[MAXPGPATH];
      int            fd = -1;

+     TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_START((uintptr_t)ctl, pageno, slotno);
+
      /*
       * Honor the write-WAL-before-data rule, if appropriate, so that we do not
       * write out data before associated WAL records.  This is the same action
***************
*** 753,758 ****
--- 777,783 ----
          {
              slru_errcause = SLRU_OPEN_FAILED;
              slru_errno = errno;
+             TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
              return false;
          }

***************
*** 781,786 ****
--- 806,812 ----
          slru_errno = errno;
          if (!fdata)
              close(fd);
+         TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
          return false;
      }

***************
*** 794,799 ****
--- 820,826 ----
          slru_errno = errno;
          if (!fdata)
              close(fd);
+         TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
          return false;
      }

***************
*** 808,813 ****
--- 835,841 ----
              slru_errcause = SLRU_FSYNC_FAILED;
              slru_errno = errno;
              close(fd);
+             TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
              return false;
          }

***************
*** 815,824 ****
--- 843,854 ----
          {
              slru_errcause = SLRU_CLOSE_FAILED;
              slru_errno = errno;
+             TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(false, slru_errcause, slru_errno);
              return false;
          }
      }

+     TRACE_POSTGRESQL_SLRU_WRITEPAGE_PHYSICAL_DONE(true, -1, -1);
      return true;
  }

Index: src/backend/executor/execScan.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/execScan.c,v
retrieving revision 1.44
diff -c -c -r1.44 execScan.c
*** src/backend/executor/execScan.c    1 Jan 2009 17:23:41 -0000    1.44
--- src/backend/executor/execScan.c    2 Apr 2009 19:01:55 -0000
***************
*** 20,25 ****
--- 20,26 ----

  #include "executor/executor.h"
  #include "miscadmin.h"
+ #include "pg_trace.h"
  #include "utils/memutils.h"


***************
*** 60,65 ****
--- 61,68 ----
      qual = node->ps.qual;
      projInfo = node->ps.ps_ProjInfo;

+     TRACE_POSTGRESQL_EXECUTOR_SCAN((uintptr_t)node, ((Scan *)node->ps.plan)->scanrelid, (uintptr_t)accessMtd);
+
      /*
       * If we have neither a qual to check nor a projection to do, just skip
       * all the overhead and return the raw scan tuple.
Index: src/backend/executor/nodeAgg.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeAgg.c,v
retrieving revision 1.164
diff -c -c -r1.164 nodeAgg.c
*** src/backend/executor/nodeAgg.c    1 Jan 2009 17:23:41 -0000    1.164
--- src/backend/executor/nodeAgg.c    2 Apr 2009 19:01:55 -0000
***************
*** 79,84 ****
--- 79,85 ----
  #include "parser/parse_agg.h"
  #include "parser/parse_coerce.h"
  #include "parser/parse_oper.h"
+ #include "pg_trace.h"
  #include "utils/acl.h"
  #include "utils/builtins.h"
  #include "utils/lsyscache.h"
***************
*** 814,819 ****
--- 815,822 ----
      if (node->agg_done)
          return NULL;

+     TRACE_POSTGRESQL_EXECUTOR_AGG((uintptr_t)node, ((Agg *) node->ss.ps.plan)->aggstrategy);
+
      /*
       * Check to see if we're still projecting out tuples from a previous agg
       * tuple (because there is a function-returning-set in the projection
Index: src/backend/executor/nodeGroup.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeGroup.c,v
retrieving revision 1.73
diff -c -c -r1.73 nodeGroup.c
*** src/backend/executor/nodeGroup.c    1 Jan 2009 17:23:41 -0000    1.73
--- src/backend/executor/nodeGroup.c    2 Apr 2009 19:01:55 -0000
***************
*** 24,29 ****
--- 24,30 ----

  #include "executor/executor.h"
  #include "executor/nodeGroup.h"
+ #include "pg_trace.h"


  /*
***************
*** 49,54 ****
--- 50,57 ----
      numCols = ((Group *) node->ss.ps.plan)->numCols;
      grpColIdx = ((Group *) node->ss.ps.plan)->grpColIdx;

+     TRACE_POSTGRESQL_EXECUTOR_GROUP((uintptr_t)node, numCols);
+
      /*
       * Check to see if we're still projecting out tuples from a previous group
       * tuple (because there is a function-returning-set in the projection
Index: src/backend/executor/nodeHash.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeHash.c,v
retrieving revision 1.118
diff -c -c -r1.118 nodeHash.c
*** src/backend/executor/nodeHash.c    21 Mar 2009 00:04:38 -0000    1.118
--- src/backend/executor/nodeHash.c    2 Apr 2009 19:01:55 -0000
***************
*** 33,38 ****
--- 33,39 ----
  #include "executor/nodeHashjoin.h"
  #include "miscadmin.h"
  #include "parser/parse_expr.h"
+ #include "pg_trace.h"
  #include "utils/dynahash.h"
  #include "utils/memutils.h"
  #include "utils/lsyscache.h"
***************
*** 79,84 ****
--- 80,87 ----
      ExprContext *econtext;
      uint32        hashvalue;

+     TRACE_POSTGRESQL_EXECUTOR_HASH_MULTI((uintptr_t)node);
+
      /* must provide our own instrumentation support */
      if (node->ps.instrument)
          InstrStartNode(node->ps.instrument);
Index: src/backend/executor/nodeHashjoin.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeHashjoin.c,v
retrieving revision 1.98
diff -c -c -r1.98 nodeHashjoin.c
*** src/backend/executor/nodeHashjoin.c    21 Mar 2009 00:04:38 -0000    1.98
--- src/backend/executor/nodeHashjoin.c    2 Apr 2009 19:01:55 -0000
***************
*** 19,24 ****
--- 19,25 ----
  #include "executor/hashjoin.h"
  #include "executor/nodeHash.h"
  #include "executor/nodeHashjoin.h"
+ #include "pg_trace.h"
  #include "utils/memutils.h"


***************
*** 61,66 ****
--- 62,69 ----
      uint32        hashvalue;
      int            batchno;

+     TRACE_POSTGRESQL_EXECUTOR_HASHJOIN((uintptr_t)node);
+
      /*
       * get information from HashJoin node
       */
Index: src/backend/executor/nodeLimit.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeLimit.c,v
retrieving revision 1.36
diff -c -c -r1.36 nodeLimit.c
*** src/backend/executor/nodeLimit.c    4 Mar 2009 10:55:00 -0000    1.36
--- src/backend/executor/nodeLimit.c    2 Apr 2009 19:01:55 -0000
***************
*** 23,28 ****
--- 23,29 ----

  #include "executor/executor.h"
  #include "executor/nodeLimit.h"
+ #include "pg_trace.h"

  static void recompute_limits(LimitState *node);

***************
*** 41,46 ****
--- 42,49 ----
      TupleTableSlot *slot;
      PlanState  *outerPlan;

+     TRACE_POSTGRESQL_EXECUTOR_LIMIT((uintptr_t)node);
+
      /*
       * get information from the node
       */
Index: src/backend/executor/nodeMaterial.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeMaterial.c,v
retrieving revision 1.66
diff -c -c -r1.66 nodeMaterial.c
*** src/backend/executor/nodeMaterial.c    27 Mar 2009 18:30:21 -0000    1.66
--- src/backend/executor/nodeMaterial.c    2 Apr 2009 19:01:55 -0000
***************
*** 24,29 ****
--- 24,30 ----
  #include "executor/executor.h"
  #include "executor/nodeMaterial.h"
  #include "miscadmin.h"
+ #include "pg_trace.h"

  /* ----------------------------------------------------------------
   *        ExecMaterial
***************
*** 45,50 ****
--- 46,53 ----
      bool        eof_tuplestore;
      TupleTableSlot *slot;

+     TRACE_POSTGRESQL_EXECUTOR_MATERIAL((uintptr_t)node);
+
      /*
       * get state info from node
       */
Index: src/backend/executor/nodeMergejoin.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeMergejoin.c,v
retrieving revision 1.94
diff -c -c -r1.94 nodeMergejoin.c
*** src/backend/executor/nodeMergejoin.c    1 Jan 2009 17:23:42 -0000    1.94
--- src/backend/executor/nodeMergejoin.c    2 Apr 2009 19:01:55 -0000
***************
*** 98,103 ****
--- 98,104 ----
  #include "executor/execdefs.h"
  #include "executor/nodeMergejoin.h"
  #include "miscadmin.h"
+ #include "pg_trace.h"
  #include "utils/acl.h"
  #include "utils/lsyscache.h"
  #include "utils/memutils.h"
***************
*** 565,570 ****
--- 566,573 ----
      bool        doFillOuter;
      bool        doFillInner;

+     TRACE_POSTGRESQL_EXECUTOR_MERGEJOIN((uintptr_t)node);
+
      /*
       * get information from node
       */
Index: src/backend/executor/nodeNestloop.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeNestloop.c,v
retrieving revision 1.50
diff -c -c -r1.50 nodeNestloop.c
*** src/backend/executor/nodeNestloop.c    1 Jan 2009 17:23:42 -0000    1.50
--- src/backend/executor/nodeNestloop.c    2 Apr 2009 19:01:55 -0000
***************
*** 23,28 ****
--- 23,29 ----

  #include "executor/execdebug.h"
  #include "executor/nodeNestloop.h"
+ #include "pg_trace.h"
  #include "utils/memutils.h"


***************
*** 67,72 ****
--- 68,75 ----
      List       *otherqual;
      ExprContext *econtext;

+     TRACE_POSTGRESQL_EXECUTOR_NESTLOOP((uintptr_t)node);
+
      /*
       * get information from the node
       */
Index: src/backend/executor/nodeSetOp.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeSetOp.c,v
retrieving revision 1.28
diff -c -c -r1.28 nodeSetOp.c
*** src/backend/executor/nodeSetOp.c    1 Jan 2009 17:23:42 -0000    1.28
--- src/backend/executor/nodeSetOp.c    2 Apr 2009 19:01:55 -0000
***************
*** 46,51 ****
--- 46,52 ----

  #include "executor/executor.h"
  #include "executor/nodeSetOp.h"
+ #include "pg_trace.h"
  #include "utils/memutils.h"


***************
*** 196,201 ****
--- 197,204 ----
      SetOp       *plannode = (SetOp *) node->ps.plan;
      TupleTableSlot *resultTupleSlot = node->ps.ps_ResultTupleSlot;

+     TRACE_POSTGRESQL_EXECUTOR_SETOP((uintptr_t)node);
+
      /*
       * If the previously-returned tuple needs to be returned more than once,
       * keep returning it.
Index: src/backend/executor/nodeSort.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v
retrieving revision 1.63
diff -c -c -r1.63 nodeSort.c
*** src/backend/executor/nodeSort.c    1 Jan 2009 17:23:42 -0000    1.63
--- src/backend/executor/nodeSort.c    2 Apr 2009 19:01:55 -0000
***************
*** 18,23 ****
--- 18,24 ----
  #include "executor/execdebug.h"
  #include "executor/nodeSort.h"
  #include "miscadmin.h"
+ #include "pg_trace.h"
  #include "utils/tuplesort.h"


***************
*** 53,58 ****
--- 54,61 ----
      dir = estate->es_direction;
      tuplesortstate = (Tuplesortstate *) node->tuplesortstate;

+     TRACE_POSTGRESQL_EXECUTOR_SORT((uintptr_t)node, dir);
+
      /*
       * If first time through, read all tuples from outer plan and pass them to
       * tuplesort.c. Subsequent calls just fetch tuples from tuplesort.
Index: src/backend/executor/nodeSubplan.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeSubplan.c,v
retrieving revision 1.96
diff -c -c -r1.96 nodeSubplan.c
*** src/backend/executor/nodeSubplan.c    1 Jan 2009 17:23:42 -0000    1.96
--- src/backend/executor/nodeSubplan.c    2 Apr 2009 19:01:55 -0000
***************
*** 24,29 ****
--- 24,30 ----
  #include "executor/nodeSubplan.h"
  #include "nodes/makefuncs.h"
  #include "optimizer/clauses.h"
+ #include "pg_trace.h"
  #include "utils/array.h"
  #include "utils/lsyscache.h"
  #include "utils/memutils.h"
***************
*** 92,97 ****
--- 93,100 ----
      ExprContext *innerecontext = node->innerecontext;
      TupleTableSlot *slot;

+     TRACE_POSTGRESQL_EXECUTOR_SUBPLAN_HASH((uintptr_t)node);
+
      /* Shouldn't have any direct correlation Vars */
      if (subplan->parParam != NIL || node->args != NIL)
          elog(ERROR, "hashed subplan with direct correlation not supported");
***************
*** 227,232 ****
--- 230,237 ----
      ListCell   *l;
      ArrayBuildState *astate = NULL;

+     TRACE_POSTGRESQL_EXECUTOR_SUBPLAN_SCAN((uintptr_t)node);
+
      /*
       * We are probably in a short-lived expression-evaluation context. Switch
       * to the per-query context for manipulating the child plan's chgParam,
Index: src/backend/executor/nodeUnique.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/executor/nodeUnique.c,v
retrieving revision 1.58
diff -c -c -r1.58 nodeUnique.c
*** src/backend/executor/nodeUnique.c    1 Jan 2009 17:23:42 -0000    1.58
--- src/backend/executor/nodeUnique.c    2 Apr 2009 19:01:55 -0000
***************
*** 35,40 ****
--- 35,41 ----

  #include "executor/executor.h"
  #include "executor/nodeUnique.h"
+ #include "pg_trace.h"
  #include "utils/memutils.h"


***************
*** 50,55 ****
--- 51,58 ----
      TupleTableSlot *slot;
      PlanState  *outerPlan;

+     TRACE_POSTGRESQL_EXECUTOR_UNIQUE((uintptr_t)node);
+
      /*
       * get information from the node
       */
Index: src/backend/utils/probes.d
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/probes.d,v
retrieving revision 1.9
diff -c -c -r1.9 probes.d
*** src/backend/utils/probes.d    23 Mar 2009 01:52:38 -0000    1.9
--- src/backend/utils/probes.d    2 Apr 2009 19:01:55 -0000
***************
*** 15,20 ****
--- 15,21 ----
   * in probe definitions, as they cause compilation errors on Mac OS X 10.5.
   */
  #define LocalTransactionId unsigned int
+ #define TransactionId unsigned int
  #define LWLockId int
  #define LWLockMode int
  #define LOCKMODE int
***************
*** 90,93 ****
--- 91,119 ----
      probe xlog__switch();
      probe wal__buffer__write__dirty__start();
      probe wal__buffer__write__dirty__done();
+
+     probe slru__readpage__start(unsigned long, int, bool, TransactionId);
+     probe slru__readpage__done(int);
+     probe slru__readpage__readonly(unsigned long, int, TransactionId);
+     probe slru__writepage__start(unsigned long, int, int);
+     probe slru__writepage__done();
+     probe slru__readpage__physical__start(unsigned long, char *, int, int);
+     probe slru__readpage__physical__done(int, int, int);
+     probe slru__writepage__physical__start(unsigned long, int, int);
+     probe slru__writepage__physical__done(int, int, int);
+
+     probe executor__scan(unsigned long, unsigned int, unsigned long);
+     probe executor__agg(unsigned long, int);
+     probe executor__group(unsigned long, int);
+     probe executor__hash__multi(unsigned long);
+     probe executor__hashjoin(unsigned long);
+     probe executor__limit(unsigned long);
+     probe executor__material(unsigned long);
+     probe executor__mergejoin(unsigned long);
+     probe executor__nestloop(unsigned long);
+     probe executor__setop(unsigned long);
+     probe executor__sort(unsigned long, int);
+     probe executor__subplan__hash(unsigned long);
+     probe executor__subplan__scan(unsigned long);
+     probe executor__unique(unsigned long);
  };

Re: Additional DTrace Probes

From
Bruce Momjian
Date:
After talking to Tom, I have reverted this patch.  It does not contain
documentation, and are not properly placed, and was submitted in
February.  I was hoping it would be an easy addition but obviously not
so please resubmit for 8.5.  Sorry.

---------------------------------------------------------------------------

bruce wrote:
> 
> OK, updated patch attached and applied;  thanks.
> 
> ---------------------------------------------------------------------------
> 
> Zdenek Kotala wrote:
> > It looks OK now.
> > 
> >     Zdenek
> > 
> > Dne  8.03.09 21:58, Robert Lor napsal(a):
> > 
> > >> 2) Maybe
> > >>
> > >> TRACE_POSTGRESQL_SLRU_READPAGE_PHYSICAL_DONE(true, -1, -1);
> > >>
> > >> would be better. Because  slru_errcause, slru_errno can contains garbage
> > >> in situation when everything goes fine. Same for write.   
> > > 
> > > I've made the changes per your suggestion although one can argue that 
> > > the script can check arg0, and if it's true, avoid using arg1 and arg2 
> > > as they are meaningless.
> > 
> > Thanks.
> > 
> > >>  
> > >> I think it is committable for 8.4.
> > >>   
> > > 
> > > That would be awesome!
> > 
> > yeah, but it depends on commiters :-)
> > 
> >         Zdenek
> > 
> > -- 
> > Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
> > To make changes to your subscription:
> > http://www.postgresql.org/mailpref/pgsql-hackers
> 
> -- 
>   Bruce Momjian  <bruce@momjian.us>        http://momjian.us
>   EnterpriseDB                             http://enterprisedb.com
> 
>   + If your life is a hard drive, Christ can be your backup. +


--  Bruce Momjian  <bruce@momjian.us>        http://momjian.us EnterpriseDB
http://enterprisedb.com
 + If your life is a hard drive, Christ can be your backup. +


Re: Additional DTrace Probes

From
Tom Lane
Date:
Bruce Momjian <bruce@momjian.us> writes:
> After talking to Tom, I have reverted this patch.  It does not contain
> documentation, and are not properly placed, and was submitted in
> February.  I was hoping it would be an easy addition but obviously not
> so please resubmit for 8.5.  Sorry.

Aside from the lack of documentation, I thought the executor probes
were suffering from a lack of design clarity.  If the intention was
to probe every executor node, why did the patch miss a lot of node
types?  Why not solve it once and for all with a single probe in
ExecProcNode()?  Perhaps the idea was to make it less painful to
trace the usage of particular node types, which is sensible, but then
why did you put a probe in execScan rather than the individual calling
scan-type nodes?  Surely the ability to easily count seqscan vs
indexscan would have to rank mighty high in any evaluation of whether
it's easy to count particular node types.

I was also pretty unhappy with passing the node pointers to the probes.
What are probe routines realistically going to do with those?  We feel
free to whack the contents of executor node types around often, even in
stable branches sometimes.  I do *not* want to encourage probe authors
to write code that depends on the contents of those struct types.
(Heck, we probably shouldn't encourage them to assume the values of enum
NodeTag even, so having a single probe in ExecProcNode is likely not
a good solution.  I will certainly not hold still for any future
suggestions that we should avoid renumbering NodeTag or changing struct
types because someone has written a probe that depends on it.)

So this needs much more thought about where executor probes should go
and what their arguments should be.

I'm also a bit worried about the speed issue.  The existing DTrace
probes are tracking relatively expensive operations like I/O calls.
We *know* that a kernel call per ExecProcNode iteration is highly
expensive; see experience with EXPLAIN ANALYZE overhead on any platform
that hasn't got a fast path for gettimeofday.  I don't think that
dropping probes into the executor loop is something we should do
without careful analysis of what they're good for and whether anyone
would really use them.
        regards, tom lane