Re: cache estimates, cache access cost - Mailing list pgsql-hackers

From Greg Smith
Subject Re: cache estimates, cache access cost
Date
Msg-id 4DD09F98.9050406@2ndquadrant.com
Whole thread Raw
In response to cache estimates, cache access cost  (Cédric Villemain <cedric.villemain.debian@gmail.com>)
Responses Re: cache estimates, cache access cost  (Robert Haas <robertmhaas@gmail.com>)
[WIP] cache estimates, cache access cost  (Cédric Villemain <cedric.villemain.debian@gmail.com>)
List pgsql-hackers
Cédric Villemain wrote:
> http://git.postgresql.org/gitweb?p=users/c2main/postgres.git;a=shortlog;h=refs/heads/analyze_cache
>

This rebases easily to make Cédric's changes move to the end; I just
pushed a version with that change to
https://github.com/greg2ndQuadrant/postgres/tree/analyze_cache if anyone
wants a cleaner one to browse.  I've attached a patch too if that's more
your thing.

I'd recommend not getting too stuck on the particular hook Cédric has
added here to compute the cache estimate, which uses mmap and mincore to
figure it out.  It's possible to compute similar numbers, albeit less
accurately, using an approach similar to how pg_buffercache inspects
things.  And I even once wrote a background writer extension that
collected this sort of data as it was running the LRU scan anyway.
Discussions of this idea seem to focus on how the "what's in the cache?"
data is collected, which as far as I'm concerned is the least important
part.  There are multiple options, some work better than others, and
there's no reason that can't be swapped out later.  The more important
question is how to store the data collected and then use it for
optimizing queries.

--
Greg Smith   2ndQuadrant US    greg@2ndQuadrant.com   Baltimore, MD
PostgreSQL Training, Services, and 24x7 Support  www.2ndQuadrant.us


diff --git a/contrib/Makefile b/contrib/Makefile
index 6967767..47652d5 100644
--- a/contrib/Makefile
+++ b/contrib/Makefile
@@ -27,6 +27,7 @@ SUBDIRS = \
         lo        \
         ltree        \
         oid2name    \
+        oscache    \
         pageinspect    \
         passwordcheck    \
         pg_archivecleanup \
diff --git a/contrib/oscache/Makefile b/contrib/oscache/Makefile
new file mode 100644
index 0000000..8d8dcc5
--- /dev/null
+++ b/contrib/oscache/Makefile
@@ -0,0 +1,15 @@
+# contrib/oscache/Makefile
+# Builds the oscache module; USE_PGXS selects an out-of-tree (PGXS) build.
+MODULE_big = oscache
+OBJS = oscache.o
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = contrib/oscache
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/contrib/oscache/oscache.c b/contrib/oscache/oscache.c
new file mode 100644
index 0000000..1ad7dc2
--- /dev/null
+++ b/contrib/oscache/oscache.c
@@ -0,0 +1,162 @@
+/*-------------------------------------------------------------------------
+ *
+ * oscache.c
+ *      Report how much of a relation is resident in the OS page cache.
+ *
+ * Copyright (c) 2011, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *      contrib/oscache/oscache.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/* postgres.h must come before any system header */
+#include "postgres.h" /* general Postgres declarations */
+
+/* { POSIX stuff */
+#include <fcntl.h> /* open, O_RDONLY */
+#include <stdlib.h> /* exit, calloc, free */
+#include <sys/stat.h> /* stat, fstat */
+#include <sys/types.h> /* size_t, mincore */
+#include <unistd.h> /* sysconf, close */
+#include <sys/mman.h> /* mmap, mincore */
+/* } */
+
+/* { PostgreSQL stuff */
+#include "utils/rel.h" /* Relation */
+#include "storage/bufmgr.h"
+#include "catalog/catalog.h" /* relpath */
+/* } */
+
+PG_MODULE_MAGIC;
+
+void        _PG_init(void);
+
+float4 oscache(Relation relation, ForkNumber forkNum);
+
+/*
+ * Module load callback
+ */
+void
+_PG_init(void)
+{
+    /* Install hook. */
+    OSCache_hook = &oscache;
+}
+
+/*
+ * oscache
+ *      Hook implementation: percentage of a relation fork held in OS cache.
+ *
+ * Each segment file of the fork is mapped with mmap() and probed with
+ * mincore().  The result is computed over OS pages (not PostgreSQL blocks)
+ * and lies between 0 and 100; it is 0 when no segment could be opened or
+ * all segments are empty.  Scanning stops at the first segment that cannot
+ * be opened, whatever the reason.
+ */
+float4
+oscache(Relation relation, ForkNumber forkNum)
+{
+    unsigned int segment = 0;
+    char       *relationpath;
+    char        filename[MAXPGPATH];
+    int         fd;
+    int64       total_block_disk = 0;
+    int64       total_block_mem = 0;
+    int64       pageSize = sysconf(_SC_PAGESIZE);
+
+    if (pageSize <= 0)
+        elog(ERROR, "Can not determine the OS page size");
+
+    relationpath = relpathperm(relation->rd_node, forkNum);
+
+    /*
+     * For each segment of the relation.  The first segment file has no
+     * suffix; the following ones are named <path>.1, <path>.2, ...
+     */
+    snprintf(filename, MAXPGPATH, "%s", relationpath);
+    while ((fd = open(filename, O_RDONLY)) != -1)
+    {
+        struct stat st;
+        int64       block_disk = 0;
+        int64       block_mem = 0;
+
+        if (fstat(fd, &st) == -1)
+        {
+            close(fd);
+            elog(ERROR, "Can not stat object file : %s",
+                filename);
+        }
+
+        /* an empty file cannot be mapped and has nothing to count */
+        if (st.st_size != 0)
+        {
+            /* mincore() fills one byte per page, partial last page included */
+            int64       npages = (st.st_size + pageSize - 1) / pageSize;
+            int64       pageIndex;
+            int         save_errno;
+            void       *pa;
+            unsigned char *vec;
+
+            /* TODO We need to split mmap size to be sure (?) to be able to mmap */
+            pa = mmap(NULL, st.st_size, PROT_NONE, MAP_SHARED, fd, 0);
+            if (pa == MAP_FAILED)
+            {
+                save_errno = errno;     /* close() may overwrite errno */
+                close(fd);
+                elog(ERROR, "Can not mmap object file : %s, errno = %i,%s\n"
+                    "This error can happen if there is not enoughtspace in memory to do the projection. "
+                    "Please mail cedric@2ndQuadrant.fr with '[oscache] ENOMEM' as subject.",
+                    filename, save_errno, strerror(save_errno));
+            }
+
+            /* Prepare our vector containing all blocks information */
+            vec = calloc(npages, 1);
+            if (vec == NULL)
+            {
+                munmap(pa, st.st_size);
+                close(fd);
+                elog(ERROR, "Can not calloc object file : %s",
+                    filename);
+            }
+
+            /* Affect vec with mincore */
+            if (mincore(pa, st.st_size, vec) != 0)
+            {
+                save_errno = errno;     /* the cleanup may overwrite errno */
+                free(vec);
+                munmap(pa, st.st_size);
+                close(fd);
+                elog(ERROR, "mincore(%p, %lld, %p): %s\n",
+                    pa, (long long) st.st_size, vec, strerror(save_errno));
+            }
+
+            /* handle the results: bit 0 is set for pages resident in memory */
+            for (pageIndex = 0; pageIndex < npages; pageIndex++)
+            {
+                if (vec[pageIndex] & 1)
+                    block_mem++;
+            }
+            block_disk = npages;
+
+            /* free things; only reached when the mapping was created */
+            free(vec);
+            munmap(pa, st.st_size);
+        }
+        elog(DEBUG1, "oscache %s: %lld of %lld block in linux cache",
+            filename, (long long) block_mem, (long long) block_disk);
+
+        close(fd);
+        total_block_mem += block_mem;
+        total_block_disk += block_disk;
+
+        snprintf(filename, MAXPGPATH, "%s.%u", relationpath, ++segment);
+    }
+    pfree(relationpath);
+
+    /* nothing on disk: avoid dividing by zero */
+    if (total_block_disk == 0)
+        return 0;
+
+    return (float4) (total_block_mem * 100.0 / total_block_disk);
+}
diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 7b62818..25338d0 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1634,6 +1634,26 @@
      </row>

      <row>
+      <entry><structfield>reloscache</structfield></entry>
+      <entry><type>float4</type></entry>
+      <entry></entry>
+      <entry>
+       Percentage of the files in OS cache.  This is only an estimate used by
+       the planner.  It is updated by <command>ANALYZE OSCACHE</command>.
+      </entry>
+     </row>
+
+     <row>
+      <entry><structfield>relpgcache</structfield></entry>
+      <entry><type>float4</type></entry>
+      <entry></entry>
+      <entry>
+       Percentage of the files in PostgreSQL cache.  This is only an estimate used by
+       the planner.  It is updated by <command>ANALYZE OSCACHE</command>.
+      </entry>
+     </row>
+
+     <row>
       <entry><structfield>reltoastrelid</structfield></entry>
       <entry><type>oid</type></entry>
       <entry><literal><link linkend="catalog-pg-class"><structname>pg_class</structname></link>.oid</literal></entry>
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 4cb29b2..7f39a93 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -54,6 +54,8 @@ hashbuild(PG_FUNCTION_ARGS)
     IndexBuildResult *result;
     BlockNumber relpages;
     double        reltuples;
+    float4        reloscache;
+    float4        relpgcache;
     uint32        num_buckets;
     HashBuildState buildstate;

@@ -66,7 +68,7 @@ hashbuild(PG_FUNCTION_ARGS)
              RelationGetRelationName(index));

     /* Estimate the number of rows currently present in the table */
-    estimate_rel_size(heap, NULL, &relpages, &reltuples);
+    estimate_rel_size(heap, NULL, &relpages, &reltuples, &reloscache, &relpgcache);

     /* Initialize the hash index metadata page and initial buckets */
     num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 71c9931..73ba67b 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -756,6 +756,8 @@ InsertPgClassTuple(Relation pg_class_desc,
     values[Anum_pg_class_reltablespace - 1] = ObjectIdGetDatum(rd_rel->reltablespace);
     values[Anum_pg_class_relpages - 1] = Int32GetDatum(rd_rel->relpages);
     values[Anum_pg_class_reltuples - 1] = Float4GetDatum(rd_rel->reltuples);
+    values[Anum_pg_class_reloscache - 1] = Float4GetDatum(rd_rel->reloscache);
+    values[Anum_pg_class_relpgcache - 1] = Float4GetDatum(rd_rel->relpgcache);
     values[Anum_pg_class_reltoastrelid - 1] = ObjectIdGetDatum(rd_rel->reltoastrelid);
     values[Anum_pg_class_reltoastidxid - 1] = ObjectIdGetDatum(rd_rel->reltoastidxid);
     values[Anum_pg_class_relhasindex - 1] = BoolGetDatum(rd_rel->relhasindex);
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 0568a1b..284ab5d 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -86,6 +86,8 @@ static BufferAccessStrategy vac_strategy;

 static void do_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
                bool update_reltuples, bool inh);
+static void do_cache_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
+               bool update_reltuples, bool inh);
 static void BlockSampler_Init(BlockSampler bs, BlockNumber nblocks,
                   int samplesize);
 static bool BlockSampler_HasMore(BlockSampler bs);
@@ -238,13 +240,21 @@ analyze_rel(Oid relid, VacuumStmt *vacstmt,
     /*
      * Do the normal non-recursive ANALYZE.
      */
-    do_analyze_rel(onerel, vacstmt, update_reltuples, false);
+    if (vacstmt->options & (VACOPT_CACHE))
+        do_cache_analyze_rel(onerel, vacstmt, update_reltuples, false);
+    else
+        do_analyze_rel(onerel, vacstmt, update_reltuples, false);

     /*
      * If there are child tables, do recursive ANALYZE.
      */
     if (onerel->rd_rel->relhassubclass)
-        do_analyze_rel(onerel, vacstmt, false, true);
+    {
+        if (vacstmt->options & (VACOPT_CACHE))
+            do_cache_analyze_rel(onerel, vacstmt, false, true);
+        else
+            do_analyze_rel(onerel, vacstmt, false, true);
+    }

     /*
      * Close source relation now, but keep lock so that no one deletes it
@@ -640,6 +650,120 @@ cleanup:
 }

 /*
+ *    do_cache_analyze_rel() -- refresh the cache statistics of one relation
+ *
+ * Stores in pg_class the OS-cache and PostgreSQL-cache percentages of the
+ * main fork of onerel and, unless inh is true, of each of its indexes.
+ * vacstmt and update_relcache are accepted only for symmetry with
+ * do_analyze_rel() and are not used here.  Index statistics and column
+ * statistics are not touched.
+ */
+static void
+do_cache_analyze_rel(Relation onerel, VacuumStmt *vacstmt,
+                     bool update_relcache, bool inh)
+{
+    int            ind;
+    Relation   *Irel;
+    int            nindexes;
+    PGRUsage    ru0;
+    TimestampTz starttime = 0;
+    MemoryContext caller_context;
+    int            save_nestlevel;
+
+    if (inh)
+        ereport(elevel,
+                (errmsg("cache analyzing \"%s.%s\" inheritance tree",
+                        get_namespace_name(RelationGetNamespace(onerel)),
+                        RelationGetRelationName(onerel))));
+    else
+        ereport(elevel,
+                (errmsg("cache analyzing \"%s.%s\"",
+                        get_namespace_name(RelationGetNamespace(onerel)),
+                        RelationGetRelationName(onerel))));
+
+    /*
+     * Set up a working context so that we can easily free whatever junk gets
+     * created.
+     */
+    anl_context = AllocSetContextCreate(CurrentMemoryContext,
+                                        "Analyze",
+                                        ALLOCSET_DEFAULT_MINSIZE,
+                                        ALLOCSET_DEFAULT_INITSIZE,
+                                        ALLOCSET_DEFAULT_MAXSIZE);
+    caller_context = MemoryContextSwitchTo(anl_context);
+
+    /*
+     * Arrange to make GUC variable changes local to this command.
+     */
+    save_nestlevel = NewGUCNestLevel();
+
+    /* measure elapsed time iff autovacuum logging requires it */
+    if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
+    {
+        pg_rusage_init(&ru0);
+        if (Log_autovacuum_min_duration > 0)
+            starttime = GetCurrentTimestamp();
+    }
+
+    /*
+     * Open the relation's indexes so their cache statistics can be refreshed
+     * along with the table's.  If we are doing a recursive scan, we don't
+     * want to touch the parent's indexes at all.
+     */
+    if (!inh)
+        vac_open_indexes(onerel, AccessShareLock, &nindexes, &Irel);
+    else
+    {
+        Irel = NULL;
+        nindexes = 0;
+    }
+
+    /*
+     * Update cache stats in pg_class.
+     */
+    cache_update_relstats(onerel,
+                          RelationGetRelationOSCacheInFork(onerel, MAIN_FORKNUM),
+                          RelationGetRelationPGCacheInFork(onerel, MAIN_FORKNUM),
+                          InvalidTransactionId);
+
+    /*
+     * Same for indexes.
+     */
+    for (ind = 0; ind < nindexes; ind++)
+    {
+        cache_update_relstats(Irel[ind],
+                              RelationGetRelationOSCacheInFork(Irel[ind], MAIN_FORKNUM),
+                              RelationGetRelationPGCacheInFork(Irel[ind], MAIN_FORKNUM),
+                              InvalidTransactionId);
+    }
+
+    /* Done with indexes */
+    vac_close_indexes(nindexes, Irel, NoLock);
+
+    /* Log the action if appropriate */
+    if (IsAutoVacuumWorkerProcess() && Log_autovacuum_min_duration >= 0)
+    {
+        if (Log_autovacuum_min_duration == 0 ||
+            TimestampDifferenceExceeds(starttime, GetCurrentTimestamp(),
+                                       Log_autovacuum_min_duration))
+            ereport(LOG,
+                    (errmsg("automatic cache analyze of table \"%s.%s.%s\" system usage: %s",
+                            get_database_name(MyDatabaseId),
+                            get_namespace_name(RelationGetNamespace(onerel)),
+                            RelationGetRelationName(onerel),
+                            pg_rusage_show(&ru0))));
+    }
+
+    /* Roll back any GUC changes executed by index functions */
+    AtEOXact_GUC(false, save_nestlevel);
+
+    /* Restore current context and release memory */
+    MemoryContextSwitchTo(caller_context);
+    MemoryContextDelete(anl_context);
+    anl_context = NULL;
+}
+
+/*
  * Compute statistics about indexes of a relation
  */
 static void
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 9606569..b45f012 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1096,3 +1096,61 @@ vacuum_delay_point(void)
         CHECK_FOR_INTERRUPTS();
     }
 }
+
+
+/*
+ *    cache_update_relstats() -- update cache statistics for one relation
+ *
+ *  Overwrites reloscache/relpgcache (and possibly relfrozenxid) of the
+ *  relation's pg_class row in place.  /!\ Same comment as function
+ *  vac_update_relstats()
+ */
+void
+cache_update_relstats(Relation relation,
+                      float4 per_oscache, float4 per_pgcache,
+                      TransactionId frozenxid)
+{
+    Oid            relid = RelationGetRelid(relation);
+    Relation    pg_class_rel;
+    HeapTuple    tuple;
+    Form_pg_class classForm;
+    bool        changed = false;
+
+    pg_class_rel = heap_open(RelationRelationId, RowExclusiveLock);
+
+    /* Work on a private copy of the relation's pg_class tuple */
+    tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
+    if (!HeapTupleIsValid(tuple))
+        elog(ERROR, "pg_class entry for relid %u vanished during cache analyze",
+             relid);
+    classForm = (Form_pg_class) GETSTRUCT(tuple);
+
+    /* Store each percentage only when it differs from the current value */
+    if (classForm->reloscache != per_oscache)
+    {
+        classForm->reloscache = per_oscache;
+        changed = true;
+    }
+    if (classForm->relpgcache != per_pgcache)
+    {
+        classForm->relpgcache = per_pgcache;
+        changed = true;
+    }
+
+    /*
+     * Advance relfrozenxid only when given a normal XID that is newer than
+     * the stored one; InvalidTransactionId means "no new data".
+     */
+    if (TransactionIdIsNormal(frozenxid) &&
+        TransactionIdPrecedes(classForm->relfrozenxid, frozenxid))
+    {
+        classForm->relfrozenxid = frozenxid;
+        changed = true;
+    }
+
+    /* Write the tuple back only if something was modified */
+    if (changed)
+        heap_inplace_update(pg_class_rel, tuple);
+
+    heap_close(pg_class_rel, RowExclusiveLock);
+}
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index fd8ea45..39f9eab 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -108,7 +108,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
      */
     if (!inhparent)
         estimate_rel_size(relation, rel->attr_widths - rel->min_attr,
-                          &rel->pages, &rel->tuples);
+                          &rel->pages, &rel->tuples,
+                          &rel->oscache, &rel->pgcache);

     /*
      * Make list of indexes.  Ignore indexes on system catalogs if told to.
@@ -323,11 +324,14 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
             {
                 info->pages = RelationGetNumberOfBlocks(indexRelation);
                 info->tuples = rel->tuples;
+                info->oscache = 0;
+                info->pgcache = 0;
             }
             else
             {
                 estimate_rel_size(indexRelation, NULL,
-                                  &info->pages, &info->tuples);
+                                  &info->pages, &info->tuples,
+                                  &info->oscache, &info->pgcache);
                 if (info->tuples > rel->tuples)
                     info->tuples = rel->tuples;
             }
@@ -362,7 +366,8 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
  */
 void
 estimate_rel_size(Relation rel, int32 *attr_widths,
-                  BlockNumber *pages, double *tuples)
+                  BlockNumber *pages, double *tuples,
+                  float4 *oscache, float4 *pgcache)
 {
     BlockNumber curpages;
     BlockNumber relpages;
@@ -451,21 +456,29 @@ estimate_rel_size(Relation rel, int32 *attr_widths,
                 density = (BLCKSZ - SizeOfPageHeaderData) / tuple_width;
             }
             *tuples = rint(density * (double) curpages);
+            *oscache = (float4) rel->rd_rel->reloscache;
+            *pgcache = (float4) rel->rd_rel->relpgcache;
             break;
         case RELKIND_SEQUENCE:
             /* Sequences always have a known size */
             *pages = 1;
             *tuples = 1;
+            *oscache = 0;
+            *pgcache = 0;
             break;
         case RELKIND_FOREIGN_TABLE:
             /* Just use whatever's in pg_class */
             *pages = rel->rd_rel->relpages;
             *tuples = rel->rd_rel->reltuples;
+            *oscache = 0;
+            *pgcache = 0;
             break;
         default:
             /* else it has no disk storage; probably shouldn't get here? */
             *pages = 0;
             *tuples = 0;
+            *oscache = 0;
+            *pgcache = 0;
             break;
     }
 }
diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y
index 1d39674..cc0d6f5 100644
--- a/src/backend/parser/gram.y
+++ b/src/backend/parser/gram.y
@@ -342,7 +342,7 @@ static void SplitColQualList(List *qualList,

 %type <boolean> opt_instead
 %type <boolean> opt_unique opt_concurrently opt_verbose opt_full
-%type <boolean> opt_freeze opt_default opt_recheck
+%type <boolean> opt_freeze opt_oscache opt_default opt_recheck
 %type <defelt>    opt_binary opt_oids copy_delimiter

 %type <boolean> copy_from
@@ -529,7 +529,7 @@ static void SplitColQualList(List *qualList,
     NULLS_P NUMERIC

     OBJECT_P OF OFF OFFSET OIDS ON ONLY OPERATOR OPTION OPTIONS OR
-    ORDER OUT_P OUTER_P OVER OVERLAPS OVERLAY OWNED OWNER
+    ORDER OSCACHE OUT_P OUTER_P OVER OVERLAPS OVERLAY OWNED OWNER

     PARSER PARTIAL PARTITION PASSING PASSWORD PLACING PLANS POSITION
     PRECEDING PRECISION PRESERVE PREPARE PREPARED PRIMARY
@@ -7801,11 +7801,13 @@ vacuum_option_elem:
         ;

 AnalyzeStmt:
-            analyze_keyword opt_verbose
+            analyze_keyword opt_oscache opt_verbose
                 {
                     VacuumStmt *n = makeNode(VacuumStmt);
                     n->options = VACOPT_ANALYZE;
                     if ($2)
+                        n->options |= VACOPT_CACHE;
+                    if ($3)
                         n->options |= VACOPT_VERBOSE;
                     n->freeze_min_age = -1;
                     n->freeze_table_age = -1;
@@ -7813,16 +7815,18 @@ AnalyzeStmt:
                     n->va_cols = NIL;
                     $$ = (Node *)n;
                 }
-            | analyze_keyword opt_verbose qualified_name opt_name_list
+            | analyze_keyword opt_oscache opt_verbose qualified_name opt_name_list
                 {
                     VacuumStmt *n = makeNode(VacuumStmt);
                     n->options = VACOPT_ANALYZE;
                     if ($2)
+                        n->options |= VACOPT_CACHE;
+                    if ($3)
                         n->options |= VACOPT_VERBOSE;
                     n->freeze_min_age = -1;
                     n->freeze_table_age = -1;
-                    n->relation = $3;
-                    n->va_cols = $4;
+                    n->relation = $4;
+                    n->va_cols = $5;
                     $$ = (Node *)n;
                 }
         ;
@@ -7845,6 +7849,11 @@ opt_freeze: FREEZE                                    { $$ = TRUE; }
             | /*EMPTY*/                                { $$ = FALSE; }
         ;

+opt_oscache:
+            OSCACHE                                    { $$ = TRUE; }
+            | /*EMPTY*/                             { $$ = FALSE; }
+        ;
+
 opt_name_list:
             '(' name_list ')'                        { $$ = $2; }
             | /*EMPTY*/                                { $$ = NIL; }
@@ -12158,6 +12167,7 @@ type_func_name_keyword:
             | LIKE
             | NATURAL
             | NOTNULL
+            | OSCACHE
             | OUTER_P
             | OVER
             | OVERLAPS
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f96685d..5cea929 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -106,6 +106,13 @@ static volatile BufferDesc *BufferAlloc(SMgrRelation smgr,
 static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln);
 static void AtProcExit_Buffers(int code, Datum arg);

+/*
+ * Hooks for plugins to get control in RelationGetRelationOSCacheInFork
+ * and RelationGetRelationPGCacheInFork.  Both start out NULL; while a
+ * hook is NULL the matching function reports 0.
+ */
+oscache_hook_type OSCache_hook = NULL;
+pgcache_hook_type PGCache_hook = NULL;

 /*
  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
@@ -1922,6 +1929,40 @@ RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
     return smgrnblocks(relation->rd_smgr, forkNum);
 }

+/*
+ * RelationGetRelationOSCacheInFork
+ *        Reports the percentage of the given fork's pages that are in the
+ *        OS cache, as computed by the OSCache_hook plugin.  Returns 0 when
+ *        no plugin is installed.
+ */
+float4
+RelationGetRelationOSCacheInFork(Relation relation, ForkNumber forkNum)
+{
+    /* no plugin loaded: there is no built-in way to measure this */
+    if (OSCache_hook == NULL)
+        return 0;
+
+    /* delegate the measurement to the plugin */
+    return (*OSCache_hook) (relation, forkNum);
+}
+
+/*
+ * RelationGetRelationPGCacheInFork
+ *        Reports the percentage of the given fork's pages that are in the
+ *        PostgreSQL cache, as computed by the PGCache_hook plugin.  Returns
+ *        0 when no plugin is installed.
+ */
+float4
+RelationGetRelationPGCacheInFork(Relation relation, ForkNumber forkNum)
+{
+    /* no plugin loaded: there is no built-in way to measure this */
+    if (PGCache_hook == NULL)
+        return 0;
+
+    /* delegate the measurement to the plugin */
+    return (*PGCache_hook) (relation, forkNum);
+}
+
 /* ---------------------------------------------------------------------
  *        DropRelFileNodeBuffers
  *
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index d7e94ff..159096a 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -1417,6 +1417,8 @@ formrdesc(const char *relationName, Oid relationReltype,

     relation->rd_rel->relpages = 1;
     relation->rd_rel->reltuples = 1;
+    relation->rd_rel->reloscache = 0;
+    relation->rd_rel->relpgcache = 0;
     relation->rd_rel->relkind = RELKIND_RELATION;
     relation->rd_rel->relhasoids = hasoids;
     relation->rd_rel->relnatts = (int16) natts;
@@ -2661,6 +2663,8 @@ RelationSetNewRelfilenode(Relation relation, TransactionId freezeXid)
     {
         classform->relpages = 0;    /* it's empty until further notice */
         classform->reltuples = 0;
+        classform->reloscache = 0;
+        classform->relpgcache = 0;
     }
     classform->relfrozenxid = freezeXid;

diff --git a/src/include/catalog/pg_class.h b/src/include/catalog/pg_class.h
index ffcce3c..dc79df5 100644
--- a/src/include/catalog/pg_class.h
+++ b/src/include/catalog/pg_class.h
@@ -45,6 +45,8 @@ CATALOG(pg_class,1259) BKI_BOOTSTRAP BKI_ROWTYPE_OID(83) BKI_SCHEMA_MACRO
     Oid            reltablespace;    /* identifier of table space for relation */
     int4        relpages;        /* # of blocks (not always up-to-date) */
     float4        reltuples;        /* # of tuples (not always up-to-date) */
+    float4        reloscache;        /* % of files in OS cache (not always up-to-date) */
+    float4        relpgcache;        /* % of files in PostgreSQL cache (not always up-to-date) */
     Oid            reltoastrelid;    /* OID of toast table; 0 if none */
     Oid            reltoastidxid;    /* if toast table, OID of chunk_id index */
     bool        relhasindex;    /* T if has (or has had) any indexes */
@@ -92,7 +94,7 @@ typedef FormData_pg_class *Form_pg_class;
  * ----------------
  */

-#define Natts_pg_class                    26
+#define Natts_pg_class                    28
 #define Anum_pg_class_relname            1
 #define Anum_pg_class_relnamespace        2
 #define Anum_pg_class_reltype            3
@@ -103,22 +105,24 @@ typedef FormData_pg_class *Form_pg_class;
 #define Anum_pg_class_reltablespace        8
 #define Anum_pg_class_relpages            9
 #define Anum_pg_class_reltuples            10
-#define Anum_pg_class_reltoastrelid        11
-#define Anum_pg_class_reltoastidxid        12
-#define Anum_pg_class_relhasindex        13
-#define Anum_pg_class_relisshared        14
-#define Anum_pg_class_relpersistence    15
-#define Anum_pg_class_relkind            16
-#define Anum_pg_class_relnatts            17
-#define Anum_pg_class_relchecks            18
-#define Anum_pg_class_relhasoids        19
-#define Anum_pg_class_relhaspkey        20
-#define Anum_pg_class_relhasrules        21
-#define Anum_pg_class_relhastriggers    22
-#define Anum_pg_class_relhassubclass    23
-#define Anum_pg_class_relfrozenxid        24
-#define Anum_pg_class_relacl            25
-#define Anum_pg_class_reloptions        26
+#define Anum_pg_class_reloscache        11
+#define Anum_pg_class_relpgcache        12
+#define Anum_pg_class_reltoastrelid        13
+#define Anum_pg_class_reltoastidxid        14
+#define Anum_pg_class_relhasindex        15
+#define Anum_pg_class_relisshared        16
+#define Anum_pg_class_relpersistence    17
+#define Anum_pg_class_relkind            18
+#define Anum_pg_class_relnatts            19
+#define Anum_pg_class_relchecks            20
+#define Anum_pg_class_relhasoids        21
+#define Anum_pg_class_relhaspkey        22
+#define Anum_pg_class_relhasrules        23
+#define Anum_pg_class_relhastriggers    24
+#define Anum_pg_class_relhassubclass    25
+#define Anum_pg_class_relfrozenxid        26
+#define Anum_pg_class_relacl            27
+#define Anum_pg_class_reloptions        28

 /* ----------------
  *        initial contents of pg_class
@@ -130,13 +134,13 @@ typedef FormData_pg_class *Form_pg_class;
  */

 /* Note: "3" in the relfrozenxid column stands for FirstNormalTransactionId */
-DATA(insert OID = 1247 (  pg_type        PGNSP 71 0 PGUID 0 0 0 0 0 0 0 f f p r 29 0 t f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1247 (  pg_type        PGNSP 71 0 PGUID 0 0 0 0 0 0 0 0 0 f f p r 29 0 t f f f f 3 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1249 (  pg_attribute    PGNSP 75 0 PGUID 0 0 0 0 0 0 0 f f p r 20 0 f f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1249 (  pg_attribute    PGNSP 75 0 PGUID 0 0 0 0 0 0 0 0 0 f f p r 20 0 f f f f f 3 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1255 (  pg_proc        PGNSP 81 0 PGUID 0 0 0 0 0 0 0 f f p r 25 0 t f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1255 (  pg_proc        PGNSP 81 0 PGUID 0 0 0 0 0 0 0 0 0 f f p r 25 0 t f f f f 3 _null_ _null_ ));
 DESCR("");
-DATA(insert OID = 1259 (  pg_class        PGNSP 83 0 PGUID 0 0 0 0 0 0 0 f f p r 26 0 t f f f f 3 _null_ _null_ ));
+DATA(insert OID = 1259 (  pg_class        PGNSP 83 0 PGUID 0 0 0 0 0 0 0 0 0 f f p r 28 0 t f f f f 3 _null_ _null_ ));
 DESCR("");


diff --git a/src/include/commands/vacuum.h b/src/include/commands/vacuum.h
index 79c9f5d..7f1801a 100644
--- a/src/include/commands/vacuum.h
+++ b/src/include/commands/vacuum.h
@@ -155,6 +155,11 @@ extern void vacuum_set_xid_limits(int freeze_min_age, int freeze_table_age,
 extern void vac_update_datfrozenxid(void);
 extern void vacuum_delay_point(void);

+extern void cache_update_relstats(Relation relation,
+                                  float4 per_oscache,
+                                  float4 per_pgcache,
+                                  TransactionId frozenxid);
+
 /* in commands/vacuumlazy.c */
 extern void lazy_vacuum_rel(Relation onerel, VacuumStmt *vacstmt,
                 BufferAccessStrategy bstrategy, bool *scanned_all);
diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h
index ee1881b..bc7a301 100644
--- a/src/include/nodes/parsenodes.h
+++ b/src/include/nodes/parsenodes.h
@@ -2417,7 +2417,8 @@ typedef enum VacuumOption
     VACOPT_VERBOSE = 1 << 2,    /* print progress info */
     VACOPT_FREEZE = 1 << 3,        /* FREEZE option */
     VACOPT_FULL = 1 << 4,        /* FULL (non-concurrent) vacuum */
-    VACOPT_NOWAIT = 1 << 5
+    VACOPT_NOWAIT = 1 << 5,
+    VACOPT_CACHE = 1 << 6        /* do CACHE stats analyze */
 } VacuumOption;

 typedef struct VacuumStmt
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index f659269..3f08bb0 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -408,6 +408,8 @@ typedef struct RelOptInfo
     List       *indexlist;        /* list of IndexOptInfo */
     BlockNumber pages;
     double        tuples;
+    float4        oscache;
+    float4        pgcache;
     struct Plan *subplan;        /* if subquery */
     List       *subrtable;        /* if subquery */
     List       *subrowmark;        /* if subquery */
@@ -466,6 +468,8 @@ typedef struct IndexOptInfo
     /* statistics from pg_class */
     BlockNumber pages;            /* number of disk pages in index */
     double        tuples;            /* number of index tuples in index */
+    float4        oscache;
+    float4        pgcache;

     /* index descriptor information */
     int            ncolumns;        /* number of columns in index */
diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h
index c0b8eda..1dc78d5 100644
--- a/src/include/optimizer/plancat.h
+++ b/src/include/optimizer/plancat.h
@@ -29,7 +29,8 @@ extern void get_relation_info(PlannerInfo *root, Oid relationObjectId,
                   bool inhparent, RelOptInfo *rel);

 extern void estimate_rel_size(Relation rel, int32 *attr_widths,
-                  BlockNumber *pages, double *tuples);
+                              BlockNumber *pages, double *tuples,
+                              float4 *oscache, float4 *pgcache);

 extern int32 get_relation_data_width(Oid relid, int32 *attr_widths);

diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h
index 12c2faf..95a7e3d 100644
--- a/src/include/parser/kwlist.h
+++ b/src/include/parser/kwlist.h
@@ -264,6 +264,7 @@ PG_KEYWORD("option", OPTION, UNRESERVED_KEYWORD)
 PG_KEYWORD("options", OPTIONS, UNRESERVED_KEYWORD)
 PG_KEYWORD("or", OR, RESERVED_KEYWORD)
 PG_KEYWORD("order", ORDER, RESERVED_KEYWORD)
+PG_KEYWORD("oscache", OSCACHE, TYPE_FUNC_NAME_KEYWORD)
 PG_KEYWORD("out", OUT_P, COL_NAME_KEYWORD)
 PG_KEYWORD("outer", OUTER_P, TYPE_FUNC_NAME_KEYWORD)
 PG_KEYWORD("over", OVER, TYPE_FUNC_NAME_KEYWORD)
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index b8fc87e..8b621de 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -179,6 +179,10 @@ extern void CheckPointBuffers(int flags);
 extern BlockNumber BufferGetBlockNumber(Buffer buffer);
 extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
                                 ForkNumber forkNum);
+extern float4 RelationGetRelationOSCacheInFork(Relation relation,
+                                ForkNumber forkNum);
+extern float4 RelationGetRelationPGCacheInFork(Relation relation,
+                                ForkNumber forkNum);
 extern void FlushRelationBuffers(Relation rel);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
@@ -215,4 +219,14 @@ extern void AtProcExit_LocalBuffers(void);
 extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);

+/*
+* Hooks for plugins to get control in
+* RelationGetRelationOSCacheInFork
+* RelationGetRelationPGCacheInFork
+*/
+typedef float4 (*oscache_hook_type) (Relation relation, ForkNumber forkNum);
+extern PGDLLIMPORT oscache_hook_type OSCache_hook;
+typedef float4 (*pgcache_hook_type) (Relation relation, ForkNumber forkNum);
+extern PGDLLIMPORT pgcache_hook_type PGCache_hook;
+
 #endif
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 5e28289..64ef53f 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -963,6 +963,7 @@ OprCacheKey
 OprInfo
 OprProofCacheEntry
 OprProofCacheKey
+oscache_hook_type
 OutputContext
 OverrideSearchPath
 OverrideStackEntry
@@ -973,6 +974,7 @@ PBOOL
 PCtxtHandle
 PFN
 PGAsyncStatusType
+pgcache_hook_type
 PGCALL2
 PGEvent
 PGEventConnDestroy

pgsql-hackers by date:

Previous
From: Robert Haas
Date:
Subject: Re: DOMAINs and CASTs
Next
From: Andrew Dunstan
Date:
Subject: Isolation checks under MSVC