Re: Allowing extensions to supply operator-/function-specific info - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Allowing extensions to supply operator-/function-specific info
Date
Msg-id 6044.1548524131@sss.pgh.pa.us
Whole thread Raw
In response to Re: Allowing extensions to supply operator-/function-specific info  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: Allowing extensions to supply operator-/function-specific info  (Robert Haas <robertmhaas@gmail.com>)
List pgsql-hackers
I wrote:
> There's a considerable amount of follow-up work that ought to happen
> now to make use of these capabilities for places that have been
> pain points in the past, such as generate_series() and unnest().
> But I haven't touched that yet.

Attached is an 0004 that makes a stab at providing some intelligence
for unnest() and the integer cases of generate_series().  This only
affects one plan choice in the existing regression tests; I tweaked
that test to keep the plan the same.  I didn't add new test cases
demonstrating the functionality, since it's a bit hard to show it
directly within the constraints of EXPLAIN (COSTS OFF).  We could
do something along the lines of the quick-hack rowcount test in 0003,
perhaps, but that's pretty indirect.

Looking at this, I'm dissatisfied with the amount of new #include's
being dragged into datatype-specific .c files.  I don't really want
to end up with most of utils/adt/ having dependencies on planner
data structures, but that's where we would be headed.  I can think
of a couple of possibilities:

* Instead of putting support functions beside their target function,
group all the core's support functions into one new .c file.  I'm
afraid this would lead to the reverse problem of having to import
lots of datatype-private info into that file.

* Try to refactor the planner's .h files so that there's just one
"external use" header providing stuff like estimate_expression_value,
while keeping PlannerInfo as an opaque struct.  Then importing that
into utils/adt/ files would not represent such a big dependency
footprint.

I find the second choice more appealing, though it's getting a bit
far afield from where this started.  OTOH, lots of other header
refactoring is going on right now, so why not ...

Thoughts?

            regards, tom lane

diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index e457d81..14cc202 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -22,12 +22,15 @@
 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "libpq/pqformat.h"
+#include "nodes/supportnodes.h"
+#include "optimizer/clauses.h"
 #include "utils/array.h"
 #include "utils/arrayaccess.h"
 #include "utils/builtins.h"
 #include "utils/datum.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
+#include "utils/selfuncs.h"
 #include "utils/typcache.h"


@@ -6026,6 +6029,36 @@ array_unnest(PG_FUNCTION_ARGS)
     }
 }

+/*
+ * Planner support function for array_unnest(anyarray)
+ */
+Datum
+array_unnest_support(PG_FUNCTION_ARGS)
+{
+    Node       *rawreq = (Node *) PG_GETARG_POINTER(0);
+    Node       *ret = NULL;
+
+    if (IsA(rawreq, SupportRequestRows))
+    {
+        /* Try to estimate the number of rows returned */
+        SupportRequestRows *req = (SupportRequestRows *) rawreq;
+
+        if (is_funcclause(req->node))    /* be paranoid */
+        {
+            List       *args = ((FuncExpr *) req->node)->args;
+            Node       *arg1;
+
+            /* We can use estimated argument values here */
+            arg1 = estimate_expression_value(req->root, linitial(args));
+
+            req->rows = estimate_array_length(arg1);
+            ret = (Node *) req;
+        }
+    }
+
+    PG_RETURN_POINTER(ret);
+}
+

 /*
  * array_replace/array_remove support
diff --git a/src/backend/utils/adt/int.c b/src/backend/utils/adt/int.c
index fd82a83..263920c 100644
--- a/src/backend/utils/adt/int.c
+++ b/src/backend/utils/adt/int.c
@@ -30,11 +30,14 @@

 #include <ctype.h>
 #include <limits.h>
+#include <math.h>

 #include "catalog/pg_type.h"
 #include "common/int.h"
 #include "funcapi.h"
 #include "libpq/pqformat.h"
+#include "nodes/supportnodes.h"
+#include "optimizer/clauses.h"
 #include "utils/array.h"
 #include "utils/builtins.h"

@@ -1427,3 +1430,73 @@ generate_series_step_int4(PG_FUNCTION_ARGS)
         /* do when there is no more left */
         SRF_RETURN_DONE(funcctx);
 }
+
+/*
+ * Planner support function for generate_series(int4, int4 [, int4])
+ */
+Datum
+generate_series_int4_support(PG_FUNCTION_ARGS)
+{
+    Node       *rawreq = (Node *) PG_GETARG_POINTER(0);
+    Node       *ret = NULL;
+
+    if (IsA(rawreq, SupportRequestRows))
+    {
+        /* Try to estimate the number of rows returned */
+        SupportRequestRows *req = (SupportRequestRows *) rawreq;
+
+        if (is_funcclause(req->node))    /* be paranoid */
+        {
+            List       *args = ((FuncExpr *) req->node)->args;
+            Node       *arg1,
+                       *arg2,
+                       *arg3;
+
+            /* We can use estimated argument values here */
+            arg1 = estimate_expression_value(req->root, linitial(args));
+            arg2 = estimate_expression_value(req->root, lsecond(args));
+            if (list_length(args) >= 3)
+                arg3 = estimate_expression_value(req->root, lthird(args));
+            else
+                arg3 = NULL;
+
+            /*
+             * If any argument is constant NULL, we can safely assume that
+             * zero rows are returned.  Otherwise, if they're all non-NULL
+             * constants, we can calculate the number of rows that will be
+             * returned.  Use double arithmetic to avoid overflow hazards.
+             */
+            if ((IsA(arg1, Const) &&
+                 ((Const *) arg1)->constisnull) ||
+                (IsA(arg2, Const) &&
+                 ((Const *) arg2)->constisnull) ||
+                (arg3 != NULL && IsA(arg3, Const) &&
+                 ((Const *) arg3)->constisnull))
+            {
+                req->rows = 0;
+                ret = (Node *) req;
+            }
+            else if (IsA(arg1, Const) &&
+                     IsA(arg2, Const) &&
+                     (arg3 == NULL || IsA(arg3, Const)))
+            {
+                double        start,
+                            finish,
+                            step;
+
+                start = DatumGetInt32(((Const *) arg1)->constvalue);
+                finish = DatumGetInt32(((Const *) arg2)->constvalue);
+                step = arg3 ? DatumGetInt32(((Const *) arg3)->constvalue) : 1;
+
+                /* This equation works for either sign of step */
+                if (step != 0)
+                {
+                    req->rows = floor((finish - start + step) / step);
+                    ret = (Node *) req;
+                }
+            }
+        }
+    }
+
+    PG_RETURN_POINTER(ret);
+}
diff --git a/src/backend/utils/adt/int8.c b/src/backend/utils/adt/int8.c
index d16cc9e..5157de4 100644
--- a/src/backend/utils/adt/int8.c
+++ b/src/backend/utils/adt/int8.c
@@ -20,6 +20,8 @@
 #include "common/int.h"
 #include "funcapi.h"
 #include "libpq/pqformat.h"
+#include "nodes/supportnodes.h"
+#include "optimizer/clauses.h"
 #include "utils/int8.h"
 #include "utils/builtins.h"

@@ -1373,3 +1375,73 @@ generate_series_step_int8(PG_FUNCTION_ARGS)
         /* do when there is no more left */
         SRF_RETURN_DONE(funcctx);
 }
+
+/*
+ * Planner support function for generate_series(int8, int8 [, int8])
+ */
+Datum
+generate_series_int8_support(PG_FUNCTION_ARGS)
+{
+    Node       *rawreq = (Node *) PG_GETARG_POINTER(0);
+    Node       *ret = NULL;
+
+    if (IsA(rawreq, SupportRequestRows))
+    {
+        /* Try to estimate the number of rows returned */
+        SupportRequestRows *req = (SupportRequestRows *) rawreq;
+
+        if (is_funcclause(req->node))    /* be paranoid */
+        {
+            List       *args = ((FuncExpr *) req->node)->args;
+            Node       *arg1,
+                       *arg2,
+                       *arg3;
+
+            /* We can use estimated argument values here */
+            arg1 = estimate_expression_value(req->root, linitial(args));
+            arg2 = estimate_expression_value(req->root, lsecond(args));
+            if (list_length(args) >= 3)
+                arg3 = estimate_expression_value(req->root, lthird(args));
+            else
+                arg3 = NULL;
+
+            /*
+             * If any argument is constant NULL, we can safely assume that
+             * zero rows are returned.  Otherwise, if they're all non-NULL
+             * constants, we can calculate the number of rows that will be
+             * returned.  Use double arithmetic to avoid overflow hazards.
+             */
+            if ((IsA(arg1, Const) &&
+                 ((Const *) arg1)->constisnull) ||
+                (IsA(arg2, Const) &&
+                 ((Const *) arg2)->constisnull) ||
+                (arg3 != NULL && IsA(arg3, Const) &&
+                 ((Const *) arg3)->constisnull))
+            {
+                req->rows = 0;
+                ret = (Node *) req;
+            }
+            else if (IsA(arg1, Const) &&
+                     IsA(arg2, Const) &&
+                     (arg3 == NULL || IsA(arg3, Const)))
+            {
+                double        start,
+                            finish,
+                            step;
+
+                start = DatumGetInt64(((Const *) arg1)->constvalue);
+                finish = DatumGetInt64(((Const *) arg2)->constvalue);
+                step = arg3 ? DatumGetInt64(((Const *) arg3)->constvalue) : 1;
+
+                /* This equation works for either sign of step */
+                if (step != 0)
+                {
+                    req->rows = floor((finish - start + step) / step);
+                    ret = (Node *) req;
+                }
+            }
+        }
+    }
+
+    PG_RETURN_POINTER(ret);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index e5cb5bb..039b596 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1530,9 +1530,12 @@
   proargtypes => 'anyelement _int4 _int4',
   prosrc => 'array_fill_with_lower_bounds' },
 { oid => '2331', descr => 'expand array to set of rows',
-  proname => 'unnest', prorows => '100', proretset => 't',
-  prorettype => 'anyelement', proargtypes => 'anyarray',
+  proname => 'unnest', prorows => '100', prosupport => 'array_unnest_support',
+  proretset => 't', prorettype => 'anyelement', proargtypes => 'anyarray',
   prosrc => 'array_unnest' },
+{ oid => '3996', descr => 'planner support for array_unnest',
+  proname => 'array_unnest_support', prorettype => 'internal',
+  proargtypes => 'internal', prosrc => 'array_unnest_support' },
 { oid => '3167',
   descr => 'remove any occurrences of an element from an array',
   proname => 'array_remove', proisstrict => 'f', prorettype => 'anyarray',
@@ -7536,21 +7539,31 @@

 # non-persistent series generator
 { oid => '1066', descr => 'non-persistent series generator',
-  proname => 'generate_series', prorows => '1000', proretset => 't',
+  proname => 'generate_series', prorows => '1000',
+  prosupport => 'generate_series_int4_support', proretset => 't',
   prorettype => 'int4', proargtypes => 'int4 int4 int4',
   prosrc => 'generate_series_step_int4' },
 { oid => '1067', descr => 'non-persistent series generator',
-  proname => 'generate_series', prorows => '1000', proretset => 't',
+  proname => 'generate_series', prorows => '1000',
+  prosupport => 'generate_series_int4_support', proretset => 't',
   prorettype => 'int4', proargtypes => 'int4 int4',
   prosrc => 'generate_series_int4' },
+{ oid => '3994', descr => 'planner support for generate_series',
+  proname => 'generate_series_int4_support', prorettype => 'internal',
+  proargtypes => 'internal', prosrc => 'generate_series_int4_support' },
 { oid => '1068', descr => 'non-persistent series generator',
-  proname => 'generate_series', prorows => '1000', proretset => 't',
+  proname => 'generate_series', prorows => '1000',
+  prosupport => 'generate_series_int8_support', proretset => 't',
   prorettype => 'int8', proargtypes => 'int8 int8 int8',
   prosrc => 'generate_series_step_int8' },
 { oid => '1069', descr => 'non-persistent series generator',
-  proname => 'generate_series', prorows => '1000', proretset => 't',
+  proname => 'generate_series', prorows => '1000',
+  prosupport => 'generate_series_int8_support', proretset => 't',
   prorettype => 'int8', proargtypes => 'int8 int8',
   prosrc => 'generate_series_int8' },
+{ oid => '3995', descr => 'planner support for generate_series',
+  proname => 'generate_series_int8_support', prorettype => 'internal',
+  proargtypes => 'internal', prosrc => 'generate_series_int8_support' },
 { oid => '3259', descr => 'non-persistent series generator',
   proname => 'generate_series', prorows => '1000', proretset => 't',
   prorettype => 'numeric', proargtypes => 'numeric numeric numeric',
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
index 588d069..4056afa 100644
--- a/src/test/regress/expected/subselect.out
+++ b/src/test/regress/expected/subselect.out
@@ -904,7 +904,7 @@ select * from int4_tbl where
 --
 explain (verbose, costs off)
 select * from int4_tbl o where (f1, f1) in
-  (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
+  (select f1, generate_series(1,50) / 10 g from int4_tbl i group by f1);
                             QUERY PLAN
 -------------------------------------------------------------------
  Nested Loop Semi Join
@@ -918,9 +918,9 @@ select * from int4_tbl o where (f1, f1) in
                Output: "ANY_subquery".f1, "ANY_subquery".g
                Filter: ("ANY_subquery".f1 = "ANY_subquery".g)
                ->  Result
-                     Output: i.f1, ((generate_series(1, 2)) / 10)
+                     Output: i.f1, ((generate_series(1, 50)) / 10)
                      ->  ProjectSet
-                           Output: generate_series(1, 2), i.f1
+                           Output: generate_series(1, 50), i.f1
                            ->  HashAggregate
                                  Output: i.f1
                                  Group Key: i.f1
@@ -929,7 +929,7 @@ select * from int4_tbl o where (f1, f1) in
 (19 rows)

 select * from int4_tbl o where (f1, f1) in
-  (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
+  (select f1, generate_series(1,50) / 10 g from int4_tbl i group by f1);
  f1
 ----
   0
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql
index 843f511..ccbe8a1 100644
--- a/src/test/regress/sql/subselect.sql
+++ b/src/test/regress/sql/subselect.sql
@@ -498,9 +498,9 @@ select * from int4_tbl where
 --
 explain (verbose, costs off)
 select * from int4_tbl o where (f1, f1) in
-  (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
+  (select f1, generate_series(1,50) / 10 g from int4_tbl i group by f1);
 select * from int4_tbl o where (f1, f1) in
-  (select f1, generate_series(1,2) / 10 g from int4_tbl i group by f1);
+  (select f1, generate_series(1,50) / 10 g from int4_tbl i group by f1);

 --
 -- check for over-optimization of whole-row Var referencing an Append plan

pgsql-hackers by date:

Previous
From: Andrew Gierth
Date:
Subject: Re: Opossum vs. float4 NaN
Next
From: Dmitry Dolgov
Date:
Subject: Re: Index Skip Scan