Re: Weird index or sort behaviour - Mailing list pgsql-performance

From Tom Lane
Subject Re: Weird index or sort behaviour
Date
Msg-id 9131.1258253421@sss.pgh.pa.us
Whole thread Raw
In response to Re: Weird index or sort behaviour  (Matthew Wakeling <matthew@flymine.org>)
Responses Re: Weird index or sort behaviour  (Matthew Wakeling <matthew@flymine.org>)
List pgsql-performance
Matthew Wakeling <matthew@flymine.org> writes:
> [ discussion about applying materialize to a mergejoin's inner indexscan ]

I have finally gotten round to doing something about this, and applied
the attached patch to CVS HEAD.  Could you test it on your problem case
to see what happens?  If it's not convenient to load your data into
8.5devel, I believe the patch would work all right in 8.4.x.  (A quick
check shows that it applies except for one deletion hunk that has a
conflict due to a comment change; you could easily do that deletion
manually.)

            regards, tom lane

Index: src/backend/nodes/outfuncs.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/nodes/outfuncs.c,v
retrieving revision 1.371
diff -c -r1.371 outfuncs.c
*** src/backend/nodes/outfuncs.c    28 Oct 2009 14:55:38 -0000    1.371
--- src/backend/nodes/outfuncs.c    15 Nov 2009 02:45:20 -0000
***************
*** 1501,1506 ****
--- 1501,1507 ----
      WRITE_NODE_FIELD(path_mergeclauses);
      WRITE_NODE_FIELD(outersortkeys);
      WRITE_NODE_FIELD(innersortkeys);
+     WRITE_BOOL_FIELD(materialize_inner);
  }

  static void
Index: src/backend/optimizer/path/allpaths.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/optimizer/path/allpaths.c,v
retrieving revision 1.188
diff -c -r1.188 allpaths.c
*** src/backend/optimizer/path/allpaths.c    26 Oct 2009 02:26:33 -0000    1.188
--- src/backend/optimizer/path/allpaths.c    15 Nov 2009 02:45:20 -0000
***************
*** 1443,1455 ****
          {
              MergePath  *mp = (MergePath *) path;

!             if (mp->outersortkeys || mp->innersortkeys)
!             {
!                 for (i = 0; i < indent; i++)
!                     printf("\t");
!                 printf("  sortouter=%d sortinner=%d\n",
!                        ((mp->outersortkeys) ? 1 : 0),
!                        ((mp->innersortkeys) ? 1 : 0));
              }
          }

--- 1443,1454 ----
          {
              MergePath  *mp = (MergePath *) path;

!             for (i = 0; i < indent; i++)
!                 printf("\t");
!             printf("  sortouter=%d sortinner=%d materializeinner=%d\n",
!                    ((mp->outersortkeys) ? 1 : 0),
!                    ((mp->innersortkeys) ? 1 : 0),
!                    ((mp->materialize_inner) ? 1 : 0));
              }
          }

Index: src/backend/optimizer/path/costsize.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/optimizer/path/costsize.c,v
retrieving revision 1.211
diff -c -r1.211 costsize.c
*** src/backend/optimizer/path/costsize.c    12 Sep 2009 22:12:03 -0000    1.211
--- src/backend/optimizer/path/costsize.c    15 Nov 2009 02:45:20 -0000
***************
*** 1167,1189 ****
  }

  /*
-  * sort_exceeds_work_mem
-  *      Given a finished Sort plan node, detect whether it is expected to
-  *      spill to disk (ie, will need more than work_mem workspace)
-  *
-  * This assumes there will be no available LIMIT.
-  */
- bool
- sort_exceeds_work_mem(Sort *sort)
- {
-     double        input_bytes = relation_byte_size(sort->plan.plan_rows,
-                                                  sort->plan.plan_width);
-     long        work_mem_bytes = work_mem * 1024L;
-
-     return (input_bytes > work_mem_bytes);
- }
-
- /*
   * cost_material
   *      Determines and returns the cost of materializing a relation, including
   *      the cost of reading the input data.
--- 1167,1172 ----
***************
*** 1543,1549 ****
   *      Determines and returns the cost of joining two relations using the
   *      merge join algorithm.
   *
!  * 'path' is already filled in except for the cost fields
   * 'sjinfo' is extra info about the join for selectivity estimation
   *
   * Notes: path's mergeclauses should be a subset of the joinrestrictinfo list;
--- 1526,1543 ----
   *      Determines and returns the cost of joining two relations using the
   *      merge join algorithm.
   *
!  * Unlike other costsize functions, this routine makes one actual decision:
!  * whether we should materialize the inner path.  We do that either because
!  * the inner path can't support mark/restore, or because it's cheaper to
!  * use an interposed Material node to handle mark/restore.  When the decision
!  * is cost-based it would be logically cleaner to build and cost two separate
!  * paths with and without that flag set; but that would require repeating most
!  * of the calculations here, which are not all that cheap.  Since the choice
!  * will not affect output pathkeys or startup cost, only total cost, there is
!  * no possibility of wanting to keep both paths.  So it seems best to make
!  * the decision here and record it in the path's materialize_inner field.
!  *
!  * 'path' is already filled in except for the cost fields and materialize_inner
   * 'sjinfo' is extra info about the join for selectivity estimation
   *
   * Notes: path's mergeclauses should be a subset of the joinrestrictinfo list;
***************
*** 1561,1567 ****
      List       *innersortkeys = path->innersortkeys;
      Cost        startup_cost = 0;
      Cost        run_cost = 0;
!     Cost        cpu_per_tuple;
      QualCost    merge_qual_cost;
      QualCost    qp_qual_cost;
      double        outer_path_rows = PATH_ROWS(outer_path);
--- 1555,1564 ----
      List       *innersortkeys = path->innersortkeys;
      Cost        startup_cost = 0;
      Cost        run_cost = 0;
!     Cost        cpu_per_tuple,
!                 inner_run_cost,
!                 bare_inner_cost,
!                 mat_inner_cost;
      QualCost    merge_qual_cost;
      QualCost    qp_qual_cost;
      double        outer_path_rows = PATH_ROWS(outer_path);
***************
*** 1606,1615 ****
      /*
       * When there are equal merge keys in the outer relation, the mergejoin
       * must rescan any matching tuples in the inner relation. This means
!      * re-fetching inner tuples.  Our cost model for this is that a re-fetch
!      * costs the same as an original fetch, which is probably an overestimate;
!      * but on the other hand we ignore the bookkeeping costs of mark/restore.
!      * Not clear if it's worth developing a more refined model.
       *
       * For regular inner and outer joins, the number of re-fetches can be
       * estimated approximately as size of merge join output minus size of
--- 1603,1609 ----
      /*
       * When there are equal merge keys in the outer relation, the mergejoin
       * must rescan any matching tuples in the inner relation. This means
!      * re-fetching inner tuples; we have to estimate how often that happens.
       *
       * For regular inner and outer joins, the number of re-fetches can be
       * estimated approximately as size of merge join output minus size of
***************
*** 1641,1647 ****
          if (rescannedtuples < 0)
              rescannedtuples = 0;
      }
!     /* We'll inflate inner run cost this much to account for rescanning */
      rescanratio = 1.0 + (rescannedtuples / inner_path_rows);

      /*
--- 1635,1641 ----
          if (rescannedtuples < 0)
              rescannedtuples = 0;
      }
!     /* We'll inflate various costs this much to account for rescanning */
      rescanratio = 1.0 + (rescannedtuples / inner_path_rows);

      /*
***************
*** 1778,1809 ****
                    -1.0);
          startup_cost += sort_path.startup_cost;
          startup_cost += (sort_path.total_cost - sort_path.startup_cost)
!             * innerstartsel * rescanratio;
!         run_cost += (sort_path.total_cost - sort_path.startup_cost)
!             * (innerendsel - innerstartsel) * rescanratio;
!
!         /*
!          * If the inner sort is expected to spill to disk, we want to add a
!          * materialize node to shield it from the need to handle mark/restore.
!          * This will allow it to perform the last merge pass on-the-fly, while
!          * in most cases not requiring the materialize to spill to disk.
!          * Charge an extra cpu_tuple_cost per tuple to account for the
!          * materialize node.  (Keep this estimate in sync with similar ones in
!          * create_mergejoin_path and create_mergejoin_plan.)
!          */
!         if (relation_byte_size(inner_path_rows, inner_path->parent->width) >
!             (work_mem * 1024L))
!             run_cost += cpu_tuple_cost * inner_path_rows;
      }
      else
      {
          startup_cost += inner_path->startup_cost;
          startup_cost += (inner_path->total_cost - inner_path->startup_cost)
!             * innerstartsel * rescanratio;
!         run_cost += (inner_path->total_cost - inner_path->startup_cost)
!             * (innerendsel - innerstartsel) * rescanratio;
      }

      /* CPU costs */

      /*
--- 1772,1854 ----
                    -1.0);
          startup_cost += sort_path.startup_cost;
          startup_cost += (sort_path.total_cost - sort_path.startup_cost)
!             * innerstartsel;
!         inner_run_cost = (sort_path.total_cost - sort_path.startup_cost)
!             * (innerendsel - innerstartsel);
      }
      else
      {
          startup_cost += inner_path->startup_cost;
          startup_cost += (inner_path->total_cost - inner_path->startup_cost)
!             * innerstartsel;
!         inner_run_cost = (inner_path->total_cost - inner_path->startup_cost)
!             * (innerendsel - innerstartsel);
      }

+     /*
+      * Decide whether we want to materialize the inner input to shield it from
+      * mark/restore and performing re-fetches.  Our cost model for regular
+      * re-fetches is that a re-fetch costs the same as an original fetch,
+      * which is probably an overestimate; but on the other hand we ignore the
+      * bookkeeping costs of mark/restore.  Not clear if it's worth developing
+      * a more refined model.  So we just need to inflate the inner run cost
+      * by rescanratio.
+      */
+     bare_inner_cost = inner_run_cost * rescanratio;
+     /*
+      * When we interpose a Material node the re-fetch cost is assumed to be
+      * just cpu_tuple_cost per tuple, independently of the underlying plan's
+      * cost; but we have to charge an extra cpu_tuple_cost per original fetch
+      * as well.  Note that we're assuming the materialize node will never
+      * spill to disk, since it only has to remember tuples back to the last
+      * mark.  (If there are a huge number of duplicates, our other cost
+      * factors will make the path so expensive that it probably won't get
+      * chosen anyway.)  So we don't use cost_rescan here.
+      *
+      * Note: keep this estimate in sync with create_mergejoin_plan's labeling
+      * of the generated Material node.
+      */
+     mat_inner_cost = inner_run_cost +
+         cpu_tuple_cost * inner_path_rows * rescanratio;
+
+     /* Prefer materializing if it looks cheaper */
+     if (mat_inner_cost < bare_inner_cost)
+         path->materialize_inner = true;
+     /*
+      * Even if materializing doesn't look cheaper, we *must* do it if the
+      * inner path is to be used directly (without sorting) and it doesn't
+      * support mark/restore.
+      *
+      * Since the inner side must be ordered, and only Sorts and IndexScans can
+      * create order to begin with, and they both support mark/restore, you
+      * might think there's no problem --- but you'd be wrong.  Nestloop and
+      * merge joins can *preserve* the order of their inputs, so they can be
+      * selected as the input of a mergejoin, and they don't support
+      * mark/restore at present.
+      */
+     else if (innersortkeys == NIL &&
+              !ExecSupportsMarkRestore(inner_path->pathtype))
+         path->materialize_inner = true;
+     /*
+      * Also, force materializing if the inner path is to be sorted and the
+      * sort is expected to spill to disk.  This is because the final merge
+      * pass can be done on-the-fly if it doesn't have to support mark/restore.
+      * We don't try to adjust the cost estimates for this consideration,
+      * though.
+      */
+     else if (innersortkeys != NIL &&
+              relation_byte_size(inner_path_rows, inner_path->parent->width) >
+              (work_mem * 1024L))
+         path->materialize_inner = true;
+     else
+         path->materialize_inner = false;
+
+     /* Charge the right incremental cost for the chosen case */
+     if (path->materialize_inner)
+         run_cost += mat_inner_cost;
+     else
+         run_cost += bare_inner_cost;
+
      /* CPU costs */

      /*
Index: src/backend/optimizer/plan/createplan.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/optimizer/plan/createplan.c,v
retrieving revision 1.266
diff -c -r1.266 createplan.c
*** src/backend/optimizer/plan/createplan.c    26 Oct 2009 02:26:33 -0000    1.266
--- src/backend/optimizer/plan/createplan.c    15 Nov 2009 02:45:21 -0000
***************
*** 1664,1672 ****
                               best_path->jpath.outerjoinpath->parent->relids);

      /*
!      * Create explicit sort nodes for the outer and inner join paths if
!      * necessary.  The sort cost was already accounted for in the path. Make
!      * sure there are no excess columns in the inputs if sorting.
       */
      if (best_path->outersortkeys)
      {
--- 1664,1671 ----
                               best_path->jpath.outerjoinpath->parent->relids);

      /*
!      * Create explicit sort nodes for the outer and inner paths if necessary.
!      * Make sure there are no excess columns in the inputs if sorting.
       */
      if (best_path->outersortkeys)
      {
***************
*** 1695,1717 ****
          innerpathkeys = best_path->jpath.innerjoinpath->pathkeys;

      /*
!      * If inner plan is a sort that is expected to spill to disk, add a
!      * materialize node to shield it from the need to handle mark/restore.
!      * This will allow it to perform the last merge pass on-the-fly, while in
!      * most cases not requiring the materialize to spill to disk.
!      *
!      * XXX really, Sort oughta do this for itself, probably, to avoid the
!      * overhead of a separate plan node.
       */
!     if (IsA(inner_plan, Sort) &&
!         sort_exceeds_work_mem((Sort *) inner_plan))
      {
          Plan       *matplan = (Plan *) make_material(inner_plan);

          /*
           * We assume the materialize will not spill to disk, and therefore
           * charge just cpu_tuple_cost per tuple.  (Keep this estimate in sync
!          * with similar ones in cost_mergejoin and create_mergejoin_path.)
           */
          copy_plan_costsize(matplan, inner_plan);
          matplan->total_cost += cpu_tuple_cost * matplan->plan_rows;
--- 1694,1710 ----
          innerpathkeys = best_path->jpath.innerjoinpath->pathkeys;

      /*
!      * If specified, add a materialize node to shield the inner plan from
!      * the need to handle mark/restore.
       */
!     if (best_path->materialize_inner)
      {
          Plan       *matplan = (Plan *) make_material(inner_plan);

          /*
           * We assume the materialize will not spill to disk, and therefore
           * charge just cpu_tuple_cost per tuple.  (Keep this estimate in sync
!          * with cost_mergejoin.)
           */
          copy_plan_costsize(matplan, inner_plan);
          matplan->total_cost += cpu_tuple_cost * matplan->plan_rows;
***************
*** 1887,1892 ****
--- 1880,1886 ----
                                 inner_plan,
                                 best_path->jpath.jointype);

+     /* Costs of sort and material steps are included in path cost already */
      copy_path_costsize(&join_plan->join.plan, &best_path->jpath.path);

      return join_plan;
Index: src/backend/optimizer/util/pathnode.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/optimizer/util/pathnode.c,v
retrieving revision 1.154
diff -c -r1.154 pathnode.c
*** src/backend/optimizer/util/pathnode.c    17 Sep 2009 20:49:29 -0000    1.154
--- src/backend/optimizer/util/pathnode.c    15 Nov 2009 02:45:21 -0000
***************
*** 17,23 ****
  #include <math.h>

  #include "catalog/pg_operator.h"
- #include "executor/executor.h"
  #include "miscadmin.h"
  #include "optimizer/clauses.h"
  #include "optimizer/cost.h"
--- 17,22 ----
***************
*** 1414,1460 ****
          pathkeys_contained_in(innersortkeys, inner_path->pathkeys))
          innersortkeys = NIL;

-     /*
-      * If we are not sorting the inner path, we may need a materialize node to
-      * ensure it can be marked/restored.
-      *
-      * Since the inner side must be ordered, and only Sorts and IndexScans can
-      * create order to begin with, and they both support mark/restore, you
-      * might think there's no problem --- but you'd be wrong.  Nestloop and
-      * merge joins can *preserve* the order of their inputs, so they can be
-      * selected as the input of a mergejoin, and they don't support
-      * mark/restore at present.
-      *
-      * Note: Sort supports mark/restore, so no materialize is really needed in
-      * that case; but one may be desirable anyway to optimize the sort.
-      * However, since we aren't representing the sort step separately in the
-      * Path tree, we can't explicitly represent the materialize either. So
-      * that case is not handled here.  Instead, cost_mergejoin has to factor
-      * in the cost and create_mergejoin_plan has to add the plan node.
-      */
-     if (innersortkeys == NIL &&
-         !ExecSupportsMarkRestore(inner_path->pathtype))
-     {
-         Path       *mpath;
-
-         mpath = (Path *) create_material_path(inner_path->parent, inner_path);
-
-         /*
-          * We expect the materialize won't spill to disk (it could only do so
-          * if there were a whole lot of duplicate tuples, which is a case
-          * cost_mergejoin will avoid choosing anyway).    Therefore
-          * cost_material's cost estimate is bogus and we should charge just
-          * cpu_tuple_cost per tuple.  (Keep this estimate in sync with similar
-          * ones in cost_mergejoin and create_mergejoin_plan; also see
-          * cost_rescan.)
-          */
-         mpath->startup_cost = inner_path->startup_cost;
-         mpath->total_cost = inner_path->total_cost;
-         mpath->total_cost += cpu_tuple_cost * inner_path->parent->rows;
-
-         inner_path = mpath;
-     }
-
      pathnode->jpath.path.pathtype = T_MergeJoin;
      pathnode->jpath.path.parent = joinrel;
      pathnode->jpath.jointype = jointype;
--- 1413,1418 ----
***************
*** 1465,1470 ****
--- 1423,1429 ----
      pathnode->path_mergeclauses = mergeclauses;
      pathnode->outersortkeys = outersortkeys;
      pathnode->innersortkeys = innersortkeys;
+     /* pathnode->materialize_inner will be set by cost_mergejoin */

      cost_mergejoin(pathnode, root, sjinfo);

Index: src/include/nodes/relation.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/nodes/relation.h,v
retrieving revision 1.178
diff -c -r1.178 relation.h
*** src/include/nodes/relation.h    26 Oct 2009 02:26:43 -0000    1.178
--- src/include/nodes/relation.h    15 Nov 2009 02:45:21 -0000
***************
*** 835,840 ****
--- 835,848 ----
  /*
   * A mergejoin path has these fields.
   *
+  * Unlike other path types, a MergePath node doesn't represent just a single
+  * run-time plan node: it can represent up to four.  Aside from the MergeJoin
+  * node itself, there can be a Sort node for the outer input, a Sort node
+  * for the inner input, and/or a Material node for the inner input.  We could
+  * represent these nodes by separate path nodes, but considering how many
+  * different merge paths are investigated during a complex join problem,
+  * it seems better to avoid unnecessary palloc overhead.
+  *
   * path_mergeclauses lists the clauses (in the form of RestrictInfos)
   * that will be used in the merge.
   *
***************
*** 846,860 ****
   * outersortkeys (resp. innersortkeys) is NIL if the outer path
   * (resp. inner path) is already ordered appropriately for the
   * mergejoin.  If it is not NIL then it is a PathKeys list describing
!  * the ordering that must be created by an explicit sort step.
   */

  typedef struct MergePath
  {
      JoinPath    jpath;
      List       *path_mergeclauses;        /* join clauses to be used for merge */
!     List       *outersortkeys;    /* keys for explicit sort, if any */
!     List       *innersortkeys;    /* keys for explicit sort, if any */
  } MergePath;

  /*
--- 854,872 ----
   * outersortkeys (resp. innersortkeys) is NIL if the outer path
   * (resp. inner path) is already ordered appropriately for the
   * mergejoin.  If it is not NIL then it is a PathKeys list describing
!  * the ordering that must be created by an explicit Sort node.
!  *
!  * materialize_inner is TRUE if a Material node should be placed atop the
!  * inner input.  This may appear with or without an inner Sort step.
   */

  typedef struct MergePath
  {
      JoinPath    jpath;
      List       *path_mergeclauses;        /* join clauses to be used for merge */
!     List       *outersortkeys;            /* keys for explicit sort, if any */
!     List       *innersortkeys;            /* keys for explicit sort, if any */
!     bool        materialize_inner;        /* add Materialize to inner? */
  } MergePath;

  /*
Index: src/include/optimizer/cost.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/optimizer/cost.h,v
retrieving revision 1.98
diff -c -r1.98 cost.h
*** src/include/optimizer/cost.h    12 Sep 2009 22:12:04 -0000    1.98
--- src/include/optimizer/cost.h    15 Nov 2009 02:45:21 -0000
***************
*** 84,90 ****
  extern void cost_sort(Path *path, PlannerInfo *root,
            List *pathkeys, Cost input_cost, double tuples, int width,
            double limit_tuples);
- extern bool sort_exceeds_work_mem(Sort *sort);
  extern void cost_material(Path *path,
                Cost input_startup_cost, Cost input_total_cost,
                double tuples, int width);
--- 84,89 ----

pgsql-performance by date:

Previous
From: Merlin Moncure
Date:
Subject: Re: SSD + RAID
Next
From: Laszlo Nagy
Date:
Subject: Re: SSD + RAID