Re: pgsql: Support Parallel Append plan nodes. - Mailing list pgsql-committers

From Tom Lane
Subject Re: pgsql: Support Parallel Append plan nodes.
Date
Msg-id 2090.1512532919@sss.pgh.pa.us
Whole thread Raw
In response to Re: pgsql: Support Parallel Append plan nodes.  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: pgsql: Support Parallel Append plan nodes.  (Andres Freund <andres@anarazel.de>)
Re: pgsql: Support Parallel Append plan nodes.  (amul sul <sulamul@gmail.com>)
List pgsql-committers
I wrote:
> Robert Haas <rhaas@postgresql.org> writes:
>> Support Parallel Append plan nodes.

> Buildfarm member sidewinder doesn't like this:

It transpires that my machine prairiedog also shows the failure.
I instrumented nodeAppend.c (as per attached patch) to see what
was going on, and I get this trace on prairiedog:

2017-12-05 22:50:30.228 EST [28812] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 0, nplans 6
2017-12-05 22:50:30.228 EST [28812] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a1;
2017-12-05 22:50:30.241 EST [28813] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 0, nplans 6
2017-12-05 22:50:30.241 EST [28813] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a1;
2017-12-05 22:50:30.252 EST [28814] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 0, nplans 6
2017-12-05 22:50:30.252 EST [28814] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a1;
2017-12-05 22:50:30.363 EST [28816] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan 0, fpp 2, nplans 6
2017-12-05 22:50:30.363 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.363 EST [28816] LOG:  choose_next_subplan_for_worker: picking nextplan 0, fpp 2, nplans 6
2017-12-05 22:50:30.363 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.363 EST [28816] LOG:  choose_next_subplan_for_worker: whichplan 0, nextplan 1, fpp 2, nplans 6
2017-12-05 22:50:30.363 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.364 EST [28816] LOG:  choose_next_subplan_for_worker: picking nextplan 1, fpp 2, nplans 6
2017-12-05 22:50:30.364 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.364 EST [28816] LOG:  choose_next_subplan_for_worker: whichplan 1, nextplan 2, fpp 2, nplans 6
2017-12-05 22:50:30.364 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.364 EST [28816] LOG:  choose_next_subplan_for_worker: picking nextplan 2, fpp 2, nplans 6
2017-12-05 22:50:30.364 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.364 EST [28816] LOG:  choose_next_subplan_for_worker: whichplan 2, nextplan 3, fpp 2, nplans 6
2017-12-05 22:50:30.364 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.364 EST [28816] LOG:  choose_next_subplan_for_worker: picking nextplan 3, fpp 2, nplans 6
2017-12-05 22:50:30.364 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.365 EST [28816] LOG:  choose_next_subplan_for_worker: whichplan 3, nextplan 4, fpp 2, nplans 6
2017-12-05 22:50:30.365 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.365 EST [28816] LOG:  choose_next_subplan_for_worker: picking nextplan 4, fpp 2, nplans 6
2017-12-05 22:50:30.365 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.365 EST [28816] LOG:  choose_next_subplan_for_worker: whichplan 4, nextplan 5, fpp 2, nplans 6
2017-12-05 22:50:30.365 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.365 EST [28816] LOG:  choose_next_subplan_for_worker: picking nextplan 5, fpp 2, nplans 6
2017-12-05 22:50:30.365 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.365 EST [28816] LOG:  choose_next_subplan_for_worker: whichplan 5, nextplan 2, fpp 2, nplans 6
2017-12-05 22:50:30.365 EST [28816] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.407 EST [28815] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 2, nplans 6
2017-12-05 22:50:30.407 EST [28815] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.474 EST [28817] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 2, nplans 6
2017-12-05 22:50:30.474 EST [28817] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:50:30.568 EST [28818] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan 0, fpp 6, nplans 6
2017-12-05 22:50:30.568 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.568 EST [28818] LOG:  choose_next_subplan_for_worker: picking nextplan 0, fpp 6, nplans 6
2017-12-05 22:50:30.568 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.569 EST [28818] LOG:  choose_next_subplan_for_worker: whichplan 0, nextplan 1, fpp 6, nplans 6
2017-12-05 22:50:30.569 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.569 EST [28818] LOG:  choose_next_subplan_for_worker: picking nextplan 1, fpp 6, nplans 6
2017-12-05 22:50:30.569 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.569 EST [28818] LOG:  choose_next_subplan_for_worker: whichplan 1, nextplan 2, fpp 6, nplans 6
2017-12-05 22:50:30.569 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.569 EST [28818] LOG:  choose_next_subplan_for_worker: picking nextplan 2, fpp 6, nplans 6
2017-12-05 22:50:30.569 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.570 EST [28818] LOG:  choose_next_subplan_for_worker: whichplan 2, nextplan 3, fpp 6, nplans 6
2017-12-05 22:50:30.570 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.570 EST [28818] LOG:  choose_next_subplan_for_worker: picking nextplan 3, fpp 6, nplans 6
2017-12-05 22:50:30.570 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.570 EST [28818] LOG:  choose_next_subplan_for_worker: whichplan 3, nextplan 4, fpp 6, nplans 6
2017-12-05 22:50:30.570 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.570 EST [28818] LOG:  choose_next_subplan_for_worker: picking nextplan 4, fpp 6, nplans 6
2017-12-05 22:50:30.570 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.570 EST [28818] LOG:  choose_next_subplan_for_worker: whichplan 4, nextplan 5, fpp 6, nplans 6
2017-12-05 22:50:30.570 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:50:30.570 EST [28818] LOG:  choose_next_subplan_for_worker: picking nextplan 5, fpp 6, nplans 6
2017-12-05 22:50:30.570 EST [28818] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
TRAP: FailedAssertion("!(append->first_partial_plan < node->as_nplans)", File: "nodeAppend.c", Line: 520)
2017-12-05 22:50:38.211 EST [28158] LOG:  background worker "parallel worker" (PID 28818) was terminated by signal 6:
Aborttrap 
2017-12-05 22:50:38.211 EST [28158] DETAIL:  Failed process was running: select round(avg(aa)), sum(aa) from a_star a3;

That makes it pretty clear what's going wrong, but why don't we see it
elsewhere?

On my development machine, the same patch yields

2017-12-05 22:42:24.187 EST [843] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 0, nplans 6
2017-12-05 22:42:24.187 EST [843] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a1;
2017-12-05 22:42:24.188 EST [844] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 0, nplans 6
2017-12-05 22:42:24.188 EST [844] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a1;
2017-12-05 22:42:24.188 EST [845] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 0, nplans 6
2017-12-05 22:42:24.188 EST [845] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a1;
2017-12-05 22:42:24.197 EST [846] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 2, nplans 6
2017-12-05 22:42:24.197 EST [846] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:42:24.200 EST [847] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 2, nplans 6
2017-12-05 22:42:24.200 EST [847] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:42:24.200 EST [848] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 2, nplans 6
2017-12-05 22:42:24.200 EST [848] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a2;
2017-12-05 22:42:24.214 EST [849] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 6, nplans 6
2017-12-05 22:42:24.214 EST [849] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:42:24.214 EST [852] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 6, nplans 6
2017-12-05 22:42:24.214 EST [852] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;
2017-12-05 22:42:24.214 EST [853] LOG:  choose_next_subplan_for_worker: whichplan -1, nextplan -1, fpp 6, nplans 6
2017-12-05 22:42:24.214 EST [853] STATEMENT:  select round(avg(aa)), sum(aa) from a_star a3;

I conclude that the reason we don't see the crash except on the slowest
buildfarm critters is that on most machines the leader process manages
to finish off all the subplans before any of the workers get as far as
doing something.  This doesn't give me a warm fuzzy feeling about how
much testing this patch has gotten.

            regards, tom lane

diff --git a/src/backend/executor/nodeAppend.c b/src/backend/executor/nodeAppend.c
index 246a0b2..4c4355b 100644
--- a/src/backend/executor/nodeAppend.c
+++ b/src/backend/executor/nodeAppend.c
@@ -463,6 +463,12 @@ choose_next_subplan_for_worker(AppendState *node)

     LWLockAcquire(&pstate->pa_lock, LW_EXCLUSIVE);

+    elog(LOG, "choose_next_subplan_for_worker: whichplan %d, nextplan %d, fpp %d, nplans %d",
+         node->as_whichplan,
+         pstate->pa_next_plan,
+         append->first_partial_plan,
+         node->as_nplans);
+
     /* Mark just-completed subplan as finished. */
     if (node->as_whichplan != INVALID_SUBPLAN_INDEX)
         node->as_pstate->pa_finished[node->as_whichplan] = true;
@@ -502,6 +508,11 @@ choose_next_subplan_for_worker(AppendState *node)
         }
     }

+    elog(LOG, "choose_next_subplan_for_worker: picking nextplan %d, fpp %d, nplans %d",
+         pstate->pa_next_plan,
+         append->first_partial_plan,
+         node->as_nplans);
+
     /* Pick the plan we found, and advance pa_next_plan one more time. */
     node->as_whichplan = pstate->pa_next_plan++;
     if (pstate->pa_next_plan >= node->as_nplans)

pgsql-committers by date:

Previous
From: Tom Lane
Date:
Subject: pgsql: Adjust regression test cases added by commit ab7271677.
Next
From: Andres Freund
Date:
Subject: Re: pgsql: Support Parallel Append plan nodes.