Thread: BUG #15703: Segfault in cancelled CALL-Statements
The following bug has been logged on the website: Bug reference: 15703 Logged by: Julian Schauder Email address: julian.schauder@gmx.de PostgreSQL version: 11.2 Operating system: Ubuntu 18.04 / Linux 4.18.0-16 Description: Hi, there seems to be a SEGFAULT issue with CALL-Procedures once they get SIGINT'ed. While currently able to reproduce with non-disclosed functions and data- I am working on a minimal report for public reproduction. The Segfault occurs within ResourceArrayFree(&(owner->bufferarr)) once the portal gets cleaned up. Initially this was caused by a single SIGINT to a rarely called Procedure. Reproduction currently requires non-disclosed data and functions, alongside a rather desperate instumentation, but it causes a segfault within a few seconds. > while [ true ]; do for x in `seq 1 20`; do bash killer.sh & done ; wait ; done ; > $ cat killer.sh > echo "Starting..." > ID="$RANDOM" > psql -c "CALL procedure ( now()::date ) ;-- $ID " &>/dev/null & > sleep $[ ( $RANDOM % 3 ) + 1 ]s > psql -c "SELECT pg_cancel_backend( pid ) FROM pg_stat_activity where query ilike '%$ID%'; " > echo "Killed" > CREATE OR REPLACE PROCEDURE aggregate_inner(key text, minimumAge date) > LANGUAGE plpgsql > AS $function$ > $function$ > > CREATE OR REPLACE PROCEDURE aggregate_outer(minimumAge date) > LANGUAGE plpgsql > AS $function$ > DECLARE > key text; > BEGIN > for key in select distinct KEY from TABLE where date <= minimumAge LOOP > raise notice 'Aggregating key %', key; > perform aggregate_inner( key, minimumAge ); > COMMIT; > end loop; > END; > CALL procedure ( date ); > postgres[25608]: segfault at 557a3572c928 ip 0000557a3572c928 sp 00007ffd4e79cd08 error 15 > Code: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <80> 00 00 00 00 00 00 00 d8 d0 72 35 7a 55 00 00 58 bb 72 35 7a 55 > (gdb) bt full > #0 0x0000557a3572be78 in ?? () > No symbol table info available. > #1 0x0000557a34c6a3f2 in ResourceArrayFree (resarr=0x557a3572bf38) at ./build/../src/backend/utils/resowner/resowner.c:401 > No locals. > #2 ResourceOwnerDelete (owner=0x557a3572bf18) at ./build/../src/backend/utils/resowner/resowner.c:717 > No locals. > #3 0x0000557a34c682b0 in PortalDrop (portal=0x557a35760040, isTopCommit=<optimized out>) at ./build/../src/backend/utils/mmgr/portalmem.c:565 > isCommit = <optimized out> > __func__ = "PortalDrop" > #4 0x0000557a34c68adb in PortalErrorCleanup () at ./build/../src/backend/utils/mmgr/portalmem.c:922 > portal = <optimized out> > status = {hashp = 0x557a35761f40, curBucket = 8, curEntry = 0x0} > hentry = <optimized out> > #5 0x0000557a34b21dfd in PostgresMain (argc=1, argv=argv@entry=0x557a357244d8, dbname=<optimized out>, username=0x557a35724418 "postgres") > at ./build/../src/backend/tcop/postgres.c:3973 > firstchar = <optimized out> > input_message = {data = 0x557a356da3c0 "CALL documentation.ib_logbuch_aggregieren_jsc ( now()::date ) ;-- 15227 ", len = 73, maxlen = 1024, cursor = 73} > local_sigjmp_buf = {{__jmpbuf = {140725920059072, -8205351387769757095, 1, 93983371052056, 93983371052248, 140725920059936, -8205351388738641319, -2653101812051762599}, > __mask_was_saved = 1, __saved_mask = {__val = {0, 0, 8818781457506372608, 140725920059968, 93983361064142, 140725920059568, 309, 140725920061008, 0, 140725920059936, > 139880161943701, 206158430256, 140725920059544, 140725920059328, 8818781457506372608, 16}}}} > send_ready_for_query = false > disable_idle_in_transaction_timeout = false > __func__ = "PostgresMain" > #6 0x0000557a34aada7d in BackendRun (port=0x557a3571d880) at ./build/../src/backend/postmaster/postmaster.c:4361 > ac = 1 > secs = 606241460 > usecs = 285617 > i = 1 > av = 0x557a357244d8 > maxac = <optimized out> > av = <optimized out> > maxac = <optimized out> > ac = <optimized out> > secs = <optimized out> > usecs = <optimized out> > i = <optimized out> > #7 BackendStartup (port=0x557a3571d880) at ./build/../src/backend/postmaster/postmaster.c:4033 > bn = <optimized out> > pid = <optimized out> > bn = <optimized out> > pid = <optimized out> > save_errno = <optimized out> > #8 ServerLoop () at ./build/../src/backend/postmaster/postmaster.c:1706 > port = <optimized out> > i = <optimized out> > rmask = {fds_bits = {128, 0 <repeats 15 times>}} > selres = <optimized out> > now = <optimized out> > readmask = {fds_bits = {200, 0 <repeats 15 times>}} > nSockets = <optimized out> > last_lockfile_recheck_time = 1552926214 > last_touch_time = 1552926094 > __func__ = "ServerLoop" > #9 0x0000557a34aaeabf in PostmasterMain (argc=5, argv=0x557a356d4060) at ./build/../src/backend/postmaster/postmaster.c:1379 > opt = <optimized out> > status = <optimized out> > userDoption = <optimized out> > listen_addr_saved = <optimized out> > i = <optimized out> > output_config_variable = <optimized out> > __func__ = "PostmasterMain" > #10 0x0000557a3483b4c2 in main (argc=5, argv=0x557a356d4060) at ./build/../src/backend/main/main.c:228
On Tue, Mar 19, 2019 at 10:04:55AM +0000, PG Bug reporting form wrote: > there seems to be a SEGFAULT issue with CALL-Procedures once they get > SIGINT'ed. While currently able to reproduce with non-disclosed > functions and data- I am working on a minimal report for public > reproduction. It would be nice to have a reproducible test case. From what I can see, we don't have the definition of "procedure" which gets called at the highest level, so it is not possible to use your example. I have been playing a bit with query cancellations and some nested procedures, but could not see an issue: CREATE TABLE aa (a int); CREATE OR REPLACE PROCEDURE insert_tuples(num_tuples int) AS $func$ BEGIN CALL insert_tuples_inner(num_tuples); END $func$ LANGUAGE plpgsql; CREATE OR REPLACE PROCEDURE insert_tuples_inner(num_tuples int) AS $func$ BEGIN FOR i IN 1..num_tuples LOOP EXECUTE format('INSERT INTO aa VALUES (' || i || ');'); END LOOP; END $func$ LANGUAGE plpgsql; Then just do that in parallel between two sessions. One for the cancellation: while true; do psql -c "select pg_cancel_backend(pid) from pg_stat_activity where pid <> pg_backend_pid() and query ~ 'insert_tuples'"; sleep 5; done And another one for the repeated procedure calls, which uses CALL insert_tuples(1000000) as query to run multiple times. -- Michael
Attachment
PG Bug reporting form <noreply@postgresql.org> writes: > there seems to be a SEGFAULT issue with CALL-Procedures once they get > SIGINT'ed. What I'm seeing in an assert-enabled build is an assertion failure: TRAP: FailedAssertion("!(portal->resowner == ((void *)0))", File: "portalmem.c"\ , Line: 874) #0 0x00007fbe9cd6beab in raise () from /lib64/libc.so.6 #1 0x00007fbe9cd565b9 in abort () from /lib64/libc.so.6 #2 0x000000000089d9cf in ExceptionalCondition (conditionName=conditionName@entry=0xac34b0 "!(portal->resowner == ((void*)0))", errorType=errorType@entry=0x8ec7c9 "FailedAssertion", fileName=fileName@entry=0xac3163 "portalmem.c", lineNumber=lineNumber@entry=874) at assert.c:54 #3 0x00000000008ca924 in AtCleanup_Portals () at portalmem.c:874 #4 0x00000000005145c7 in CleanupTransaction () at xact.c:2676 #5 0x00000000005186ca in AbortCurrentTransaction () at xact.c:3133 #6 0x0000000000789004 in PostgresMain (argc=1, argv=argv@entry=0x2bc3ad0, dbname=<optimized out>, username=0x2b96518 "tgl")at postgres.c:4033 #7 0x000000000070d91f in BackendRun (port=0x2bbce20, port=0x2bbce20) at postmaster.c:4399 #8 BackendStartup (port=0x2bbce20) at postmaster.c:4090 #9 ServerLoop () at postmaster.c:1703 #10 0x000000000070e81e in PostmasterMain (argc=argc@entry=2, argv=argv@entry=0x2b944c0) at postmaster.c:1376 #11 0x000000000047acae in main (argc=2, argv=0x2b944c0) at main.c:228 (gdb) f 3 #3 0x00000000008ca924 in AtCleanup_Portals () at portalmem.c:874 874 Assert(portal->resowner == NULL); (gdb) p *portal $1 = {name = 0x2bff4b8 "<unnamed portal 1>", prepStmtName = 0x0, portalContext = 0x2c728a0, resowner = 0x2c141d0, cleanup= 0x0, createSubid = 1, activeSubid = 1, sourceText = 0x2c729b8 "SELECT GENERATE_SERIES(1,100000)", commandTag = 0xa10eb3 "SELECT", stmts = 0x2c74e30, cplan = 0x2c75520, portalParams = 0x0, queryEnv = 0x0, strategy = PORTAL_ONE_SELECT, cursorOptions = 4, run_once = false, status = PORTAL_FAILED, portalPinned = true, autoHeld = true, queryDesc = 0x0, tupDesc = 0x2c84118, formats = 0x0, holdStore= 0x2c83ee8, holdContext = 0x2c83dd0, holdSnapshot = 0x0, atStart = false, atEnd = false, portalPos = 10, creation_time = 606411039547087,visible = true} Poking around a bit, it seems like this is a thinko in HoldPinnedPortals: it's not considering the possibility of a failure during HoldPortal(). In your test the failure is triggered by query cancel, but an ordinary execution error while finishing out a cursor's calculations would be a problem too. I think it should not mark the portal as autoHeld until it's been successfully held. The attached patch seems to make the problem go away for me --- can you confirm it fixes your original issue? I find HoldPinnedPortals suspicious in another way: is it really OK for it to mark *all* pinned portals as auto-held? What if we're in a nest of procedures run by different PLs (of which only the innermost will have any idea we're committing)? Will the right things happen if we auto-hold a portal or two and then get a failure on another one? regards, tom lane diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index a92b4541bd..697c980282 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -1262,8 +1262,8 @@ HoldPinnedPortals(void) (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), errmsg("cannot perform transaction commands inside a cursor loop that is not read-only"))); - portal->autoHeld = true; HoldPortal(portal); + portal->autoHeld = true; } } }
I wrote: > Poking around a bit, it seems like this is a thinko in HoldPinnedPortals: > it's not considering the possibility of a failure during HoldPortal(). > In your test the failure is triggered by query cancel, but an ordinary > execution error while finishing out a cursor's calculations would be > a problem too. Pursuant to which observation, here's a fully self-contained test case. regards, tom lane CREATE PROCEDURE crashme() LANGUAGE plpgsql AS $procedure$ DECLARE id_ int; BEGIN FOR id_ IN SELECT 1/(x-1000) from GENERATE_SERIES(1,1000) x LOOP COMMIT; END LOOP; END; $procedure$; call crashme();
Hi, > can you confirm it fixes your original issue? Fixed for original code and minimals.
I wrote: > I find HoldPinnedPortals suspicious in another way: is it really > OK for it to mark *all* pinned portals as auto-held? What if we're > in a nest of procedures run by different PLs (of which only the > innermost will have any idea we're committing)? After taking another look at this, I'm convinced that it's flat out broken. It cannot be up to individual PLs to decide whether or not to call HoldPinnedPortals before issuing a commit. If a PL that needs that calls (a function belonging to) some other PL that thinks it doesn't need that, and the second PL issues a commit or rollback, we'll have a failure for whatever portal(s) the first PL left pinned. I think, therefore, that we should get rid of HoldPinnedPortals and just assign the responsibility for holding pinned portals to the normal code paths, probably PreCommit_Portals and AtAbort_Portals. Another issue in this area is whether it's really safe to hold a portal during transaction abort at all. I'm inclined to think that it is not. HoldPortal can call arbitrary user code, and the idea that we'll let that happen during an already-failed transaction seems borderline insane from here. However, I'm not very sure what we can do instead. As things stand right now, the calls of HoldPinnedPortals from places like exec_stmt_rollback are safe in this sense, because we haven't yet started to abort the current transaction. However, that just raises the question of why we need those calls at all. If a transaction abort happens due to some kind of error, we aren't going to do any pinned-portal holding, so why should we need it in exec_stmt_rollback? Maybe the rule needs to be "we'll do HoldPinnedPortals in SPI_rollback, but not if you reach transaction abort via an actual error". Seems a bit weird though. Thoughts? regards, tom lane
I wrote: >> I find HoldPinnedPortals suspicious in another way: is it really >> OK for it to mark *all* pinned portals as auto-held? What if we're >> in a nest of procedures run by different PLs (of which only the >> innermost will have any idea we're committing)? > After taking another look at this, I'm convinced that it's flat out > broken. It cannot be up to individual PLs to decide whether or not > to call HoldPinnedPortals before issuing a commit. If a PL that needs > that calls (a function belonging to) some other PL that thinks it > doesn't need that, and the second PL issues a commit or rollback, > we'll have a failure for whatever portal(s) the first PL left pinned. After further study, it seems like the most practical thing to do here is to assign the responsibility for calling HoldPinnedPortals to spi.c. If there are any PLs out there that are trying to implement commit/ rollback in procedures without using SPI, it's going to be on their heads to include this step --- but there's a lot of other stuff they are going to need to know if they don't go through SPI, so that doesn't seem too awful, and in any case this change doesn't make it any more complicated for them. Conversely, if there are any PLs out there that are using spi.c and following the existing rule that they should call HoldPinnedPortals for themselves, this won't break them, because running HoldPinnedPortals twice in a row is harmless. Accordingly I propose the attached patch. Aside from moving those calls, adjusting the comments, and fixing the original bug, this inserts a test on portal->status, which I think is necessary by analogy to the existing tests in AtCommit_Portals. regards, tom lane diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c index 6e262d1..22d0fe5 100644 --- a/src/backend/executor/spi.c +++ b/src/backend/executor/spi.c @@ -241,6 +241,14 @@ _SPI_commit(bool chain) (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), errmsg("cannot commit while a subtransaction is active"))); + /* + * Hold any pinned portals that any PLs might be using. We have to do + * this before changing transaction state, since this will run + * user-defined code that might throw an error. + */ + HoldPinnedPortals(); + + /* Start the actual commit */ _SPI_current->internal_xact = true; /* @@ -294,6 +302,15 @@ _SPI_rollback(bool chain) (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), errmsg("cannot roll back while a subtransaction is active"))); + /* + * Hold any pinned portals that any PLs might be using. We have to do + * this before changing transaction state, since this will run + * user-defined code that might throw an error, and in any case couldn't + * be run in an already-aborted transaction. + */ + HoldPinnedPortals(); + + /* Start the actual rollback */ _SPI_current->internal_xact = true; if (chain) diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c index a92b454..334e35b 100644 --- a/src/backend/utils/mmgr/portalmem.c +++ b/src/backend/utils/mmgr/portalmem.c @@ -1226,13 +1226,19 @@ ThereAreNoReadyPortals(void) /* * Hold all pinned portals. * - * A procedural language implementation that uses pinned portals for its - * internally generated cursors can call this in its COMMIT command to convert - * those cursors to held cursors, so that they survive the transaction end. - * We mark those portals as "auto-held" so that exception exit knows to clean - * them up. (In normal, non-exception code paths, the PL needs to clean those - * portals itself, since transaction end won't do it anymore, but that should - * be normal practice anyway.) + * When initiating a COMMIT or ROLLBACK inside a procedure, this must be + * called to protect internally-generated cursors from being dropped during + * the transaction shutdown. Currently, SPI calls this automatically; PLs + * that initiate COMMIT or ROLLBACK some other way are on the hook to do it + * themselves. (Note that we couldn't do this in, say, AtAbort_Portals + * because we need to run user-defined code while persisting a portal. + * It's too late to do that once transaction abort has started.) + * + * We protect such portals by converting them to held cursors. We mark them + * as "auto-held" so that exception exit knows to clean them up. (In normal, + * non-exception code paths, the PL needs to clean such portals itself, since + * transaction end won't do it anymore; but that should be normal practice + * anyway.) */ void HoldPinnedPortals(void) @@ -1262,8 +1268,12 @@ HoldPinnedPortals(void) (errcode(ERRCODE_INVALID_TRANSACTION_TERMINATION), errmsg("cannot perform transaction commands inside a cursor loop that is not read-only"))); - portal->autoHeld = true; + /* Verify it's in a suitable state to be held */ + if (portal->status != PORTAL_READY) + elog(ERROR, "pinned portal is not ready to be auto-held"); + HoldPortal(portal); + portal->autoHeld = true; } } } diff --git a/src/pl/plperl/plperl.c b/src/pl/plperl/plperl.c index 31ba2f2..bda9517 100644 --- a/src/pl/plperl/plperl.c +++ b/src/pl/plperl/plperl.c @@ -3988,8 +3988,6 @@ plperl_spi_commit(void) PG_TRY(); { - HoldPinnedPortals(); - SPI_commit(); SPI_start_transaction(); } @@ -4015,8 +4013,6 @@ plperl_spi_rollback(void) PG_TRY(); { - HoldPinnedPortals(); - SPI_rollback(); SPI_start_transaction(); } diff --git a/src/pl/plpgsql/src/expected/plpgsql_transaction.out b/src/pl/plpgsql/src/expected/plpgsql_transaction.out index ba07453..5f57623 100644 --- a/src/pl/plpgsql/src/expected/plpgsql_transaction.out +++ b/src/pl/plpgsql/src/expected/plpgsql_transaction.out @@ -401,6 +401,50 @@ SELECT * FROM test3; 1 (1 row) +-- failure while trying to persist a cursor across a transaction (bug #15703) +CREATE PROCEDURE cursor_fail_during_commit() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + COMMIT; + END LOOP; + END; +$$; +TRUNCATE test1; +CALL cursor_fail_during_commit(); +ERROR: division by zero +CONTEXT: PL/pgSQL function cursor_fail_during_commit() line 6 at COMMIT +-- note that error occurs during first COMMIT, hence nothing is in test1 +SELECT count(*) FROM test1; + count +------- + 0 +(1 row) + +CREATE PROCEDURE cursor_fail_during_rollback() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + ROLLBACK; + END LOOP; + END; +$$; +TRUNCATE test1; +CALL cursor_fail_during_rollback(); +ERROR: division by zero +CONTEXT: PL/pgSQL function cursor_fail_during_rollback() line 6 at ROLLBACK +SELECT count(*) FROM test1; + count +------- + 0 +(1 row) + -- SET TRANSACTION DO LANGUAGE plpgsql $$ BEGIN diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c index f000500..23aed02 100644 --- a/src/pl/plpgsql/src/pl_exec.c +++ b/src/pl/plpgsql/src/pl_exec.c @@ -4791,8 +4791,6 @@ exec_stmt_close(PLpgSQL_execstate *estate, PLpgSQL_stmt_close *stmt) static int exec_stmt_commit(PLpgSQL_execstate *estate, PLpgSQL_stmt_commit *stmt) { - HoldPinnedPortals(); - if (stmt->chain) SPI_commit_and_chain(); else @@ -4815,8 +4813,6 @@ exec_stmt_commit(PLpgSQL_execstate *estate, PLpgSQL_stmt_commit *stmt) static int exec_stmt_rollback(PLpgSQL_execstate *estate, PLpgSQL_stmt_rollback *stmt) { - HoldPinnedPortals(); - if (stmt->chain) SPI_rollback_and_chain(); else diff --git a/src/pl/plpgsql/src/sql/plpgsql_transaction.sql b/src/pl/plpgsql/src/sql/plpgsql_transaction.sql index 0c137dd..7575655 100644 --- a/src/pl/plpgsql/src/sql/plpgsql_transaction.sql +++ b/src/pl/plpgsql/src/sql/plpgsql_transaction.sql @@ -329,6 +329,44 @@ $$; SELECT * FROM test3; +-- failure while trying to persist a cursor across a transaction (bug #15703) +CREATE PROCEDURE cursor_fail_during_commit() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + COMMIT; + END LOOP; + END; +$$; + +TRUNCATE test1; + +CALL cursor_fail_during_commit(); + +-- note that error occurs during first COMMIT, hence nothing is in test1 +SELECT count(*) FROM test1; + +CREATE PROCEDURE cursor_fail_during_rollback() + LANGUAGE plpgsql +AS $$ + DECLARE id int; + BEGIN + FOR id IN SELECT 1/(x-1000) FROM generate_series(1,1000) x LOOP + INSERT INTO test1 VALUES(id); + ROLLBACK; + END LOOP; + END; +$$; + +TRUNCATE test1; + +CALL cursor_fail_during_rollback(); + +SELECT count(*) FROM test1; + -- SET TRANSACTION DO LANGUAGE plpgsql $$ diff --git a/src/pl/plpython/plpy_plpymodule.c b/src/pl/plpython/plpy_plpymodule.c index 23e49e4..f40f084 100644 --- a/src/pl/plpython/plpy_plpymodule.c +++ b/src/pl/plpython/plpy_plpymodule.c @@ -588,8 +588,6 @@ PLy_commit(PyObject *self, PyObject *args) { PLyExecutionContext *exec_ctx = PLy_current_execution_context(); - HoldPinnedPortals(); - SPI_commit(); SPI_start_transaction(); @@ -604,8 +602,6 @@ PLy_rollback(PyObject *self, PyObject *args) { PLyExecutionContext *exec_ctx = PLy_current_execution_context(); - HoldPinnedPortals(); - SPI_rollback(); SPI_start_transaction();