Thread: BUG #18497: Heap-use-after-free in plpgsql

BUG #18497: Heap-use-after-free in plpgsql

From
PG Bug reporting form
Date:
The following bug has been logged on the website:

Bug reference:      18497
Logged by:          Nikita Kalinin
Email address:      n.kalinin@postgrespro.ru
PostgreSQL version: 16.3
Operating system:   ubuntu 22.04
Description:

When building postgresql on REL_16_STABLE tag with ASAN assertion error:

#0  0x00007f491f4419fc in pthread_kill () from
/lib/x86_64-linux-gnu/libc.so.6
(gdb) bt
#0  0x00007f491f4419fc in pthread_kill () from
/lib/x86_64-linux-gnu/libc.so.6
#1  0x00007f491f3ed476 in raise () from /lib/x86_64-linux-gnu/libc.so.6
#2  0x00007f491f3d37f3 in abort () from /lib/x86_64-linux-gnu/libc.so.6
#3  0x00005557ce0b3c22 in __sanitizer::Abort() ()
#4  0x00005557ce0bf7dc in __sanitizer::Die() ()
#5  0x00005557ce09ec8c in
__asan::ScopedInErrorReport::~ScopedInErrorReport() ()
#6  0x00005557ce09e525 in __asan::ReportGenericError(unsigned long, unsigned
long, unsigned long, unsigned long, bool, unsigned long, unsigned int, bool)
()
#7  0x00005557ce09f24b in __asan_report_load4 ()
#8  0x00005557ce841147 in expr_setup_walker
(node=node@entry=0x61900002e4b8,
    info=info@entry=0x7ffec42a0170) at execExpr.c:2757
#9  0x00005557ce84337d in ExecCreateExprSetupSteps (
    state=state@entry=0x625000070d08, node=node@entry=0x61900002e4b8)
    at execExpr.c:2659
#10 0x00005557ce8515e7 in ExecInitExprWithParams (node=0x61900002e4b8,
    ext_params=ext_params@entry=0x625000075a18) at execExpr.c:180
#11 0x00007f49111a0a85 in exec_eval_simple_expr (
    estate=estate@entry=0x7ffec42a0790, expr=expr@entry=0x62500005aa98,
    result=result@entry=0x7ffec42a0340,
isNull=isNull@entry=0x7ffec42a03d0,
    rettype=rettype@entry=0x7ffec42a03e0,
rettypmod=rettypmod@entry=0x7ffec42a03f0)
    at pl_exec.c:6178
#12 0x00007f49111a3788 in exec_eval_expr
(estate=estate@entry=0x7ffec42a0790,
    expr=expr@entry=0x62500005aa98, isNull=isNull@entry=0x7ffec42a03d0,
    rettype=rettype@entry=0x7ffec42a03e0,
rettypmod=rettypmod@entry=0x7ffec42a03f0) at pl_exec.c:5702
#13 0x00007f49111afb18 in exec_assign_expr (estate=<optimized out>,
target=0x625000075ad0, expr=0x62500005aa98) at pl_exec.c:5034
#14 0x00007f49111aff36 in exec_stmt_assign
(estate=estate@entry=0x7ffec42a0790, stmt=stmt@entry=0x62500005bf30) at
pl_exec.c:2155
#15 0x00007f49111b365c in exec_stmts (estate=estate@entry=0x7ffec42a0790,
stmts=0x62500005bf78) at pl_exec.c:2019
#16 0x00007f49111b5242 in exec_stmt_block
(estate=estate@entry=0x7ffec42a0790, block=block@entry=0x62500005bfc8) at
pl_exec.c:1942
#17 0x00007f49111b54cc in exec_toplevel_block
(estate=estate@entry=0x7ffec42a0790, block=0x62500005bfc8) at
pl_exec.c:1633
#18 0x00007f49111b6234 in plpgsql_exec_function
(func=func@entry=0x629000024ad0, fcinfo=fcinfo@entry=0x625000058100,
simple_eval_estate=simple_eval_estate@entry=0x0,
simple_eval_resowner=simple_eval_resowner@entry=0x0,
procedure_resowner=procedure_resowner@entry=0x0, atomic=<optimized out>) at
pl_exec.c:622
#19 0x00007f49111dfa3f in plpgsql_call_handler (fcinfo=<optimized out>) at
pl_handler.c:277
#20 0x00005557ce874901 in ExecInterpExpr (state=0x625000058028,
econtext=0x625000057d50, isnull=0x7ffec42a0bd0) at execExprInterp.c:734
#21 0x00005557ce8614df in ExecInterpExprStillValid (state=0x625000058028,
econtext=0x625000057d50, isNull=0x7ffec42a0bd0) at execExprInterp.c:1870
#22 0x00005557ce98f19b in ExecEvalExprSwitchContext (isNull=0x7ffec42a0bd0,
econtext=0x625000057d50, state=0x625000058028) at
../../../src/include/executor/executor.h:355
#23 ExecProject (projInfo=0x625000058020) at
../../../src/include/executor/executor.h:389
#24 ExecResult (pstate=<optimized out>) at nodeResult.c:136
#25 0x00005557ce8b104f in ExecProcNodeFirst (node=0x625000057c40) at
execProcnode.c:464
#26 0x00005557ce88f146 in ExecProcNode (node=0x625000057c40) at
../../../src/include/executor/executor.h:273
#27 ExecutePlan (estate=estate@entry=0x625000057a18,
planstate=0x625000057c40, use_parallel_mode=<optimized out>,
use_parallel_mode@entry=false, operation=operation@entry=CMD_SELECT,
sendTuples=true, numberTuples=numberTuples@entry=0,
direction=ForwardScanDirection, dest=0x625000085098, execute_once=true) at
execMain.c:1670
#28 0x00005557ce88f747 in standard_ExecutorRun (queryDesc=0x619000001a98,
direction=ForwardScanDirection, count=0,
execute_once=execute_once@entry=true) at execMain.c:365
#29 0x00005557ce88f9ab in ExecutorRun
(queryDesc=queryDesc@entry=0x619000001a98,
direction=direction@entry=ForwardScanDirection, count=count@entry=0,
execute_once=execute_once@entry=true) at execMain.c:309
#30 0x00005557cf025d95 in PortalRunSelect
(portal=portal@entry=0x625000025a18, forward=forward@entry=true, count=0,
count@entry=9223372036854775807, dest=dest@entry=0x625000085098) at
pquery.c:924
#31 0x00005557cf02c02c in PortalRun (portal=portal@entry=0x625000025a18,
count=count@entry=9223372036854775807, isTopLevel=isTopLevel@entry=true,
run_once=run_once@entry=true, dest=dest@entry=0x625000085098,
altdest=altdest@entry=0x625000085098, qc=<optimized out>) at pquery.c:768
#32 0x00005557cf01fd70 in exec_simple_query
(query_string=query_string@entry=0x625000005218 "select f1();") at
postgres.c:1274
#33 0x00005557cf024b87 in PostgresMain (dbname=dbname@entry=0x6250000020c8
"contrib_regression", username=username@entry=0x6250000020f8 "test") at
postgres.c:4637
#34 0x00005557cedc385d in BackendRun (port=port@entry=0x614000001840) at
postmaster.c:4464
#35 0x00005557cedcbfe6 in BackendStartup (port=port@entry=0x614000001840) at
postmaster.c:4192
#36 0x00005557cedcc5e3 in ServerLoop () at postmaster.c:1782
#37 0x00005557cedcec0e in PostmasterMain (argc=argc@entry=3,
argv=argv@entry=0x6030000002e0) at postmaster.c:1466
#38 0x00005557cea1f054 in main (argc=3, argv=0x6030000002e0) at main.c:198

How to reproduce:
Build postgresql with the following parameters: 
export

ASAN_OPTIONS=detect_leaks=0:abort_on_error=1:disable_coredump=0:strict_string_checks=1:check_initialization_order=1:strict_init_order=1
CPPFLAGS="-Og -fsanitize=address -fsanitize=undefined
-fno-sanitize-recover=all -fno-sanitize=nonnull-attribute -fstack-protector"
LDFLAGS='-fsanitize=address -fsanitize=undefined -static-libasan'
./configure --enable-tap-tests --enable-debug --enable-cassert >/dev/null &&
make -j4 -s && make -j4 -s -C contrib && make check

Two sql files are required:

cat 1.sql
create table t1(a int, b int);
select pg_sleep(1);

cat 2.sql
select pg_sleep(1);

create function g1(out a int, out b int)
as $$
  select 10,20;
$$ language sql;

create function f1()
returns void as $$
declare r record;
begin
  r := g1();
end;
$$ language plpgsql;

select f1();
drop function g1();
create function g1(out a int, out b int)
returns setof record as $$
select * from t1;
$$ language sql;
select f1();
select f1();

Playback script:

( psql -f 1.sql &> 1.log ) &
( psql -f 2.sql &> 2.log ) &
wait


Re: BUG #18497: Heap-use-after-free in plpgsql

From
Tom Lane
Date:
PG Bug reporting form <noreply@postgresql.org> writes:
> When building postgresql on REL_16_STABLE tag with ASAN assertion error:

Thanks for the report!

> Two sql files are required:

This can actually be reproduced more simply: you don't need two
sessions, because it's purely straight-line behavior.  You don't
need ASAN either, as a plain old --enable-cassert (or more
specifically -DCLOBBER_FREED_MEMORY) build will show it too --
or at least it does on HEAD:

regression=# select f1();
ERROR:  unexpected plan node type: 328
CONTEXT:  PL/pgSQL function f1() line 4 at assignment
regression=# select f1();
ERROR:  unrecognized node type: 2139062143
CONTEXT:  PL/pgSQL function f1() line 4 at assignment

It appears to me that there are two independent bugs here.

One is that exec_eval_simple_expr() assumes, when revalidating
a simple-expression plan, that it doesn't have to recheck the
"purely syntactic" tests made in exec_simple_check_plan().
I think that's mostly correct, but it's wrong at least with
respect to checking query->hasTargetSRFs: this test case shows
how to break that.  You could probably cause query->hasAggs to
change in the same way.  On the whole it seems safest to
refactor so that we recheck all of those things.

Right now, since we aren't rechecking that, we end up trying to
apply exec_save_simple_expr to a plan tree containing a ProjectSet
node, which it isn't expecting, leading to the "unexpected plan
node type" error.  And then we have the second bug: longjmp'ing
out of this leaves behind some dangling pointers, which is what
causes the "unrecognized node type: 2139062143" error (or the ASAN
complaint) during the second call.  So we need to be more careful
about leaving a clean state in case we fail out of here.

The attached seems to be enough to fix this.  I verified the
second aspect by leaving the new exec_is_simple_query(expr) test
out of exec_eval_simple_expr, in which case you still get the
"unexpected plan node type" error but things are OK on retries.

            regards, tom lane

diff --git a/src/pl/plpgsql/src/expected/plpgsql_simple.out b/src/pl/plpgsql/src/expected/plpgsql_simple.out
index e1f5d670fb..7b22e60f19 100644
--- a/src/pl/plpgsql/src/expected/plpgsql_simple.out
+++ b/src/pl/plpgsql/src/expected/plpgsql_simple.out
@@ -89,3 +89,32 @@ select simplecaller();
             4
 (1 row)

+-- Check case where called function changes from non-SRF to SRF (bug #18497)
+create or replace function simplecaller() returns int language plpgsql
+as $$
+declare x int;
+begin
+  x := simplesql();
+  return x;
+end$$;
+select simplecaller();
+ simplecaller
+--------------
+            4
+(1 row)
+
+drop function simplesql();
+create function simplesql() returns setof int language sql
+as $$select 22 + 22$$;
+select simplecaller();
+ simplecaller
+--------------
+           44
+(1 row)
+
+select simplecaller();
+ simplecaller
+--------------
+           44
+(1 row)
+
diff --git a/src/pl/plpgsql/src/pl_exec.c b/src/pl/plpgsql/src/pl_exec.c
index 6947575b94..d1958812b7 100644
--- a/src/pl/plpgsql/src/pl_exec.c
+++ b/src/pl/plpgsql/src/pl_exec.c
@@ -339,6 +339,7 @@ static void exec_eval_cleanup(PLpgSQL_execstate *estate);
 static void exec_prepare_plan(PLpgSQL_execstate *estate,
                               PLpgSQL_expr *expr, int cursorOptions);
 static void exec_simple_check_plan(PLpgSQL_execstate *estate, PLpgSQL_expr *expr);
+static bool exec_is_simple_query(PLpgSQL_expr *expr);
 static void exec_save_simple_expr(PLpgSQL_expr *expr, CachedPlan *cplan);
 static void exec_check_rw_parameter(PLpgSQL_expr *expr);
 static void exec_check_assignable(PLpgSQL_execstate *estate, int dno);
@@ -6088,18 +6089,26 @@ exec_eval_simple_expr(PLpgSQL_execstate *estate,
     {
         /* Need to replan */
         CachedPlan *cplan;
+        List       *plansources;
+        CachedPlanSource *plansource;

         /*
          * If we have a valid refcount on some previous version of the plan,
          * release it, so we don't leak plans intra-transaction.
          */
         if (expr->expr_simple_plan_lxid == curlxid)
-        {
             ReleaseCachedPlan(expr->expr_simple_plan,
                               estate->simple_eval_resowner);
-            expr->expr_simple_plan = NULL;
-            expr->expr_simple_plan_lxid = InvalidLocalTransactionId;
-        }
+
+        /*
+         * Reset to "not simple" to leave sane state (with no dangling
+         * pointers) in case we fail while replanning.
+         */
+        expr->expr_simple_expr = NULL;
+        expr->expr_rw_param = NULL;
+        expr->expr_simple_plansource = NULL;
+        expr->expr_simple_plan = NULL;
+        expr->expr_simple_plan_lxid = InvalidLocalTransactionId;

         /* Do the replanning work in the eval_mcontext */
         oldcontext = MemoryContextSwitchTo(get_eval_mcontext(estate));
@@ -6114,16 +6123,24 @@ exec_eval_simple_expr(PLpgSQL_execstate *estate,
          */
         Assert(cplan != NULL);

+        /* However, maybe the single CachedPlanSource is now different? */
+        plansources = SPI_plan_get_plan_sources(expr->plan);
+        plansource = (CachedPlanSource *) linitial(plansources);
+
         /*
-         * This test probably can't fail either, but if it does, cope by
-         * declaring the plan to be non-simple.  On success, we'll acquire a
-         * refcount on the new plan, stored in simple_eval_resowner.
+         * Recheck exec_is_simple_query, which could now report false in
+         * edge-case scenarios such as a non-SRF having been replaced with a
+         * SRF.  Also recheck CachedPlanAllowsSimpleValidityCheck, just to be
+         * sure.  If either test fails, cope by declaring the plan to be
+         * non-simple.  On success, we'll acquire a refcount on the new plan,
+         * stored in simple_eval_resowner.
          */
-        if (CachedPlanAllowsSimpleValidityCheck(expr->expr_simple_plansource,
-                                                cplan,
+        if (exec_is_simple_query(expr) &&
+            CachedPlanAllowsSimpleValidityCheck(plansource, cplan,
                                                 estate->simple_eval_resowner))
         {
             /* Remember that we have the refcount */
+            expr->expr_simple_plansource = plansource;
             expr->expr_simple_plan = cplan;
             expr->expr_simple_plan_lxid = curlxid;
         }
@@ -6131,9 +6148,6 @@ exec_eval_simple_expr(PLpgSQL_execstate *estate,
         {
             /* Release SPI_plan_get_cached_plan's refcount */
             ReleaseCachedPlan(cplan, CurrentResourceOwner);
-            /* Mark expression as non-simple, and fail */
-            expr->expr_simple_expr = NULL;
-            expr->expr_rw_param = NULL;
             return false;
         }

@@ -7974,7 +7988,6 @@ exec_simple_check_plan(PLpgSQL_execstate *estate, PLpgSQL_expr *expr)
 {
     List       *plansources;
     CachedPlanSource *plansource;
-    Query       *query;
     CachedPlan *cplan;
     MemoryContext oldcontext;

@@ -7990,31 +8003,88 @@ exec_simple_check_plan(PLpgSQL_execstate *estate, PLpgSQL_expr *expr)
      * called immediately after creating the CachedPlanSource, we need not
      * worry about the query being stale.
      */
+    if (!exec_is_simple_query(expr))
+        return;
+
+    /* exec_is_simple_query verified that there's just one CachedPlanSource */
+    plansources = SPI_plan_get_plan_sources(expr->plan);
+    plansource = (CachedPlanSource *) linitial(plansources);
+
+    /*
+     * Get the generic plan for the query.  If replanning is needed, do that
+     * work in the eval_mcontext.  (Note that replanning could throw an error,
+     * in which case the expr is left marked "not simple", which is fine.)
+     */
+    oldcontext = MemoryContextSwitchTo(get_eval_mcontext(estate));
+    cplan = SPI_plan_get_cached_plan(expr->plan);
+    MemoryContextSwitchTo(oldcontext);
+
+    /* Can't fail, because we checked for a single CachedPlanSource above */
+    Assert(cplan != NULL);
+
+    /*
+     * Verify that plancache.c thinks the plan is simple enough to use
+     * CachedPlanIsSimplyValid.  Given the restrictions above, it's unlikely
+     * that this could fail, but if it does, just treat plan as not simple. On
+     * success, save a refcount on the plan in the simple-expression resowner.
+     */
+    if (CachedPlanAllowsSimpleValidityCheck(plansource, cplan,
+                                            estate->simple_eval_resowner))
+    {
+        /* Remember that we have the refcount */
+        expr->expr_simple_plansource = plansource;
+        expr->expr_simple_plan = cplan;
+        expr->expr_simple_plan_lxid = MyProc->vxid.lxid;
+
+        /* Share the remaining work with the replan code path */
+        exec_save_simple_expr(expr, cplan);
+    }
+
+    /*
+     * Release the plan refcount obtained by SPI_plan_get_cached_plan.  (This
+     * refcount is held by the wrong resowner, so we can't just repurpose it.)
+     */
+    ReleaseCachedPlan(cplan, CurrentResourceOwner);
+}
+
+/*
+ * exec_is_simple_query - precheck a query tree to see if it might be simple
+ *
+ * Check the analyzed-and-rewritten form of a query to see if we will be
+ * able to treat it as a simple expression.  It is caller's responsibility
+ * that the CachedPlanSource be up-to-date.
+ */
+static bool
+exec_is_simple_query(PLpgSQL_expr *expr)
+{
+    List       *plansources;
+    CachedPlanSource *plansource;
+    Query       *query;

     /*
-     * We can only test queries that resulted in exactly one CachedPlanSource
+     * We can only test queries that resulted in exactly one CachedPlanSource.
      */
     plansources = SPI_plan_get_plan_sources(expr->plan);
     if (list_length(plansources) != 1)
-        return;
+        return false;
     plansource = (CachedPlanSource *) linitial(plansources);

     /*
      * 1. There must be one single querytree.
      */
     if (list_length(plansource->query_list) != 1)
-        return;
+        return false;
     query = (Query *) linitial(plansource->query_list);

     /*
-     * 2. It must be a plain SELECT query without any input tables
+     * 2. It must be a plain SELECT query without any input tables.
      */
     if (!IsA(query, Query))
-        return;
+        return false;
     if (query->commandType != CMD_SELECT)
-        return;
+        return false;
     if (query->rtable != NIL)
-        return;
+        return false;

     /*
      * 3. Can't have any subplans, aggregates, qual clauses either.  (These
@@ -8038,51 +8108,18 @@ exec_simple_check_plan(PLpgSQL_execstate *estate, PLpgSQL_expr *expr)
         query->limitOffset ||
         query->limitCount ||
         query->setOperations)
-        return;
+        return false;

     /*
-     * 4. The query must have a single attribute as result
+     * 4. The query must have a single attribute as result.
      */
     if (list_length(query->targetList) != 1)
-        return;
+        return false;

     /*
      * OK, we can treat it as a simple plan.
-     *
-     * Get the generic plan for the query.  If replanning is needed, do that
-     * work in the eval_mcontext.  (Note that replanning could throw an error,
-     * in which case the expr is left marked "not simple", which is fine.)
      */
-    oldcontext = MemoryContextSwitchTo(get_eval_mcontext(estate));
-    cplan = SPI_plan_get_cached_plan(expr->plan);
-    MemoryContextSwitchTo(oldcontext);
-
-    /* Can't fail, because we checked for a single CachedPlanSource above */
-    Assert(cplan != NULL);
-
-    /*
-     * Verify that plancache.c thinks the plan is simple enough to use
-     * CachedPlanIsSimplyValid.  Given the restrictions above, it's unlikely
-     * that this could fail, but if it does, just treat plan as not simple. On
-     * success, save a refcount on the plan in the simple-expression resowner.
-     */
-    if (CachedPlanAllowsSimpleValidityCheck(plansource, cplan,
-                                            estate->simple_eval_resowner))
-    {
-        /* Remember that we have the refcount */
-        expr->expr_simple_plansource = plansource;
-        expr->expr_simple_plan = cplan;
-        expr->expr_simple_plan_lxid = MyProc->vxid.lxid;
-
-        /* Share the remaining work with the replan code path */
-        exec_save_simple_expr(expr, cplan);
-    }
-
-    /*
-     * Release the plan refcount obtained by SPI_plan_get_cached_plan.  (This
-     * refcount is held by the wrong resowner, so we can't just repurpose it.)
-     */
-    ReleaseCachedPlan(cplan, CurrentResourceOwner);
+    return true;
 }

 /*
diff --git a/src/pl/plpgsql/src/sql/plpgsql_simple.sql b/src/pl/plpgsql/src/sql/plpgsql_simple.sql
index 57020d22d6..143bf09dce 100644
--- a/src/pl/plpgsql/src/sql/plpgsql_simple.sql
+++ b/src/pl/plpgsql/src/sql/plpgsql_simple.sql
@@ -80,3 +80,25 @@ create or replace function simplesql() returns int language sql
 as $$select 2 + 2$$;

 select simplecaller();
+
+
+-- Check case where called function changes from non-SRF to SRF (bug #18497)
+
+create or replace function simplecaller() returns int language plpgsql
+as $$
+declare x int;
+begin
+  x := simplesql();
+  return x;
+end$$;
+
+select simplecaller();
+
+drop function simplesql();
+
+create function simplesql() returns setof int language sql
+as $$select 22 + 22$$;
+
+select simplecaller();
+
+select simplecaller();