Thread: Creating foreign key on partitioned table is too slow
Hello

To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM. The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).

I did the same thing on another server with 200GB of memory, but creating the foreign key did not finish within 24 hours.

Is the community aware of this? Is anyone working on it? If there is already a discussion, please point me to the thread.

The table definition and pstack are as follows.

* table definition *

CREATE TABLE accounts (aid INTEGER, bid INTEGER, abalance INTEGER, filler CHAR(84)) PARTITION BY HASH(aid);
CREATE TABLE history (tid INTEGER, bid INTEGER, aid INTEGER, delta INTEGER, mtime TIMESTAMP, filler CHAR(22)) PARTITION BY HASH(aid);

\o /dev/null
SELECT 'CREATE TABLE accounts_' || p || ' PARTITION OF accounts FOR VALUES WITH (modulus 8192, remainder ' || p || ');' FROM generate_series(0, 8191) p; \gexec
SELECT 'CREATE TABLE history_' || p || ' PARTITION OF history FOR VALUES WITH (modulus 8192, remainder ' || p || ');' FROM generate_series(0, 8191) p; \gexec
\o

ALTER TABLE accounts ADD CONSTRAINT accounts_pk PRIMARY KEY (aid);
ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);

* pstack before killed by OOM *

#0  0x0000000000a84aec in ReleaseSysCache (tuple=0x7fbb0a15dc28) at syscache.c:1175
#1  0x0000000000a7135d in get_rel_relkind (relid=164628) at lsyscache.c:1816
#2  0x0000000000845f0a in RelationBuildPartitionDesc (rel=0x7fbadb9bfb10) at partdesc.c:230
#3  0x0000000000a78b9a in RelationBuildDesc (targetRelId=139268, insertIt=false) at relcache.c:1173
#4  0x0000000000a7b52e in RelationClearRelation (relation=0x7fbb0a1393e8, rebuild=true) at relcache.c:2534
#5  0x0000000000a7bacf in RelationFlushRelation (relation=0x7fbb0a1393e8) at relcache.c:2692
#6  0x0000000000a7bbe1 in RelationCacheInvalidateEntry (relationId=139268) at relcache.c:2744
#7  0x0000000000a6e11d in LocalExecuteInvalidationMessage (msg=0x7fbadb62e480) at inval.c:589
#8  0x0000000000a6de7d in ProcessInvalidationMessages (hdr=0x1d36d48, func=0xa6e01a <LocalExecuteInvalidationMessage>) at inval.c:460
#9  0x0000000000a6e94e in CommandEndInvalidationMessages () at inval.c:1095
#10 0x0000000000559c93 in AtCCI_LocalCache () at xact.c:1458
#11 0x00000000005596ac in CommandCounterIncrement () at xact.c:1040
#12 0x00000000006b1811 in addFkRecurseReferenced (wqueue=0x7fffcb0a0588, fkconstraint=0x20cf6a0, rel=0x7fbb0a1393e8, pkrel=0x7fbadb9bbe90, indexOid=189582, parentConstr=204810, numfks=1, pkattnum=0x7fffcb0a0190, fkattnum=0x7fffcb0a0150, pfeqoperators=0x7fffcb09ff50, ppeqoperators=0x7fffcb09fed0, ffeqoperators=0x7fffcb09fe50, old_check_ok=false) at tablecmds.c:8168
#13 0x00000000006b1a0b in addFkRecurseReferenced (wqueue=0x7fffcb0a0588, fkconstraint=0x20cf6a0, rel=0x7fbb0a1393e8, pkrel=0x7fbadc188840, indexOid=188424, parentConstr=0, numfks=1, pkattnum=0x7fffcb0a0190, fkattnum=0x7fffcb0a0150, pfeqoperators=0x7fffcb09ff50, ppeqoperators=0x7fffcb09fed0, ffeqoperators=0x7fffcb09fe50, old_check_ok=false) at tablecmds.c:8219
#14 0x00000000006b13e0 in ATAddForeignKeyConstraint (wqueue=0x7fffcb0a0588, tab=0x20cf4d8, rel=0x7fbb0a1393e8, fkconstraint=0x20cf6a0, parentConstr=0, recurse=true, recursing=false, lockmode=6) at tablecmds.c:8005
#15 0x00000000006afa0c in ATExecAddConstraint (wqueue=0x7fffcb0a0588, tab=0x20cf4d8, rel=0x7fbb0a1393e8, newConstraint=0x20cf6a0, recurse=true, is_readd=false, lockmode=6) at tablecmds.c:7419
#16 0x00000000006a8a7a in ATExecCmd (wqueue=0x7fffcb0a0588, tab=0x20cf4d8, rel=0x7fbb0a1393e8, cmd=0x20cf648, lockmode=6) at tablecmds.c:4300
#17 0x00000000006a8448 in ATRewriteCatalogs (wqueue=0x7fffcb0a0588, lockmode=6) at tablecmds.c:4185
#18 0x00000000006a7bf9 in ATController (parsetree=0x1cb4350, rel=0x7fbb0a1393e8, cmds=0x20cf428, recurse=true, lockmode=6) at tablecmds.c:3843
#19 0x00000000006a78a4 in AlterTable (relid=139268, lockmode=6, stmt=0x1cb4350) at tablecmds.c:3504
#20 0x0000000000914999 in ProcessUtilitySlow (pstate=0x1cb3a10, pstmt=0x1c91380, queryString=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at utility.c:1131
#21 0x0000000000914490 in standard_ProcessUtility (pstmt=0x1c91380, queryString=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at utility.c:927
#22 0x0000000000913534 in ProcessUtility (pstmt=0x1c91380, queryString=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at utility.c:360
#23 0x000000000091245a in PortalRunUtility (portal=0x1cf5ee0, pstmt=0x1c91380, isTopLevel=true, setHoldSnapshot=false, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at pquery.c:1175
#24 0x0000000000912671 in PortalRunMulti (portal=0x1cf5ee0, isTopLevel=true, setHoldSnapshot=false, dest=0x1c91470, altdest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at pquery.c:1321
#25 0x0000000000911ba6 in PortalRun (portal=0x1cf5ee0, count=9223372036854775807, isTopLevel=true, run_once=true, dest=0x1c91470, altdest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at pquery.c:796
#26 0x000000000090b9ad in exec_simple_query (query_string=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);") at postgres.c:1231
#27 0x000000000090fd13 in PostgresMain (argc=1, argv=0x1cb9fb8, dbname=0x1cb9ed0 "postgres", username=0x1cb9eb0 "k5user") at postgres.c:4256
#28 0x0000000000864cbd in BackendRun (port=0x1cb1e90) at postmaster.c:4498
#29 0x000000000086449b in BackendStartup (port=0x1cb1e90) at postmaster.c:4189
#30 0x00000000008608d7 in ServerLoop () at postmaster.c:1727
#31 0x000000000086018d in PostmasterMain (argc=1, argv=0x1c8aa40) at postmaster.c:1400
#32 0x0000000000770835 in main (argc=1, argv=0x1c8aa40) at main.c:210

regards,
Kato Sho
On 2019-Oct-23, kato-sho@fujitsu.com wrote:

> Hello
>
> To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
>
> I did the same thing on another server with 200GB of memory, but creating the foreign key did not finish within 24 hours.

Thanks for reporting. It sounds like there must be a memory leak here. I am fairly pressed for time at present so I won't be able to investigate this until, at least, mid November.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On Thu, Oct 24, 2019 at 03:48:57PM -0300, Alvaro Herrera wrote:
> On 2019-Oct-23, kato-sho@fujitsu.com wrote:
>
> > Hello
> >
> > To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> > The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
> >
> > I did the same thing on another server with 200GB of memory, but creating the foreign key did not finish within 24 hours.
>
> Thanks for reporting. It sounds like there must be a memory leak here.
> I am fairly pressed for time at present so I won't be able to
> investigate this until, at least, mid November.

I've briefly looked into this, and I think the main memory leak is in RelationBuildPartitionDesc. It gets called with PortalContext, it allocates a lot of memory building the descriptor, copies it into CacheMemoryContext, but does not even try to free anything. So we end up with something like this:

TopMemoryContext: 215344 total in 11 blocks; 47720 free (12 chunks); 167624 used
  pgstat TabStatusArray lookup hash table: 32768 total in 3 blocks; 9160 free (4 chunks); 23608 used
  TopTransactionContext: 4194304 total in 10 blocks; 1992968 free (18 chunks); 2201336 used
  RowDescriptionContext: 8192 total in 1 blocks; 6880 free (0 chunks); 1312 used
  MessageContext: 8192 total in 1 blocks; 3256 free (1 chunks); 4936 used
  Operator class cache: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  smgr relation table: 32768 total in 3 blocks; 16768 free (8 chunks); 16000 used
  TransactionAbortContext: 32768 total in 1 blocks; 32504 free (0 chunks); 264 used
  Portal hash: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  TopPortalContext: 8192 total in 1 blocks; 7648 free (0 chunks); 544 used
    PortalContext: 1557985728 total in 177490 blocks; 9038656 free (167645 chunks); 1548947072 used:
  Relcache by OID: 16384 total in 2 blocks; 3424 free (3 chunks); 12960 used
  CacheMemoryContext: 17039424 total in 13 blocks; 7181480 free (9 chunks); 9857944 used
    partition key: 1024 total in 1 blocks; 168 free (0 chunks); 856 used: history
    index info: 2048 total in 2 blocks; 568 free (1 chunks); 1480 used: pg_class_tblspc_relfilenode_index
    ...
    index info: 2048 total in 2 blocks; 872 free (0 chunks); 1176 used: pg_class_oid_index
  WAL record construction: 49776 total in 2 blocks; 6344 free (0 chunks); 43432 used
  PrivateRefCount: 8192 total in 1 blocks; 2584 free (0 chunks); 5608 used
  MdSmgr: 8192 total in 1 blocks; 5976 free (0 chunks); 2216 used
  LOCALLOCK hash: 65536 total in 4 blocks; 18584 free (12 chunks); 46952 used
  Timezones: 104128 total in 2 blocks; 2584 free (0 chunks); 101544 used
  ErrorContext: 8192 total in 1 blocks; 6840 free (4 chunks); 1352 used
Grand total: 1580997216 bytes in 177834 blocks; 18482808 free (167857 chunks); 1562514408 used

(At which point I simply interrupted the query; it'd allocate more and more memory until an OOM.)

The attached patch trivially fixes that by adding a memory context tracking all the temporary data, and then just deleting it as a whole at the end of the function. This significantly reduces the memory usage for me; I'm not sure it's 100% correct.

FWIW, even with this fix it still takes an awfully long time to create the foreign key, because the CPU is stuck doing this:

  60.78%  60.78%  postgres  postgres   [.] bms_equal
  32.58%  32.58%  postgres  postgres   [.] get_eclass_for_sort_expr
   3.83%   3.83%  postgres  postgres   [.] add_child_rel_equivalences
   0.23%   0.00%  postgres  [unknown]  [.] 0x0000000000000005
   0.22%   0.00%  postgres  [unknown]  [.] 0000000000000000
   0.18%   0.18%  postgres  postgres   [.] AllocSetCheck
   ...

Haven't looked into the details yet.

regards

--
Tomas Vondra                  http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachment
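The shape of the fix Tomas describes is PostgreSQL's usual temporary-context pattern. A minimal sketch, assuming only the finished descriptor must outlive the call; the function body, context name, and placement are illustrative, not the actual patch:

    #include "postgres.h"
    #include "utils/memutils.h"

    /*
     * Sketch only: do all scratch work in a context that can be dropped
     * wholesale, copying just the finished result into CacheMemoryContext.
     */
    static void
    build_partdesc_sketch(void)
    {
        MemoryContext tmpcxt;
        MemoryContext oldcxt;

        tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
                                       "partition descriptor temp context",
                                       ALLOCSET_DEFAULT_SIZES);
        oldcxt = MemoryContextSwitchTo(tmpcxt);

        /* ... build bound specs, sort them, etc.; leak freely here ... */

        MemoryContextSwitchTo(CacheMemoryContext);
        /* ... copy the finished PartitionDesc into the relcache entry ... */

        MemoryContextSwitchTo(oldcxt);
        MemoryContextDelete(tmpcxt);    /* frees every temporary allocation */
    }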
On Fri, Oct 25, 2019 at 12:17:58AM +0200, Tomas Vondra wrote:
>
> ...
>
> FWIW, even with this fix it still takes an awfully long time to create
> the foreign key, because the CPU is stuck doing this:
>
>   60.78%  60.78%  postgres  postgres   [.] bms_equal
>   32.58%  32.58%  postgres  postgres   [.] get_eclass_for_sort_expr
>    3.83%   3.83%  postgres  postgres   [.] add_child_rel_equivalences
>    0.23%   0.00%  postgres  [unknown]  [.] 0x0000000000000005
>    0.22%   0.00%  postgres  [unknown]  [.] 0000000000000000
>    0.18%   0.18%  postgres  postgres   [.] AllocSetCheck
>    ...
>
> Haven't looked into the details yet.

OK, a bit more info. A better perf report (with frame pointers etc.) looks like this:

+   99.98%  0.00%  postgres  [unknown]     [.] 0x495641002c4d133d
+   99.98%  0.00%  postgres  libc-2.29.so  [.] __libc_start_main
+   99.98%  0.00%  postgres  postgres      [.] startup_hacks
+   99.98%  0.00%  postgres  postgres      [.] PostmasterMain
+   99.98%  0.00%  postgres  postgres      [.] ServerLoop
+   99.98%  0.00%  postgres  postgres      [.] BackendStartup
+   99.98%  0.00%  postgres  postgres      [.] ExitPostmaster
+   99.98%  0.00%  postgres  postgres      [.] PostgresMain
+   99.98%  0.00%  postgres  postgres      [.] exec_simple_query
+   99.98%  0.00%  postgres  postgres      [.] PortalRun
+   99.98%  0.00%  postgres  postgres      [.] PortalRunMulti
+   99.98%  0.00%  postgres  postgres      [.] PortalRunUtility
+   99.98%  0.00%  postgres  postgres      [.] ProcessUtility
+   99.98%  0.00%  postgres  postgres      [.] standard_ProcessUtility
+   99.98%  0.00%  postgres  postgres      [.] ProcessUtilitySlow
+   99.98%  0.00%  postgres  postgres      [.] AlterTable
+   99.98%  0.00%  postgres  postgres      [.] ATController
+   99.98%  0.00%  postgres  postgres      [.] ATRewriteTables
+   99.98%  0.00%  postgres  postgres      [.] validateForeignKeyConstraint
+   99.98%  0.00%  postgres  postgres      [.] RI_Initial_Check
+   99.96%  0.00%  postgres  postgres      [.] SPI_execute_snapshot
+   99.86%  0.00%  postgres  postgres      [.] _SPI_execute_plan
+   99.70%  0.00%  postgres  postgres      [.] GetCachedPlan
+   99.70%  0.00%  postgres  postgres      [.] BuildCachedPlan
+   99.66%  0.00%  postgres  postgres      [.] pg_plan_queries
+   99.66%  0.00%  postgres  postgres      [.] pg_plan_query
+   99.66%  0.00%  postgres  postgres      [.] planner
+   99.66%  0.00%  postgres  postgres      [.] standard_planner
+   99.62%  0.00%  postgres  postgres      [.] subquery_planner
+   99.62%  0.00%  postgres  postgres      [.] grouping_planner
+   99.62%  0.00%  postgres  postgres      [.] query_planner
+   99.31%  0.00%  postgres  postgres      [.] make_one_rel
+   97.53%  0.00%  postgres  postgres      [.] set_base_rel_pathlists
+   97.53%  0.00%  postgres  postgres      [.] set_rel_pathlist
+   97.53%  0.01%  postgres  postgres      [.] set_append_rel_pathlist
+   97.42%  0.00%  postgres  postgres      [.] set_plain_rel_pathlist
+   97.40%  0.02%  postgres  postgres      [.] create_index_paths
+   97.16%  0.01%  postgres  postgres      [.] get_index_paths
+   97.12%  0.02%  postgres  postgres      [.] build_index_paths
+   96.67%  0.01%  postgres  postgres      [.] build_index_pathkeys
+   96.61%  0.01%  postgres  postgres      [.] make_pathkey_from_sortinfo
+   95.70% 21.27%  postgres  postgres      [.] get_eclass_for_sort_expr
+   75.21% 75.21%  postgres  postgres      [.] bms_equal
+   48.72%  0.00%  postgres  postgres      [.] consider_index_join_clauses
+   48.72%  0.00%  postgres  postgres      [.] consider_index_join_outer_rels
+   48.72%  0.02%  postgres  postgres      [.] get_join_index_paths
+    1.78%  0.00%  postgres  postgres      [.] set_base_rel_sizes
+    1.78%  0.00%  postgres  postgres      [.] set_rel_size
+    1.78%  0.01%  postgres  postgres      [.] set_append_rel_size
+    1.66%  1.34%  postgres  postgres      [.] add_child_rel_equivalences

It is (pretty much) a single call stack, i.e. each function is simply calling the one below it (with some minor exceptions at the end, but that's pretty negligible here). This essentially says that planning the queries executed by RI_Initial_Check with many partitions is damn expensive. An example query is this one:

test=# \timing
test=# SELECT fk."aid" FROM ONLY "public"."history_60" fk LEFT OUTER JOIN "public"."accounts" pk ON (pk."aid" OPERATOR(pg_catalog.=) fk."aid") WHERE pk."aid" IS NULL AND (fk."aid" IS NOT NULL);
 aid
-----
(0 rows)

Time: 28791.492 ms (00:28.791)

Bear in mind those are *empty* tables, so the execution is pretty cheap (EXPLAIN ANALYZE says the execution takes ~65ms, but the planning itself takes ~28 seconds). And we have 8192 such partitions, which means we'd spend ~230k seconds just planning the RI queries. That's 64 hours.

regards

--
Tomas Vondra                  http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Hi,

On 2019-10-23 05:59:01 +0000, kato-sho@fujitsu.com wrote:
> To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).

Obviously this should be improved. But I think it's also worthwhile to note that using 8k partitions is very unlikely to be a good choice for anything. The metadata, partition pruning, etc. overhead is just going to be very substantial.

Greetings,

Andres Freund
On Thu, Oct 24, 2019 at 04:28:38PM -0700, Andres Freund wrote:
> Hi,
>
> On 2019-10-23 05:59:01 +0000, kato-sho@fujitsu.com wrote:
> > To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> > The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
>
> Obviously this should be improved. But I think it's also worthwhile to
> note that using 8k partitions is very unlikely to be a good choice for
> anything. The metadata, partition pruning, etc. overhead is just going to
> be very substantial.

True. Especially with two partitioned tables, each with 8k partitions.

I do think it makes sense to reduce the memory usage, because just eating all available memory (in the extreme case) is not very nice. I've added that patch to the CF, although the patch I shared is very crude and I'm by no means suggesting it's how it should be done ultimately.

The other bit (speed of planning with 8k partitions) is probably a more general issue, and I suppose we'll improve that over time. I don't think there's a simple change that magically improves that.

regards

--
Tomas Vondra                  http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On Thu, 31 Oct 2019 at 07:30, Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:
>
> On Thu, Oct 24, 2019 at 04:28:38PM -0700, Andres Freund wrote:
> > On 2019-10-23 05:59:01 +0000, kato-sho@fujitsu.com wrote:
> > > To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> > > The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
> >
> > Obviously this should be improved. But I think it's also worthwhile to
> > note that using 8k partitions is very unlikely to be a good choice for
> > anything. The metadata, partition pruning, etc. overhead is just going to
> > be very substantial.
>
> True. Especially with two partitioned tables, each with 8k partitions.

In Ottawa this year, Andres and I briefly talked about the possibility of making a series of changes to how equalfuncs.c works. The idea was to make it easy, using some pre-processor magic, to create another version of equalfuncs which would let us have an equal comparison function that returns -1 / 0 / +1 rather than just true or false. This would allow us to build Binary Search Trees of objects. I identified that EquivalenceClass.ec_members would be better written as a BST to allow much faster lookups in get_eclass_for_sort_expr().

The implementation I had in mind for the BST was a compact tree that, instead of using pointers for the left and right children, just uses an integer to reference the array element number. This would allow us to maintain very fast insert-order traversals. Deletes would need to decrement all child references greater than the deleted index. This is sort of on par with how the new List implementation in master works: deletes take additional effort, but inserts are fast if there's enough space in the array for a new element, traversals are cache-friendly, etc. A sketch of this layout follows below.

I think trees might be better than hash tables for this, as a hash function needs to hash all fields, whereas a comparison function can stop when it finds the first non-match.

This may also be able to help simplify the code in setrefs.c to get rid of the complex code around indexed tlists. tlist_member() would become O(log n) instead of O(n), so perhaps there'd be not much point in having both search_indexed_tlist_for_var() and search_indexed_tlist_for_non_var().

--
David Rowley                  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
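One way the compact tree David describes could be laid out, with children referenced by array index so that the backing array itself preserves insertion order; the type and function names here are hypothetical:

    #include "postgres.h"
    #include "nodes/nodes.h"

    /* Hypothetical node layout: children are array indexes, not pointers. */
    typedef struct CompactBSTNode
    {
        Node   *value;          /* the stored object */
        int     left;           /* index of left child, or -1 */
        int     right;          /* index of right child, or -1 */
    } CompactBSTNode;

    typedef struct CompactBST
    {
        CompactBSTNode *nodes;  /* nodes[0] is the root */
        int             nnodes; /* insert-order traversal is a plain walk
                                 * of nodes[0 .. nnodes-1] */
    } CompactBST;

    /*
     * Lookup with a three-way comparator (-1/0/+1).  Unlike a hash, the
     * comparison can stop at the first field that differs.
     */
    static Node *
    compact_bst_find(CompactBST *tree, Node *key,
                     int (*cmp) (const Node *, const Node *))
    {
        int     i = (tree->nnodes > 0) ? 0 : -1;

        while (i >= 0)
        {
            int     c = cmp(key, tree->nodes[i].value);

            if (c == 0)
                return tree->nodes[i].value;
            i = (c < 0) ? tree->nodes[i].left : tree->nodes[i].right;
        }
        return NULL;
    }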
Hi,

On 2019-10-31 11:19:05 +1300, David Rowley wrote:
> In Ottawa this year, Andres and I briefly talked about the possibility
> of making a series of changes to how equalfuncs.c works. The idea was
> to make it easy, using some pre-processor magic, to create another
> version of equalfuncs which would let us have an equal comparison
> function that returns -1 / 0 / +1 rather than just true or false.

See also the thread at https://www.postgresql.org/message-id/20190920051857.2fhnvhvx4qdddviz%40alap3.anarazel.de which would make this fairly easy, without having to compile equalfuncs twice or such.

Greetings,

Andres Freund
David Rowley <david.rowley@2ndquadrant.com> writes:
> In Ottawa this year, Andres and I briefly talked about the possibility
> of making a series of changes to how equalfuncs.c works. The idea was
> to make it easy, using some pre-processor magic, to create another
> version of equalfuncs which would let us have an equal comparison
> function that returns -1 / 0 / +1 rather than just true or false. This
> would allow us to build Binary Search Trees of objects. I identified
> that EquivalenceClass.ec_members would be better written as a BST to
> allow much faster lookups in get_eclass_for_sort_expr().

This seems like a good idea, but why would we want to maintain two versions? Just change equalfuncs.c into comparefuncs.c, full stop. equal() would be a trivial wrapper for (compare_objects(a,b) == 0).

Andres' ideas about autogenerating all that boilerplate aren't bad, but that's no justification for carrying two full sets of per-node logic when one set would do.

regards, tom lane
On Thu, 31 Oct 2019 at 17:56, Tom Lane <tgl@sss.pgh.pa.us> wrote:
>
> David Rowley <david.rowley@2ndquadrant.com> writes:
> > In Ottawa this year, Andres and I briefly talked about the possibility
> > of making a series of changes to how equalfuncs.c works. The idea was
> > to make it easy, using some pre-processor magic, to create another
> > version of equalfuncs which would let us have an equal comparison
> > function that returns -1 / 0 / +1 rather than just true or false. This
> > would allow us to build Binary Search Trees of objects. I identified
> > that EquivalenceClass.ec_members would be better written as a BST to
> > allow much faster lookups in get_eclass_for_sort_expr().
>
> This seems like a good idea, but why would we want to maintain two
> versions? Just change equalfuncs.c into comparefuncs.c, full stop.
> equal() would be a trivial wrapper for (compare_objects(a,b) == 0).

If we can do that without slowing down the comparison, then sure. Checking which node sorts earlier is a bit more expensive than just checking for equality. But if that's not going to be noticeable in real-world test cases, then I agree.

--
David Rowley                  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
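A sketch of the trade-off being discussed, on a made-up struct: the three-way comparator still bails out at the first mismatching field, so the extra work relative to a pure equality check is only deciding the direction of the mismatch, and equal() becomes the trivial wrapper Tom suggests:

    #include <stdbool.h>

    typedef struct ExampleNode
    {
        int     tag;
        long    x;
    } ExampleNode;

    /* Three-way comparison: returns -1/0/+1, stopping at the first
     * field that differs. */
    static int
    compare_example(const ExampleNode *a, const ExampleNode *b)
    {
        if (a->tag != b->tag)
            return (a->tag < b->tag) ? -1 : 1;
        if (a->x != b->x)
            return (a->x < b->x) ? -1 : 1;
        return 0;
    }

    /* equal() as a trivial wrapper around the comparator. */
    static inline bool
    equal_example(const ExampleNode *a, const ExampleNode *b)
    {
        return compare_example(a, b) == 0;
    }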
Hello,

On Fri, Oct 25, 2019 at 7:18 AM Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:
> On Thu, Oct 24, 2019 at 03:48:57PM -0300, Alvaro Herrera wrote:
> > On 2019-Oct-23, kato-sho@fujitsu.com wrote:
> >
> >> Hello
> >>
> >> To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> >> The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
> >>
> >> I did the same thing on another server with 200GB of memory, but creating the foreign key did not finish within 24 hours.
> >
> > Thanks for reporting.

Thank you Kato-san.

> > It sounds like there must be a memory leak here.
> > I am fairly pressed for time at present so I won't be able to
> > investigate this until, at least, mid November.
>
> I've briefly looked into this, and I think the main memory leak is in
> RelationBuildPartitionDesc. It gets called with PortalContext, it
> allocates a lot of memory building the descriptor, copies it into
> CacheMemoryContext, but does not even try to free anything. So we end up
> with something like this:
...
> The attached patch trivially fixes that by adding a memory context
> tracking all the temporary data, and then just deleting it as a whole at
> the end of the function. This significantly reduces the memory usage for
> me; I'm not sure it's 100% correct.

Thank you Tomas. I think we have considered this temporary context fix a number of times before, but it got stalled for one reason or another ([1] comes to mind as the last thread where this came up).

Another angle to look at this is that our design, where PartitionDesc is rebuilt on relcache reload of the parent relation, is not a great one after all. It seems that we're rightly (?) invalidating the parent's relcache 8192 times in this case, because its cacheable foreign key descriptor changes on processing each partition, but PartitionDesc itself doesn't change. Having to pointlessly rebuild it 8192 times seems really wasteful.

I recall a discussion where it was proposed to build PartitionDesc only when needed, as opposed to on every relcache reload of the parent relation. The attached PoC-at-best patch that does that seems to go through without OOM and passes make check-world. I think this should have a very minor impact on SELECT queries. But...

> FWIW, even with this fix it still takes an awfully long time to create
> the foreign key, because the CPU is stuck doing this:
>
>   60.78%  60.78%  postgres  postgres   [.] bms_equal
>   32.58%  32.58%  postgres  postgres   [.] get_eclass_for_sort_expr
>    3.83%   3.83%  postgres  postgres   [.] add_child_rel_equivalences
>    0.23%   0.00%  postgres  [unknown]  [.] 0x0000000000000005
>    0.22%   0.00%  postgres  [unknown]  [.] 0000000000000000
>    0.18%   0.18%  postgres  postgres   [.] AllocSetCheck

...we have many problems to solve here. :-(

Thanks,
Amit

[1] https://www.postgresql.org/message-id/CA%2BTgmoY3bRmGB6-DUnoVy5fJoreiBJ43rwMrQRCdPXuKt4Ykaw%40mail.gmail.com
Attachment
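The on-demand approach in Amit's PoC amounts to rebuilding the descriptor at first use, instead of inside the invalidation path. Roughly this shape, simplified for illustration (as noted downthread, commit 5b9312378 later adopted essentially this for v13):

    #include "postgres.h"
    #include "partitioning/partdesc.h"
    #include "utils/rel.h"

    /*
     * Simplified sketch: a relcache flush just clears rd_partdesc; the
     * expensive rebuild happens lazily, only when a caller asks for it.
     */
    PartitionDesc
    RelationGetPartitionDesc(Relation rel)
    {
        if (rel->rd_partdesc == NULL)
            RelationBuildPartitionDesc(rel);
        return rel->rd_partdesc;
    }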
On 2019-Oct-25, Tomas Vondra wrote:

> The attached patch trivially fixes that by adding a memory context
> tracking all the temporary data, and then just deleting it as a whole at
> the end of the function. This significantly reduces the memory usage for
> me; I'm not sure it's 100% correct.

FWIW we already had this code (added by commit 2455ab48844c), but it was removed by commit d3f48dfae42f. I think we should put it back.

(I think it may be useful to use a static MemoryContext that we can just reset each time, instead of creating and deleting one each time, to save on memcxt churn. That'd make the function non-reentrant, but I don't see that we'd make the catalogs partitioned any time soon. This may be premature optimization though -- not really wedded to it.)

With Amit's patch to make RelationBuildPartitionDesc be called lazily, the time to plan the RI_Initial_Check query (using Kato Sho's test case) goes from 30 seconds to 14 seconds on my laptop. Obviously there's more that we'd need to fix to make the scenario fully supported, but it seems a decent step forward.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachment
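The static-context variant Alvaro floats would look roughly like this (hypothetical names): MemoryContextReset() empties the context in place, avoiding the create/delete churn of one context per call, at the cost of making the function non-reentrant, as he notes.

    #include "postgres.h"
    #include "utils/memutils.h"

    /* Process-lifetime scratch context, reused across calls (sketch only). */
    static MemoryContext PartDescScratchCxt = NULL;

    static MemoryContext
    get_partdesc_scratch_cxt(void)
    {
        if (PartDescScratchCxt == NULL)
            PartDescScratchCxt = AllocSetContextCreate(TopMemoryContext,
                                                       "partdesc scratch",
                                                       ALLOCSET_DEFAULT_SIZES);
        else
            MemoryContextReset(PartDescScratchCxt); /* drop last call's data */
        return PartDescScratchCxt;
    }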
Alvaro Herrera <alvherre@2ndquadrant.com> writes:
> On 2019-Oct-25, Tomas Vondra wrote:
>> The attached patch trivially fixes that by adding a memory context
>> tracking all the temporary data, and then just deleting it as a whole at
>> the end of the function. This significantly reduces the memory usage for
>> me; I'm not sure it's 100% correct.

> FWIW we already had this code (added by commit 2455ab48844c), but it was
> removed by commit d3f48dfae42f. I think we should put it back.

I disagree. The point of d3f48dfae42f is that the management of that leakage is now being done at the caller level, and I'm quite firmly against having RelationBuildPartitionDesc duplicate that. If we don't like the amount of space RelationBuildPartitionDesc is leaking, we aren't going to like the amount of space that sibling routines such as RelationBuildTriggers leak, either.

What we ought to be thinking about instead is adjusting the RECOVER_RELATION_BUILD_MEMORY heuristic in relcache.c. I am not sure what it ought to look like, but I doubt that "do it all the time" has suddenly become the right answer, when it wasn't the right answer for 20-something years. It's conceivable that "do it if CCA is on, or if the current table is a partition child table" is a reasonable approach. But I'm not sure whether we can know the relation's relkind early enough for that :-(

(BTW, a different question one could ask is exactly why RelationBuildPartitionDesc is so profligate of leaked memory.)

regards, tom lane
On 2019-Nov-13, Tom Lane wrote:

> (BTW, a different question one could ask is exactly why
> RelationBuildPartitionDesc is so profligate of leaked memory.)

The original partitioning code (f0e44751d717) decided that it didn't want to bother with adding a "free" routine for PartitionBoundInfo structs, maybe because they hold too many pointers, so there's no way for RelationBuildPartitionDesc to free everything it allocates anyway. We could add a couple of pfrees and list_frees here and there, but for the main thing being leaked we'd need to improve that API.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On 2019-Nov-13, Alvaro Herrera wrote:

> On 2019-Nov-13, Tom Lane wrote:
>
> > (BTW, a different question one could ask is exactly why
> > RelationBuildPartitionDesc is so profligate of leaked memory.)
>
> The original partitioning code (f0e44751d717) decided that it didn't
> want to bother with adding a "free" routine for PartitionBoundInfo
> structs, maybe because they hold too many pointers, so there's no way for
> RelationBuildPartitionDesc to free everything it allocates anyway. We
> could add a couple of pfrees and list_frees here and there, but for the
> main thing being leaked we'd need to improve that API.

Ah, we also leak an array of PartitionBoundSpec, which is a Node. Do we have any way to free those? I don't think we do. In short, it looks to me as if this function was explicitly designed with the idea that it'd be called in a temp mem context.

I looked at d3f48dfae42f again per your earlier suggestion. Doing that memory context dance for partitioned relations does seem to fix the problem too; we just need to move the context creation to just after ScanPgRelation, at which point we have the relkind. (Note: I think the problematic case is the partitioned table, not the partitions themselves. At least, with the attached patch the problem goes away. I guess it would be sensible to research whether we need to do this for relispartition=true as well, but I haven't done that.)

There is indeed some leakage for relations that have triggers too (or rules), but in order for those to become significant you would have to have thousands of triggers or rules ... and in reasonable designs, you just don't, because it doesn't make sense. But it is not totally unreasonable to have lots of partitions, and as we improve the system, more and more people will want to.

Aside: while messing with this I noticed how significant pg_strtok is as a resource hog when building partition descs (from the stringToNode that's applied to each partition's partbound).

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachment
On Wed, Nov 13, 2019 at 4:46 PM Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
> But it is not totally
> unreasonable to have lots of partitions, and as we improve the system,
> more and more people will want to.

Yep.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 11/13/19 4:45 PM, Alvaro Herrera wrote:
> But it is not totally
> unreasonable to have lots of partitions, and as we improve the system,
> more and more people will want to.

+1

This patch still applies, but there seems to be some disagreement on how to proceed. Any thoughts?

Regards,
--
-David
david@pgmasters.net
On 2020-Mar-24, David Steele wrote:

> This patch still applies but there seems to be some disagreement on
> how to proceed.

Actually, I don't think there's any disagreement regarding the patch I last posted. (There was disagreement on the previous patches, which were very different.) Tom suggested to look at the heuristics used for RECOVER_RELATION_BUILD_MEMORY, and the patch does exactly that.

It would be great if Kato Sho can try the original test case with my latest patch (the one in https://postgr.es/m/20191113214544.GA16060@alvherre.pgsql ) and let us know if it improves things.

The patch as posted generates these warnings in my current GCC that it didn't when I checked last, but they're harmless -- if/when I push, it'll be without the parens.

/pgsql/source/master/src/backend/utils/cache/relcache.c:1064:21: warning: equality comparison with extraneous parentheses [-Wparentheses-equality]
        if ((relp->relkind == RELKIND_PARTITIONED_TABLE)
             ~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1064:21: note: remove extraneous parentheses around the comparison to silence this warning
        if ((relp->relkind == RELKIND_PARTITIONED_TABLE)
            ~              ^                           ~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1064:21: note: use '=' to turn this equality comparison into an assignment
        if ((relp->relkind == RELKIND_PARTITIONED_TABLE)
                           ^~
                           =
/pgsql/source/master/src/backend/utils/cache/relcache.c:1242:33: warning: equality comparison with extraneous parentheses [-Wparentheses-equality]
        if ((relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
             ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1242:33: note: remove extraneous parentheses around the comparison to silence this warning
        if ((relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
            ~                          ^                            ~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1242:33: note: use '=' to turn this equality comparison into an assignment
        if ((relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
                                       ^~
                                       =
2 warnings generated.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
> On 24 Mar 2020, at 16:26, Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
> It would be great if Kato Sho can try the original test case with my latest
> patch (the one in https://postgr.es/m/20191113214544.GA16060@alvherre.pgsql )
> and let us know if it improves things.

Hi! Are you able to test Alvaro's latest patch to see if that solves the originally reported problem, so that we can reach closure on this item during the commitfest?

cheers ./daniel
On Tuesday, July 14, 2020 11:29 PM, Daniel Gustafsson wrote:
> Are you able to test Alvaro's latest patch to see if that solves the originally reported problem, so that we can reach
> closure on this item during the commitfest?

Sorry for the very late reply; I missed this mail. And thanks for writing the patches.

I am starting the test now. I'll report the result before the end of August.

Regards,
Sho Kato
On Wednesday, August 5, 2020 9:43 AM I wrote:
> I'll report the result before the end of August.

I tested v2-0001-build-partdesc-memcxt.patch at 9a9db08ae4 and it is OK.

First, I executed ALTER TABLE ADD CONSTRAINT FOREIGN KEY on the table which has 8k partitions. This query execution completed in about 22 hours without OOM.

Second, I confirmed the reduction in memory context usage. Since running with 8k partitions takes too long, I confirmed with 1k partitions. I used gdb and called MemoryContextStats(TopMemoryContext) at addFkRecurseReferencing(). The CacheMemoryContext size becomes smaller, so I think it is working as expected. The results are as follows.

- before applying patch

TopMemoryContext: 418896 total in 18 blocks; 91488 free (13 chunks); 327408 used
  pgstat TabStatusArray lookup hash table: 65536 total in 4 blocks; 16808 free (7 chunks); 48728 used
  TopTransactionContext: 4194304 total in 10 blocks; 1045728 free (18 chunks); 3148576 used
  TableSpace cache: 8192 total in 1 blocks; 2048 free (0 chunks); 6144 used
  Type information cache: 24624 total in 2 blocks; 2584 free (0 chunks); 22040 used
  Operator lookup cache: 24576 total in 2 blocks; 10712 free (4 chunks); 13864 used
  RowDescriptionContext: 8192 total in 1 blocks; 6880 free (0 chunks); 1312 used
  MessageContext: 8192 total in 1 blocks; 3064 free (0 chunks); 5128 used
  Operator class cache: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  smgr relation table: 32768 total in 3 blocks; 16768 free (8 chunks); 16000 used
  TransactionAbortContext: 32768 total in 1 blocks; 32504 free (0 chunks); 264 used
  Portal hash: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  TopPortalContext: 8192 total in 1 blocks; 7648 free (0 chunks); 544 used
    PortalContext: 9621216 total in 1179 blocks; 13496 free (13 chunks); 9607720 used:
  Relcache by OID: 16384 total in 2 blocks; 3424 free (3 chunks); 12960 used
  CacheMemoryContext: 4243584 total in 12 blocks; 1349808 free (12 chunks); 2893776 used
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_trigger_tgconstraint_index
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_trigger_oid_index
    index info: 2048 total in 2 blocks; 352 free (1 chunks); 1696 used: pg_inherits_relid_seqno_index
    partition descriptor: 65344 total in 12 blocks; 7336 free (4 chunks); 58008 used: accounts
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_inherits_parent_index
    partition key: 1024 total in 1 blocks; 160 free (0 chunks); 864 used: accounts
    ...
    index info: 2048 total in 2 blocks; 736 free (2 chunks); 1312 used: pg_database_oid_index
    index info: 2048 total in 2 blocks; 736 free (2 chunks); 1312 used: pg_authid_rolname_index
  WAL record construction: 49776 total in 2 blocks; 6344 free (0 chunks); 43432 used
  PrivateRefCount: 8192 total in 1 blocks; 2584 free (0 chunks); 5608 used
  MdSmgr: 8192 total in 1 blocks; 5528 free (0 chunks); 2664 used
  LOCALLOCK hash: 131072 total in 5 blocks; 26376 free (15 chunks); 104696 used
  Timezones: 104128 total in 2 blocks; 2584 free (0 chunks); 101544 used
  ErrorContext: 8192 total in 1 blocks; 7928 free (3 chunks); 264 used
Grand total: 19322960 bytes in 1452 blocks; 2743560 free (186 chunks); 16579400 used

- after applying patch

TopMemoryContext: 418896 total in 18 blocks; 91488 free (13 chunks); 327408 used
  pgstat TabStatusArray lookup hash table: 65536 total in 4 blocks; 16808 free (7 chunks); 48728 used
  TopTransactionContext: 4194304 total in 10 blocks; 1045728 free (18 chunks); 3148576 used
  RowDescriptionContext: 8192 total in 1 blocks; 6880 free (0 chunks); 1312 used
  MessageContext: 8192 total in 1 blocks; 3064 free (0 chunks); 5128 used
  Operator class cache: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  smgr relation table: 32768 total in 3 blocks; 16768 free (8 chunks); 16000 used
  TransactionAbortContext: 32768 total in 1 blocks; 32504 free (0 chunks); 264 used
  Portal hash: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  TopPortalContext: 8192 total in 1 blocks; 7648 free (0 chunks); 544 used
    PortalContext: 9621216 total in 1179 blocks; 13496 free (13 chunks); 9607720 used:
  Relcache by OID: 16384 total in 2 blocks; 3424 free (3 chunks); 12960 used
  CacheMemoryContext: 2113600 total in 10 blocks; 556240 free (10 chunks); 1557360 used
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_trigger_tgconstraint_index
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_trigger_oid_index
    index info: 2048 total in 2 blocks; 352 free (1 chunks); 1696 used: pg_inherits_relid_seqno_index
    partition descriptor: 65344 total in 12 blocks; 7336 free (4 chunks); 58008 used: accounts
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_inherits_parent_index
    partition key: 1024 total in 1 blocks; 160 free (0 chunks); 864 used: accounts
    ...
    index info: 2048 total in 2 blocks; 736 free (2 chunks); 1312 used: pg_database_oid_index
    index info: 2048 total in 2 blocks; 736 free (2 chunks); 1312 used: pg_authid_rolname_index
  WAL record construction: 49776 total in 2 blocks; 6344 free (0 chunks); 43432 used
  PrivateRefCount: 8192 total in 1 blocks; 2584 free (0 chunks); 5608 used
  MdSmgr: 8192 total in 1 blocks; 6360 free (0 chunks); 1832 used
  LOCALLOCK hash: 131072 total in 5 blocks; 26376 free (15 chunks); 104696 used
  Timezones: 104128 total in 2 blocks; 2584 free (0 chunks); 101544 used
  ErrorContext: 8192 total in 1 blocks; 7928 free (3 chunks); 264 used
Grand total: 17131488 bytes in 1441 blocks; 1936008 free (234 chunks); 15195480 used

Finally, I ran make check and all tests passed. So, I'll change this patch's status to Ready for Committer.

Regards,
Sho Kato
Hi Alvaro,

On Thu, Aug 6, 2020 at 4:25 PM kato-sho@fujitsu.com <kato-sho@fujitsu.com> wrote:
> On Wednesday, August 5, 2020 9:43 AM I wrote:
> > I'll report the result before the end of August.
>
> I tested v2-0001-build-partdesc-memcxt.patch at 9a9db08ae4 and it is OK.

Is this patch meant for HEAD or for back-patching? I ask because v13 got this:

commit 5b9312378e2f8fb35ef4584aea351c3319a10422
Author: Tom Lane <tgl@sss.pgh.pa.us>
Date:   Wed Dec 25 14:43:13 2019 -0500

    Load relcache entries' partitioning data on-demand, not immediately.

which prevents a partitioned table's PartitionDesc from being rebuilt repeatedly, as would happen before this commit in Kato-san's case, because it moves RelationBuildPartitionDesc out of the relcache flush code path. So, the OOM situation that Kato-san originally reported should not occur as of v13.

--
Amit Langote
EnterpriseDB: http://www.enterprisedb.com
On 2020-Aug-19, Amit Langote wrote:

Hello

> On Thu, Aug 6, 2020 at 4:25 PM kato-sho@fujitsu.com
> <kato-sho@fujitsu.com> wrote:
> > On Wednesday, August 5, 2020 9:43 AM I wrote:
> > > I'll report the result before the end of August.
> >
> > I tested v2-0001-build-partdesc-memcxt.patch at 9a9db08ae4 and it is OK.
>
> Is this patch meant for HEAD or for back-patching? I ask because v13 got this:
>
> commit 5b9312378e2f8fb35ef4584aea351c3319a10422
> Author: Tom Lane <tgl@sss.pgh.pa.us>
> Date:   Wed Dec 25 14:43:13 2019 -0500
>
>     Load relcache entries' partitioning data on-demand, not immediately.
>
> which prevents a partitioned table's PartitionDesc from being rebuilt
> repeatedly, as would happen before this commit in Kato-san's case,
> because it moves RelationBuildPartitionDesc out of the relcache flush
> code path.

Hmm, so this is a problem only in v11 and v12? It seems that putting the patch in master *only* is pointless. OTOH v11 had other severe performance drawbacks with lots of partitions, so it might not be needed there.

I admit I'm hesitant to carry code in only one or two stable branches that exists nowhere else. But maybe the problem is serious enough in those branches (which will still live for quite a few years) that we should get it there.

OTOH it could be argued that the coding in master is not all that great anyway (given the willingness for memory to be leaked), so it should apply to all three branches.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Hi,

On Thu, Aug 20, 2020 at 3:06 AM Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
> On 2020-Aug-19, Amit Langote wrote:
> > Is this patch meant for HEAD or for back-patching? I ask because v13
> > got commit 5b9312378e2f ("Load relcache entries' partitioning data
> > on-demand, not immediately."), which prevents a partitioned table's
> > PartitionDesc from being rebuilt repeatedly in Kato-san's case.
>
> Hmm, so this is a problem only in v11 and v12? It seems that putting
> the patch in master *only* is pointless. OTOH v11 had other severe
> performance drawbacks with lots of partitions, so it might not be needed
> there.
>
> I admit I'm hesitant to carry code in only one or two stable branches
> that exists nowhere else. But maybe the problem is serious enough in
> those branches (which will still live for quite a few years) that we
> should get it there.
>
> OTOH it could be argued that the coding in master is not all that great
> anyway (given the willingness for memory to be leaked), so it should
> apply to all three branches.

Fwiw, I am fine with applying the memory-leak fix in all branches down to v12 if we are satisfied with the implementation. That said, I don't offhand know of any real-world use case beside Kato-san's that's affected by this leak. Kato-san's case is creating a foreign key referencing a partitioned table with many partitions, something that's only supported from v12. (You may have noticed that the leak that occurs when rebuilding the referencing table's PartitionDesc accumulates while addFkRecurseReferenced is looping over the referenced table's partitions.)

--
Amit Langote
EnterpriseDB: http://www.enterprisedb.com
On Thu, Aug 20, 2020 at 10:50 AM Amit Langote <amitlangote09@gmail.com> wrote:
> On Thu, Aug 20, 2020 at 3:06 AM Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
> > On 2020-Aug-19, Amit Langote wrote:
> > > On Thu, Aug 6, 2020 at 4:25 PM kato-sho@fujitsu.com
> > > <kato-sho@fujitsu.com> wrote:
> > > > On Wednesday, August 5, 2020 9:43 AM I wrote:
> > > > > I'll report the result before the end of August.
> > > >
> > > > I tested v2-0001-build-partdesc-memcxt.patch at 9a9db08ae4 and it is OK.
>
> Fwiw, I am fine with applying the memory-leak fix in all branches down
> to v12 if we are satisfied with the implementation.

I have revised the above patch slightly to introduce a variable for the condition of whether to use a temporary memory context.

--
Amit Langote
EnterpriseDB: http://www.enterprisedb.com
Attachment
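The variable Amit mentions presumably just folds the partitioned-table case into the existing heuristic, along these lines; this is a guess at the shape of the code inside RelationBuildDesc, not the literal patch text:

    /*
     * Sketch: recover relcache-build scratch memory either when the
     * debugging heuristic demands it (CLOBBER_CACHE_ALWAYS builds define
     * RECOVER_RELATION_BUILD_MEMORY as 1) or when the relation is a
     * partitioned table, whose descriptor build allocates heavily.
     */
    bool        recover_build_memory;

    recover_build_memory = RECOVER_RELATION_BUILD_MEMORY ||
        relp->relkind == RELKIND_PARTITIONED_TABLE;

    if (recover_build_memory)
    {
        /* ... allocate subsidiary data in a droppable temp context ... */
    }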
Amit Langote <amitlangote09@gmail.com> writes:
>> Fwiw, I am fine with applying the memory-leak fix in all branches down
>> to v12 if we are satisfied with the implementation.

> I have revised the above patch slightly to introduce a variable for
> the condition of whether to use a temporary memory context.

This CF entry has been marked "ready for committer", which I find inappropriate since there doesn't seem to be any consensus about whether we need it.

I tried running the original test case under HEAD. I do not see any visible memory leak, which I think indicates that 5b9312378 or some other fix has taken care of the leak since the original report. However, after waiting awhile and noting that the ADD FOREIGN KEY wasn't finishing, I poked into its progress with a debugger and observed that each iteration of RI_Initial_Check() was taking about 15 seconds. I presume we have to do that for each partition, which leads to the estimate that it'll take 34 hours to finish this ... and that's with no data in the partitions, god help me if there'd been a lot.

Some quick "perf" work says that most of the time seems to be getting spent in the planner, in get_eclass_for_sort_expr(). So this is likely just a variant of performance issues we've seen before. (I do wonder why we're not able to prune the join to just the matching PK partition, though.)

Anyway, the long and the short of it is that this test case is far larger than anything anyone could practically use in HEAD, let alone in released branches. I can't get excited about back-patching a fix to a memory leak if that's just going to allow people to hit other performance-killing issues.

In short, I don't see a reason why we need this patch in any branch, so I recommend rejecting it. If we did think we needed a leak fix in the back branches, back-porting 5b9312378 would likely be a saner way to proceed.

regards, tom lane
On Fri, Sep 4, 2020 at 12:05 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:
>
> I tried running the original test case under HEAD. I do not see
> any visible memory leak, which I think indicates that 5b9312378 or
> some other fix has taken care of the leak since the original report.
> However, after waiting awhile and noting that the ADD FOREIGN KEY
> wasn't finishing, I poked into its progress with a debugger and
> observed that each iteration of RI_Initial_Check() was taking about
> 15 seconds. I presume we have to do that for each partition,
> which leads to the estimate that it'll take 34 hours to finish this
> ... and that's with no data in the partitions, god help me if
> there'd been a lot.
>
> Some quick "perf" work says that most of the time seems to be
> getting spent in the planner, in get_eclass_for_sort_expr().
> So this is likely just a variant of performance issues we've
> seen before. (I do wonder why we're not able to prune the
> join to just the matching PK partition, though.)

Consider this example:

postgres=# create table t1 (a int, b int, CHECK (a between 100 and 150));
CREATE TABLE
postgres=# create table part(a int, b int) partition by range(a);
CREATE TABLE
postgres=# create table part_p1 partition of part for values from (0) to (50);
CREATE TABLE
postgres=# create table part_p2 partition of part for values from (50) to (100);
CREATE TABLE
postgres=# create table part_p3 partition of part for values from (100) to (150);
CREATE TABLE
postgres=# create table part_p4 partition of part for values from (150) to (200);
CREATE TABLE
postgres=# explain (costs off) select * from t1 r1, part r2 where r1.a = r2.a;
              QUERY PLAN
--------------------------------------
 Hash Join
   Hash Cond: (r2.a = r1.a)
   ->  Append
         ->  Seq Scan on part_p1 r2_1
         ->  Seq Scan on part_p2 r2_2
         ->  Seq Scan on part_p3 r2_3
         ->  Seq Scan on part_p4 r2_4
   ->  Hash
         ->  Seq Scan on t1 r1
(9 rows)

Given that t1.a cannot have any value less than 100 or greater than 150, no row in t1 will have a joining partner in part_p1 or part_p2, so those two partitions could be pruned. But I think we don't consider the constraints on a table when joining two tables, either to render a join empty or to prune partitions. That would be a good optimization, and it would improve this case as well.

Further to that, I think that when we add a constraint on the partitioned table which translates to constraints on the individual partitions, we should check the entire partitioned relation rather than the individual partitions. If we did that, we wouldn't need to plan a query for every partition. If the foreign key happens to be the partition key, e.g. in a star schema, this would use partitionwise join to further improve query performance. Somewhere in the future, we will be able to repartition the foreign key table by the foreign key and perform a partitionwise join.

--
Best Wishes,
Ashutosh Bapat
I ran into a situation that echoes the original one by Kato at the start of this thread: https://www.postgresql.org/message-id/OSAPR01MB374809E8DE169C8BF2B82CBD9F6B0%40OSAPR01MB3748.jpnprd01.prod.outlook.com

More below.

Tom Lane <tgl(at)sss(dot)pgh(dot)pa(dot)us> writes:
>
> I tried running the original test case under HEAD. I do not see
> any visible memory leak, which I think indicates that 5b9312378 or
> some other fix has taken care of the leak since the original report.
> However, after waiting awhile and noting that the ADD FOREIGN KEY
> wasn't finishing, I poked into its progress with a debugger and
> observed that each iteration of RI_Initial_Check() was taking about
> 15 seconds. I presume we have to do that for each partition,
> which leads to the estimate that it'll take 34 hours to finish this
> ... and that's with no data in the partitions, god help me if
> there'd been a lot.
>
> Some quick "perf" work says that most of the time seems to be
> getting spent in the planner, in get_eclass_for_sort_expr().
> So this is likely just a variant of performance issues we've
> seen before. (I do wonder why we're not able to prune the
> join to just the matching PK partition, though.)

I found that the original example from Kato now finishes in a little over a minute to create the FK constraint in Postgres 14-16. However, in my case I'm using composite partitions, and that is taking 60x as long for an equivalent number of partitions. I must emphasize this is with ZERO rows of data.

I'm using 1,200 partitions in my example to finish a bit faster, because that's my actual use case, and because it is less extreme than 8,000 partitions. If I reach my 1,200 with 80 top-level and 15 leaf-level partitions in a composite hierarchy (80 x 15 = 1,200 still), the speed is very slow. I'm using composite partitions primarily because in my real code I need LIST partitions, which don't support multiple keys, so I worked around that using composite partitions with one LIST key at each level. Other workarounds, like a concatenated single key, were messy for my use case.

I modified Kato's test case to reproduce the issue I'm having and saw some very odd query plan behavior, which is likely part of the issue. The flat version with 1,200 partitions at a single level and no composite partitions finishes in a little over a second, while the 80 x 15 version with composite partitions takes over a minute (60x longer). In my actual database, with many such partitions having FKs, the time compounds: FK creation takes >30 minutes per FK, leading to hours spent just making FKs.

== COMPOSITE PARTITION INTERNAL SELECT PLAN ==

If I cancel the composite FK creation, I see where it stopped, and that gives a clue about the difference in speed. For the composite case, it's this statement, with a plan linked on dalibo showing a massive number of sequential scans and Postgres making some assumptions about 1 row existing.
ERROR: canceling statement due to user request
CONTEXT: SQL statement SELECT fk."aid" FROM ONLY "public"."xhistory_25_12" fk LEFT OUTER JOIN "public"."xaccounts" pk ON (pk."aid" OPERATOR(pg_catalog.=) fk."aid") WHERE pk."aid" IS NULL AND (fk."aid" IS NOT NULL)
SQL state: 57014

PLAN DETAILS: https://explain.dalibo.com/plan/fad72gdacb6727b4#plan

== FLAT PARTITION INTERNAL SELECT PLAN ==

This gives a direct result node and has no complexity at all:

SELECT fk."aid" FROM ONLY "public"."history_23" fk LEFT OUTER JOIN "public"."accounts" pk ON (pk."aid" OPERATOR(pg_catalog.=) fk."aid") WHERE pk."aid" IS NULL AND (fk."aid" IS NOT NULL)

PLAN DETAILS: https://explain.dalibo.com/plan/a83dae9b9569ebcd

Test cases to reproduce easily below:

== FLAT PARTITION FAST FK DDL ==

CREATE DATABASE fastflatfk;

CREATE TABLE accounts (aid INTEGER, bid INTEGER, abalance INTEGER, filler CHAR(84)) PARTITION BY HASH(aid);
CREATE TABLE history (tid INTEGER, bid INTEGER, aid INTEGER, delta INTEGER, mtime TIMESTAMP, filler CHAR(22)) PARTITION BY HASH(aid);

DO $$
DECLARE
    p INTEGER;
BEGIN
    FOR p IN 0..1023 LOOP
        EXECUTE 'CREATE TABLE accounts_' || p || ' PARTITION OF accounts FOR VALUES WITH (modulus 1024, remainder ' || p || ') PARTITION BY HASH(aid);';
        EXECUTE 'CREATE TABLE history_' || p || ' PARTITION OF history FOR VALUES WITH (modulus 1024, remainder ' || p || ') PARTITION BY HASH(aid);';
    END LOOP;
END $$;

ALTER TABLE accounts ADD CONSTRAINT accounts_pk PRIMARY KEY (aid);

-- Query returned successfully in 1 secs 547 msec.
ALTER TABLE history ADD CONSTRAINT history_fk FOREIGN KEY (aid) REFERENCES accounts (aid) ON DELETE CASCADE;

-- run to drop FK before you recreate it
-- ALTER TABLE history DROP CONSTRAINT history_fk

== COMPOSITE PARTITION SLOW FK DDL ==

Now the composite partition version with 80 x 15 partitions, which finishes in a bit over a minute (60x the time):

CREATE DATABASE slowcompfk;

-- Create the parent tables for xaccounts and xhistory
CREATE TABLE xaccounts (aid INTEGER, bid INTEGER, abalance INTEGER, filler CHAR(84)) PARTITION BY HASH(aid);
CREATE TABLE xhistory (tid INTEGER, bid INTEGER, aid INTEGER, delta INTEGER, mtime TIMESTAMP, filler CHAR(22)) PARTITION BY HASH(aid);

-- Generate SQL for creating 80 partitions for xaccounts
DO $$
DECLARE
    p INTEGER;
BEGIN
    FOR p IN 0..79 LOOP
        EXECUTE 'CREATE TABLE xaccounts_' || p || ' PARTITION OF xaccounts FOR VALUES WITH (modulus 80, remainder ' || p || ') PARTITION BY HASH(aid);';
    END LOOP;
END $$;

-- Generate SQL for creating 15 sub-partitions within each partition for xaccounts
DO $$
DECLARE
    main_partition INTEGER;
    sub_partition INTEGER;
BEGIN
    FOR main_partition IN 0..79 LOOP
        FOR sub_partition IN 0..14 LOOP
            EXECUTE 'CREATE TABLE xaccounts_' || main_partition || '_' || sub_partition || ' PARTITION OF xaccounts_' || main_partition || ' FOR VALUES WITH (modulus 15, remainder ' || sub_partition || ');';
        END LOOP;
    END LOOP;
END $$;

-- Generate SQL for creating 80 partitions for xhistory
DO $$
DECLARE
    p INTEGER;
BEGIN
    FOR p IN 0..79 LOOP
        EXECUTE 'CREATE TABLE xhistory_' || p || ' PARTITION OF xhistory FOR VALUES WITH (modulus 80, remainder ' || p || ') PARTITION BY HASH(aid);';
    END LOOP;
END $$;

-- Generate SQL for creating 15 sub-partitions within each partition for xhistory
DO $$
DECLARE
    main_partition INTEGER;
    sub_partition INTEGER;
BEGIN
    FOR main_partition IN 0..79 LOOP
        FOR sub_partition IN 0..14 LOOP
            EXECUTE 'CREATE TABLE xhistory_' || main_partition || '_' || sub_partition || ' PARTITION OF xhistory_' || main_partition || ' FOR VALUES WITH (modulus 15, remainder ' || sub_partition || ');';
        END LOOP;
    END LOOP;
END $$;

ALTER TABLE xaccounts ADD CONSTRAINT xaccounts_pk PRIMARY KEY (aid);

-- Query returned successfully in 1 min 18 secs.
ALTER TABLE xhistory ADD CONSTRAINT xhistory_fk FOREIGN KEY (aid) REFERENCES xaccounts (aid) ON DELETE CASCADE;

-- run to drop FK before you recreate it
-- ALTER TABLE xhistory DROP CONSTRAINT xhistory_fk