Thread: Creating foreign key on partitioned table is too slow
Hello

To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM. The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).

I did the same thing on another server with 200GB of memory, but creating the foreign key did not finish within 24 hours.

Is the community aware of this? Is anyone working on it? If there is already a discussion, please point me to the thread.

The table definition and pstack are as follows.

* table definition *

CREATE TABLE accounts (aid INTEGER, bid INTEGER, abalance INTEGER, filler CHAR(84)) PARTITION BY HASH(aid);
CREATE TABLE history (tid INTEGER, bid INTEGER, aid INTEGER, delta INTEGER, mtime TIMESTAMP, filler CHAR(22)) PARTITION BY HASH(aid);

\o /dev/null
SELECT 'CREATE TABLE accounts_' || p || ' PARTITION OF accounts FOR VALUES WITH (modulus 8192, remainder ' || p || ');' FROM generate_series(0, 8191) p; \gexec
SELECT 'CREATE TABLE history_' || p || ' PARTITION OF history FOR VALUES WITH (modulus 8192, remainder ' || p || ');' FROM generate_series(0, 8191) p; \gexec
\o

ALTER TABLE accounts ADD CONSTRAINT accounts_pk PRIMARY KEY (aid);
ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);

* pstack before killed by OOM *

#0  0x0000000000a84aec in ReleaseSysCache (tuple=0x7fbb0a15dc28) at syscache.c:1175
#1  0x0000000000a7135d in get_rel_relkind (relid=164628) at lsyscache.c:1816
#2  0x0000000000845f0a in RelationBuildPartitionDesc (rel=0x7fbadb9bfb10) at partdesc.c:230
#3  0x0000000000a78b9a in RelationBuildDesc (targetRelId=139268, insertIt=false) at relcache.c:1173
#4  0x0000000000a7b52e in RelationClearRelation (relation=0x7fbb0a1393e8, rebuild=true) at relcache.c:2534
#5  0x0000000000a7bacf in RelationFlushRelation (relation=0x7fbb0a1393e8) at relcache.c:2692
#6  0x0000000000a7bbe1 in RelationCacheInvalidateEntry (relationId=139268) at relcache.c:2744
#7  0x0000000000a6e11d in LocalExecuteInvalidationMessage (msg=0x7fbadb62e480) at inval.c:589
#8  0x0000000000a6de7d in ProcessInvalidationMessages (hdr=0x1d36d48, func=0xa6e01a <LocalExecuteInvalidationMessage>) at inval.c:460
#9  0x0000000000a6e94e in CommandEndInvalidationMessages () at inval.c:1095
#10 0x0000000000559c93 in AtCCI_LocalCache () at xact.c:1458
#11 0x00000000005596ac in CommandCounterIncrement () at xact.c:1040
#12 0x00000000006b1811 in addFkRecurseReferenced (wqueue=0x7fffcb0a0588, fkconstraint=0x20cf6a0, rel=0x7fbb0a1393e8, pkrel=0x7fbadb9bbe90, indexOid=189582, parentConstr=204810, numfks=1, pkattnum=0x7fffcb0a0190, fkattnum=0x7fffcb0a0150, pfeqoperators=0x7fffcb09ff50, ppeqoperators=0x7fffcb09fed0, ffeqoperators=0x7fffcb09fe50, old_check_ok=false) at tablecmds.c:8168
#13 0x00000000006b1a0b in addFkRecurseReferenced (wqueue=0x7fffcb0a0588, fkconstraint=0x20cf6a0, rel=0x7fbb0a1393e8, pkrel=0x7fbadc188840, indexOid=188424, parentConstr=0, numfks=1, pkattnum=0x7fffcb0a0190, fkattnum=0x7fffcb0a0150, pfeqoperators=0x7fffcb09ff50, ppeqoperators=0x7fffcb09fed0, ffeqoperators=0x7fffcb09fe50, old_check_ok=false) at tablecmds.c:8219
#14 0x00000000006b13e0 in ATAddForeignKeyConstraint (wqueue=0x7fffcb0a0588, tab=0x20cf4d8, rel=0x7fbb0a1393e8, fkconstraint=0x20cf6a0, parentConstr=0, recurse=true, recursing=false, lockmode=6) at tablecmds.c:8005
#15 0x00000000006afa0c in ATExecAddConstraint (wqueue=0x7fffcb0a0588, tab=0x20cf4d8, rel=0x7fbb0a1393e8, newConstraint=0x20cf6a0, recurse=true, is_readd=false, lockmode=6) at tablecmds.c:7419
#16 0x00000000006a8a7a in ATExecCmd (wqueue=0x7fffcb0a0588, tab=0x20cf4d8, rel=0x7fbb0a1393e8, cmd=0x20cf648, lockmode=6) at tablecmds.c:4300
#17 0x00000000006a8448 in ATRewriteCatalogs (wqueue=0x7fffcb0a0588, lockmode=6) at tablecmds.c:4185
#18 0x00000000006a7bf9 in ATController (parsetree=0x1cb4350, rel=0x7fbb0a1393e8, cmds=0x20cf428, recurse=true, lockmode=6) at tablecmds.c:3843
#19 0x00000000006a78a4 in AlterTable (relid=139268, lockmode=6, stmt=0x1cb4350) at tablecmds.c:3504
#20 0x0000000000914999 in ProcessUtilitySlow (pstate=0x1cb3a10, pstmt=0x1c91380, queryString=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at utility.c:1131
#21 0x0000000000914490 in standard_ProcessUtility (pstmt=0x1c91380, queryString=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at utility.c:927
#22 0x0000000000913534 in ProcessUtility (pstmt=0x1c91380, queryString=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at utility.c:360
#23 0x000000000091245a in PortalRunUtility (portal=0x1cf5ee0, pstmt=0x1c91380, isTopLevel=true, setHoldSnapshot=false, dest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at pquery.c:1175
#24 0x0000000000912671 in PortalRunMulti (portal=0x1cf5ee0, isTopLevel=true, setHoldSnapshot=false, dest=0x1c91470, altdest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at pquery.c:1321
#25 0x0000000000911ba6 in PortalRun (portal=0x1cf5ee0, count=9223372036854775807, isTopLevel=true, run_once=true, dest=0x1c91470, altdest=0x1c91470, completionTag=0x7fffcb0a0d20 "") at pquery.c:796
#26 0x000000000090b9ad in exec_simple_query (query_string=0x1c90170 "ALTER TABLE history ADD CONSTRAINT history_fk3 FOREIGN KEY (aid) REFERENCES accounts (aid);") at postgres.c:1231
#27 0x000000000090fd13 in PostgresMain (argc=1, argv=0x1cb9fb8, dbname=0x1cb9ed0 "postgres", username=0x1cb9eb0 "k5user") at postgres.c:4256
#28 0x0000000000864cbd in BackendRun (port=0x1cb1e90) at postmaster.c:4498
#29 0x000000000086449b in BackendStartup (port=0x1cb1e90) at postmaster.c:4189
#30 0x00000000008608d7 in ServerLoop () at postmaster.c:1727
#31 0x000000000086018d in PostmasterMain (argc=1, argv=0x1c8aa40) at postmaster.c:1400
#32 0x0000000000770835 in main (argc=1, argv=0x1c8aa40) at main.c:210

regards,
Kato Sho
On 2019-Oct-23, kato-sho@fujitsu.com wrote:

> Hello
>
> To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
>
> I did the same thing on another server with 200GB of memory, but creating the foreign key did not finish within 24 hours.

Thanks for reporting. It sounds like there must be a memory leak here. I am fairly pressed for time at present so I won't be able to investigate this until, at least, mid November.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On Thu, Oct 24, 2019 at 03:48:57PM -0300, Alvaro Herrera wrote:
> On 2019-Oct-23, kato-sho@fujitsu.com wrote:
>
> > Hello
> >
> > To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> > The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
> >
> > I did the same thing on another server with 200GB of memory, but creating the foreign key did not finish within 24 hours.
>
> Thanks for reporting. It sounds like there must be a memory leak here.
> I am fairly pressed for time at present so I won't be able to
> investigate this until, at least, mid November.

I've briefly looked into this, and I think the main memory leak is in RelationBuildPartitionDesc. It gets called with PortalContext, it allocates a lot of memory building the descriptor, copies it into CacheMemoryContext, but does not even try to free anything. So we end up with something like this:

TopMemoryContext: 215344 total in 11 blocks; 47720 free (12 chunks); 167624 used
  pgstat TabStatusArray lookup hash table: 32768 total in 3 blocks; 9160 free (4 chunks); 23608 used
  TopTransactionContext: 4194304 total in 10 blocks; 1992968 free (18 chunks); 2201336 used
  RowDescriptionContext: 8192 total in 1 blocks; 6880 free (0 chunks); 1312 used
  MessageContext: 8192 total in 1 blocks; 3256 free (1 chunks); 4936 used
  Operator class cache: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  smgr relation table: 32768 total in 3 blocks; 16768 free (8 chunks); 16000 used
  TransactionAbortContext: 32768 total in 1 blocks; 32504 free (0 chunks); 264 used
  Portal hash: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  TopPortalContext: 8192 total in 1 blocks; 7648 free (0 chunks); 544 used
    PortalContext: 1557985728 total in 177490 blocks; 9038656 free (167645 chunks); 1548947072 used:
  Relcache by OID: 16384 total in 2 blocks; 3424 free (3 chunks); 12960 used
  CacheMemoryContext: 17039424 total in 13 blocks; 7181480 free (9 chunks); 9857944 used
    partition key: 1024 total in 1 blocks; 168 free (0 chunks); 856 used: history
    index info: 2048 total in 2 blocks; 568 free (1 chunks); 1480 used: pg_class_tblspc_relfilenode_index
    ...
    index info: 2048 total in 2 blocks; 872 free (0 chunks); 1176 used: pg_class_oid_index
  WAL record construction: 49776 total in 2 blocks; 6344 free (0 chunks); 43432 used
  PrivateRefCount: 8192 total in 1 blocks; 2584 free (0 chunks); 5608 used
  MdSmgr: 8192 total in 1 blocks; 5976 free (0 chunks); 2216 used
  LOCALLOCK hash: 65536 total in 4 blocks; 18584 free (12 chunks); 46952 used
  Timezones: 104128 total in 2 blocks; 2584 free (0 chunks); 101544 used
  ErrorContext: 8192 total in 1 blocks; 6840 free (4 chunks); 1352 used
Grand total: 1580997216 bytes in 177834 blocks; 18482808 free (167857 chunks); 1562514408 used

(At which point I simply interrupted the query; it'd allocate more and more memory until an OOM.)

The attached patch trivially fixes that by adding a memory context tracking all the temporary data, and then just deleting it as a whole at the end of the function. This significantly reduces the memory usage for me; I'm not sure it's 100% correct.

FWIW, even with this fix it still takes an awfully long time to create the foreign key, because the CPU is stuck doing this:

  60.78%  60.78%  postgres  postgres   [.] bms_equal
  32.58%  32.58%  postgres  postgres   [.] get_eclass_for_sort_expr
   3.83%   3.83%  postgres  postgres   [.] add_child_rel_equivalences
   0.23%   0.00%  postgres  [unknown]  [.] 0x0000000000000005
   0.22%   0.00%  postgres  [unknown]  [.] 0000000000000000
   0.18%   0.18%  postgres  postgres   [.] AllocSetCheck
   ...

Haven't looked into the details yet.

regards

--
Tomas Vondra                  http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachment
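The shape of the fix Tomas describes is PostgreSQL's usual temporary-context pattern. A minimal sketch, assuming only the finished descriptor must outlive the call; the function body, context name, and placement are illustrative, not the actual patch:

    #include "postgres.h"
    #include "utils/memutils.h"

    /*
     * Sketch only: do all scratch work in a context that can be dropped
     * wholesale, copying just the finished result into CacheMemoryContext.
     */
    static void
    build_partdesc_sketch(void)
    {
        MemoryContext tmpcxt;
        MemoryContext oldcxt;

        tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
                                       "partition descriptor temp context",
                                       ALLOCSET_DEFAULT_SIZES);
        oldcxt = MemoryContextSwitchTo(tmpcxt);

        /* ... build bound specs, sort them, etc.; leak freely here ... */

        MemoryContextSwitchTo(CacheMemoryContext);
        /* ... copy the finished PartitionDesc into the relcache entry ... */

        MemoryContextSwitchTo(oldcxt);
        MemoryContextDelete(tmpcxt);    /* frees every temporary allocation */
    }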
On Fri, Oct 25, 2019 at 12:17:58AM +0200, Tomas Vondra wrote:
>
> ...
>
> FWIW, even with this fix it still takes an awfully long time to create
> the foreign key, because the CPU is stuck doing this:
>
>   60.78%  60.78%  postgres  postgres   [.] bms_equal
>   32.58%  32.58%  postgres  postgres   [.] get_eclass_for_sort_expr
>    3.83%   3.83%  postgres  postgres   [.] add_child_rel_equivalences
>    0.23%   0.00%  postgres  [unknown]  [.] 0x0000000000000005
>    0.22%   0.00%  postgres  [unknown]  [.] 0000000000000000
>    0.18%   0.18%  postgres  postgres   [.] AllocSetCheck
>    ...
>
> Haven't looked into the details yet.

OK, a bit more info. A better perf report (with frame pointers etc.) looks like this:

+   99.98%  0.00%  postgres  [unknown]     [.] 0x495641002c4d133d
+   99.98%  0.00%  postgres  libc-2.29.so  [.] __libc_start_main
+   99.98%  0.00%  postgres  postgres      [.] startup_hacks
+   99.98%  0.00%  postgres  postgres      [.] PostmasterMain
+   99.98%  0.00%  postgres  postgres      [.] ServerLoop
+   99.98%  0.00%  postgres  postgres      [.] BackendStartup
+   99.98%  0.00%  postgres  postgres      [.] ExitPostmaster
+   99.98%  0.00%  postgres  postgres      [.] PostgresMain
+   99.98%  0.00%  postgres  postgres      [.] exec_simple_query
+   99.98%  0.00%  postgres  postgres      [.] PortalRun
+   99.98%  0.00%  postgres  postgres      [.] PortalRunMulti
+   99.98%  0.00%  postgres  postgres      [.] PortalRunUtility
+   99.98%  0.00%  postgres  postgres      [.] ProcessUtility
+   99.98%  0.00%  postgres  postgres      [.] standard_ProcessUtility
+   99.98%  0.00%  postgres  postgres      [.] ProcessUtilitySlow
+   99.98%  0.00%  postgres  postgres      [.] AlterTable
+   99.98%  0.00%  postgres  postgres      [.] ATController
+   99.98%  0.00%  postgres  postgres      [.] ATRewriteTables
+   99.98%  0.00%  postgres  postgres      [.] validateForeignKeyConstraint
+   99.98%  0.00%  postgres  postgres      [.] RI_Initial_Check
+   99.96%  0.00%  postgres  postgres      [.] SPI_execute_snapshot
+   99.86%  0.00%  postgres  postgres      [.] _SPI_execute_plan
+   99.70%  0.00%  postgres  postgres      [.] GetCachedPlan
+   99.70%  0.00%  postgres  postgres      [.] BuildCachedPlan
+   99.66%  0.00%  postgres  postgres      [.] pg_plan_queries
+   99.66%  0.00%  postgres  postgres      [.] pg_plan_query
+   99.66%  0.00%  postgres  postgres      [.] planner
+   99.66%  0.00%  postgres  postgres      [.] standard_planner
+   99.62%  0.00%  postgres  postgres      [.] subquery_planner
+   99.62%  0.00%  postgres  postgres      [.] grouping_planner
+   99.62%  0.00%  postgres  postgres      [.] query_planner
+   99.31%  0.00%  postgres  postgres      [.] make_one_rel
+   97.53%  0.00%  postgres  postgres      [.] set_base_rel_pathlists
+   97.53%  0.00%  postgres  postgres      [.] set_rel_pathlist
+   97.53%  0.01%  postgres  postgres      [.] set_append_rel_pathlist
+   97.42%  0.00%  postgres  postgres      [.] set_plain_rel_pathlist
+   97.40%  0.02%  postgres  postgres      [.] create_index_paths
+   97.16%  0.01%  postgres  postgres      [.] get_index_paths
+   97.12%  0.02%  postgres  postgres      [.] build_index_paths
+   96.67%  0.01%  postgres  postgres      [.] build_index_pathkeys
+   96.61%  0.01%  postgres  postgres      [.] make_pathkey_from_sortinfo
+   95.70% 21.27%  postgres  postgres      [.] get_eclass_for_sort_expr
+   75.21% 75.21%  postgres  postgres      [.] bms_equal
+   48.72%  0.00%  postgres  postgres      [.] consider_index_join_clauses
+   48.72%  0.00%  postgres  postgres      [.] consider_index_join_outer_rels
+   48.72%  0.02%  postgres  postgres      [.] get_join_index_paths
+    1.78%  0.00%  postgres  postgres      [.] set_base_rel_sizes
+    1.78%  0.00%  postgres  postgres      [.] set_rel_size
+    1.78%  0.01%  postgres  postgres      [.] set_append_rel_size
+    1.66%  1.34%  postgres  postgres      [.] add_child_rel_equivalences

It is (pretty much) a single call stack, i.e. each function is simply calling the one below it (with some minor exceptions at the end, but that's pretty negligible here). This essentially says that planning the queries executed by RI_Initial_Check with many partitions is damn expensive. An example query is this one:

test=# \timing
test=# SELECT fk."aid" FROM ONLY "public"."history_60" fk LEFT OUTER JOIN "public"."accounts" pk ON (pk."aid" OPERATOR(pg_catalog.=) fk."aid") WHERE pk."aid" IS NULL AND (fk."aid" IS NOT NULL);
 aid
-----
(0 rows)

Time: 28791.492 ms (00:28.791)

Bear in mind those are *empty* tables, so the execution is pretty cheap (EXPLAIN ANALYZE says the execution takes ~65ms, but the planning itself takes ~28 seconds). And we have 8192 such partitions, which means we'd spend ~230k seconds just planning the RI queries. That's 64 hours.

regards

--
Tomas Vondra                  http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Hi,

On 2019-10-23 05:59:01 +0000, kato-sho@fujitsu.com wrote:
> To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).

Obviously this should be improved. But I think it's also worthwhile to note that using 8k partitions is very unlikely to be a good choice for anything. The metadata, partition pruning, etc. overhead is just going to be very substantial.

Greetings,

Andres Freund
On Thu, Oct 24, 2019 at 04:28:38PM -0700, Andres Freund wrote:
> Hi,
>
> On 2019-10-23 05:59:01 +0000, kato-sho@fujitsu.com wrote:
> > To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> > The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
>
> Obviously this should be improved. But I think it's also worthwhile to
> note that using 8k partitions is very unlikely to be a good choice for
> anything. The metadata, partition pruning, etc. overhead is just going to
> be very substantial.

True. Especially with two partitioned tables, each with 8k partitions.

I do think it makes sense to reduce the memory usage, because just eating all available memory (in the extreme case) is not very nice. I've added that patch to the CF, although the patch I shared is very crude and I'm by no means suggesting it's how it should be done ultimately.

The other bit (speed of planning with 8k partitions) is probably a more general issue, and I suppose we'll improve that over time. I don't think there's a simple change that magically improves that.

regards

--
Tomas Vondra                  http://www.2ndQuadrant.com
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On Thu, 31 Oct 2019 at 07:30, Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:
>
> On Thu, Oct 24, 2019 at 04:28:38PM -0700, Andres Freund wrote:
> > On 2019-10-23 05:59:01 +0000, kato-sho@fujitsu.com wrote:
> > > To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> > > The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
> >
> > Obviously this should be improved. But I think it's also worthwhile to
> > note that using 8k partitions is very unlikely to be a good choice for
> > anything. The metadata, partition pruning, etc. overhead is just going to
> > be very substantial.
>
> True. Especially with two partitioned tables, each with 8k partitions.

In Ottawa this year, Andres and I briefly talked about the possibility of making a series of changes to how equalfuncs.c works. The idea was to make it easy, using some pre-processor magic, to create another version of equalfuncs which would let us have an equal comparison function that returns -1 / 0 / +1 rather than just true or false. This would allow us to build Binary Search Trees of objects. I identified that EquivalenceClass.ec_members would be better written as a BST to allow much faster lookups in get_eclass_for_sort_expr().

The implementation I had in mind for the BST was a compact tree that, instead of using pointers for the left and right children, just uses an integer to reference the array element number. This would allow us to maintain very fast insert-order traversals. Deletes would need to decrement all child references greater than the deleted index. This is sort of on par with how the new List implementation in master works: deletes take additional effort, but inserts are fast if there's enough space in the array for a new element, traversals are cache-friendly, etc. A sketch of this layout follows below.

I think trees might be better than hash tables for this, as a hash function needs to hash all fields, whereas a comparison function can stop when it finds the first non-match.

This may also be able to help simplify the code in setrefs.c to get rid of the complex code around indexed tlists. tlist_member() would become O(log n) instead of O(n), so perhaps there'd be not much point in having both search_indexed_tlist_for_var() and search_indexed_tlist_for_non_var().

--
David Rowley                  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
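One way the compact tree David describes could be laid out, with children referenced by array index so that the backing array itself preserves insertion order; the type and function names here are hypothetical:

    #include "postgres.h"
    #include "nodes/nodes.h"

    /* Hypothetical node layout: children are array indexes, not pointers. */
    typedef struct CompactBSTNode
    {
        Node   *value;          /* the stored object */
        int     left;           /* index of left child, or -1 */
        int     right;          /* index of right child, or -1 */
    } CompactBSTNode;

    typedef struct CompactBST
    {
        CompactBSTNode *nodes;  /* nodes[0] is the root */
        int             nnodes; /* insert-order traversal is a plain walk
                                 * of nodes[0 .. nnodes-1] */
    } CompactBST;

    /*
     * Lookup with a three-way comparator (-1/0/+1).  Unlike a hash, the
     * comparison can stop at the first field that differs.
     */
    static Node *
    compact_bst_find(CompactBST *tree, Node *key,
                     int (*cmp) (const Node *, const Node *))
    {
        int     i = (tree->nnodes > 0) ? 0 : -1;

        while (i >= 0)
        {
            int     c = cmp(key, tree->nodes[i].value);

            if (c == 0)
                return tree->nodes[i].value;
            i = (c < 0) ? tree->nodes[i].left : tree->nodes[i].right;
        }
        return NULL;
    }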
Hi,

On 2019-10-31 11:19:05 +1300, David Rowley wrote:
> In Ottawa this year, Andres and I briefly talked about the possibility
> of making a series of changes to how equalfuncs.c works. The idea was
> to make it easy, using some pre-processor magic, to create another
> version of equalfuncs which would let us have an equal comparison
> function that returns -1 / 0 / +1 rather than just true or false.

See also the thread at https://www.postgresql.org/message-id/20190920051857.2fhnvhvx4qdddviz%40alap3.anarazel.de which would make this fairly easy, without having to compile equalfuncs twice or such.

Greetings,

Andres Freund
David Rowley <david.rowley@2ndquadrant.com> writes:
> In Ottawa this year, Andres and I briefly talked about the possibility
> of making a series of changes to how equalfuncs.c works. The idea was
> to make it easy, using some pre-processor magic, to create another
> version of equalfuncs which would let us have an equal comparison
> function that returns -1 / 0 / +1 rather than just true or false. This
> would allow us to build Binary Search Trees of objects. I identified
> that EquivalenceClass.ec_members would be better written as a BST to
> allow much faster lookups in get_eclass_for_sort_expr().

This seems like a good idea, but why would we want to maintain two versions? Just change equalfuncs.c into comparefuncs.c, full stop. equal() would be a trivial wrapper for (compare_objects(a,b) == 0).

Andres' ideas about autogenerating all that boilerplate aren't bad, but that's no justification for carrying two full sets of per-node logic when one set would do.

regards, tom lane
On Thu, 31 Oct 2019 at 17:56, Tom Lane <tgl@sss.pgh.pa.us> wrote:
>
> David Rowley <david.rowley@2ndquadrant.com> writes:
> > In Ottawa this year, Andres and I briefly talked about the possibility
> > of making a series of changes to how equalfuncs.c works. The idea was
> > to make it easy, using some pre-processor magic, to create another
> > version of equalfuncs which would let us have an equal comparison
> > function that returns -1 / 0 / +1 rather than just true or false. This
> > would allow us to build Binary Search Trees of objects. I identified
> > that EquivalenceClass.ec_members would be better written as a BST to
> > allow much faster lookups in get_eclass_for_sort_expr().
>
> This seems like a good idea, but why would we want to maintain two
> versions? Just change equalfuncs.c into comparefuncs.c, full stop.
> equal() would be a trivial wrapper for (compare_objects(a,b) == 0).

If we can do that without slowing down the comparison, then sure. Checking which node sorts earlier is a bit more expensive than just checking for equality. But if that's not going to be noticeable in real-world test cases, then I agree.

--
David Rowley                  http://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Training & Services
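A sketch of the trade-off being discussed, on a made-up struct: the three-way comparator still bails out at the first mismatching field, so the extra work relative to a pure equality check is only deciding the direction of the mismatch, and equal() becomes the trivial wrapper Tom suggests:

    #include <stdbool.h>

    typedef struct ExampleNode
    {
        int     tag;
        long    x;
    } ExampleNode;

    /* Three-way comparison: returns -1/0/+1, stopping at the first
     * field that differs. */
    static int
    compare_example(const ExampleNode *a, const ExampleNode *b)
    {
        if (a->tag != b->tag)
            return (a->tag < b->tag) ? -1 : 1;
        if (a->x != b->x)
            return (a->x < b->x) ? -1 : 1;
        return 0;
    }

    /* equal() as a trivial wrapper around the comparator. */
    static inline bool
    equal_example(const ExampleNode *a, const ExampleNode *b)
    {
        return compare_example(a, b) == 0;
    }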
Hello,

On Fri, Oct 25, 2019 at 7:18 AM Tomas Vondra <tomas.vondra@2ndquadrant.com> wrote:
> On Thu, Oct 24, 2019 at 03:48:57PM -0300, Alvaro Herrera wrote:
> > On 2019-Oct-23, kato-sho@fujitsu.com wrote:
> >
> >> Hello
> >>
> >> To benchmark with the tpcb model, I tried to create a foreign key on the partitioned history table, but the backend process was killed by OOM.
> >> The number of partitions is 8192. I tried this on master (commit: ad4b7aeb84).
> >>
> >> I did the same thing on another server with 200GB of memory, but creating the foreign key did not finish within 24 hours.
> >
> > Thanks for reporting.

Thank you Kato-san.

> > It sounds like there must be a memory leak here.
> > I am fairly pressed for time at present so I won't be able to
> > investigate this until, at least, mid November.
>
> I've briefly looked into this, and I think the main memory leak is in
> RelationBuildPartitionDesc. It gets called with PortalContext, it
> allocates a lot of memory building the descriptor, copies it into
> CacheMemoryContext, but does not even try to free anything. So we end up
> with something like this:
...
> The attached patch trivially fixes that by adding a memory context
> tracking all the temporary data, and then just deleting it as a whole at
> the end of the function. This significantly reduces the memory usage for
> me; I'm not sure it's 100% correct.

Thank you Tomas. I think we have considered this temporary context fix a number of times before, but it got stalled for one reason or another ([1] comes to mind as the last thread where this came up).

Another angle to look at this is that our design, where PartitionDesc is rebuilt on relcache reload of the parent relation, is not a great one after all. It seems that we're rightly (?) invalidating the parent's relcache 8192 times in this case, because its cacheable foreign key descriptor changes on processing each partition, but PartitionDesc itself doesn't change. Having to pointlessly rebuild it 8192 times seems really wasteful.

I recall a discussion where it was proposed to build PartitionDesc only when needed, as opposed to on every relcache reload of the parent relation. The attached PoC-at-best patch that does that seems to go through without OOM and passes make check-world. I think this should have a very minor impact on SELECT queries. But...

> FWIW, even with this fix it still takes an awfully long time to create
> the foreign key, because the CPU is stuck doing this:
>
>   60.78%  60.78%  postgres  postgres   [.] bms_equal
>   32.58%  32.58%  postgres  postgres   [.] get_eclass_for_sort_expr
>    3.83%   3.83%  postgres  postgres   [.] add_child_rel_equivalences
>    0.23%   0.00%  postgres  [unknown]  [.] 0x0000000000000005
>    0.22%   0.00%  postgres  [unknown]  [.] 0000000000000000
>    0.18%   0.18%  postgres  postgres   [.] AllocSetCheck

...we have many problems to solve here. :-(

Thanks,
Amit

[1] https://www.postgresql.org/message-id/CA%2BTgmoY3bRmGB6-DUnoVy5fJoreiBJ43rwMrQRCdPXuKt4Ykaw%40mail.gmail.com
Attachment
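The on-demand approach in Amit's PoC amounts to rebuilding the descriptor at first use, instead of inside the invalidation path. Roughly this shape, simplified for illustration (as noted downthread, commit 5b9312378 later adopted essentially this for v13):

    #include "postgres.h"
    #include "partitioning/partdesc.h"
    #include "utils/rel.h"

    /*
     * Simplified sketch: a relcache flush just clears rd_partdesc; the
     * expensive rebuild happens lazily, only when a caller asks for it.
     */
    PartitionDesc
    RelationGetPartitionDesc(Relation rel)
    {
        if (rel->rd_partdesc == NULL)
            RelationBuildPartitionDesc(rel);
        return rel->rd_partdesc;
    }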
On 2019-Oct-25, Tomas Vondra wrote:

> The attached patch trivially fixes that by adding a memory context
> tracking all the temporary data, and then just deleting it as a whole at
> the end of the function. This significantly reduces the memory usage for
> me; I'm not sure it's 100% correct.

FWIW we already had this code (added by commit 2455ab48844c), but it was removed by commit d3f48dfae42f. I think we should put it back.

(I think it may be useful to use a static MemoryContext that we can just reset each time, instead of creating and deleting one each time, to save on memcxt churn. That'd make the function non-reentrant, but I don't see that we'd make the catalogs partitioned any time soon. This may be premature optimization though -- not really wedded to it.)

With Amit's patch to make RelationBuildPartitionDesc be called lazily, the time to plan the RI_Initial_Check query (using Kato Sho's test case) goes from 30 seconds to 14 seconds on my laptop. Obviously there's more that we'd need to fix to make the scenario fully supported, but it seems a decent step forward.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachment
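The static-context variant Alvaro floats would look roughly like this (hypothetical names): MemoryContextReset() empties the context in place, avoiding the create/delete churn of one context per call, at the cost of making the function non-reentrant, as he notes.

    #include "postgres.h"
    #include "utils/memutils.h"

    /* Process-lifetime scratch context, reused across calls (sketch only). */
    static MemoryContext PartDescScratchCxt = NULL;

    static MemoryContext
    get_partdesc_scratch_cxt(void)
    {
        if (PartDescScratchCxt == NULL)
            PartDescScratchCxt = AllocSetContextCreate(TopMemoryContext,
                                                       "partdesc scratch",
                                                       ALLOCSET_DEFAULT_SIZES);
        else
            MemoryContextReset(PartDescScratchCxt); /* drop last call's data */
        return PartDescScratchCxt;
    }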
Alvaro Herrera <alvherre@2ndquadrant.com> writes:
> On 2019-Oct-25, Tomas Vondra wrote:
>> The attached patch trivially fixes that by adding a memory context
>> tracking all the temporary data, and then just deleting it as a whole at
>> the end of the function. This significantly reduces the memory usage for
>> me; I'm not sure it's 100% correct.

> FWIW we already had this code (added by commit 2455ab48844c), but it was
> removed by commit d3f48dfae42f. I think we should put it back.

I disagree. The point of d3f48dfae42f is that the management of that leakage is now being done at the caller level, and I'm quite firmly against having RelationBuildPartitionDesc duplicate that. If we don't like the amount of space RelationBuildPartitionDesc is leaking, we aren't going to like the amount of space that sibling routines such as RelationBuildTriggers leak, either.

What we ought to be thinking about instead is adjusting the RECOVER_RELATION_BUILD_MEMORY heuristic in relcache.c. I am not sure what it ought to look like, but I doubt that "do it all the time" has suddenly become the right answer, when it wasn't the right answer for 20-something years. It's conceivable that "do it if CCA is on, or if the current table is a partition child table" is a reasonable approach. But I'm not sure whether we can know the relation's relkind early enough for that :-(

(BTW, a different question one could ask is exactly why RelationBuildPartitionDesc is so profligate of leaked memory.)

regards, tom lane
On 2019-Nov-13, Tom Lane wrote:

> (BTW, a different question one could ask is exactly why
> RelationBuildPartitionDesc is so profligate of leaked memory.)

The original partitioning code (f0e44751d717) decided that it didn't want to bother with adding a "free" routine for PartitionBoundInfo structs, maybe because they hold too many pointers, so there's no way for RelationBuildPartitionDesc to free everything it allocates anyway. We could add a couple of pfrees and list_frees here and there, but for the main thing being leaked we'd need to improve that API.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
On 2019-Nov-13, Alvaro Herrera wrote:

> On 2019-Nov-13, Tom Lane wrote:
>
> > (BTW, a different question one could ask is exactly why
> > RelationBuildPartitionDesc is so profligate of leaked memory.)
>
> The original partitioning code (f0e44751d717) decided that it didn't
> want to bother with adding a "free" routine for PartitionBoundInfo
> structs, maybe because they hold too many pointers, so there's no way for
> RelationBuildPartitionDesc to free everything it allocates anyway. We
> could add a couple of pfrees and list_frees here and there, but for the
> main thing being leaked we'd need to improve that API.

Ah, we also leak an array of PartitionBoundSpec, which is a Node. Do we have any way to free those? I don't think we do. In short, it looks to me as if this function was explicitly designed with the idea that it'd be called in a temp mem context.

I looked at d3f48dfae42f again per your earlier suggestion. Doing that memory context dance for partitioned relations does seem to fix the problem too; we just need to move the context creation to just after ScanPgRelation, at which point we have the relkind. (Note: I think the problematic case is the partitioned table, not the partitions themselves. At least, with the attached patch the problem goes away. I guess it would be sensible to research whether we need to do this for relispartition=true as well, but I haven't done that.)

There is indeed some leakage for relations that have triggers too (or rules), but in order for those to become significant you would have to have thousands of triggers or rules ... and in reasonable designs, you just don't, because it doesn't make sense. But it is not totally unreasonable to have lots of partitions, and as we improve the system, more and more people will want to.

Aside: while messing with this I noticed how significant pg_strtok is as a resource hog when building partition descs (from the stringToNode that's applied to each partition's partbound).

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Attachment
On Wed, Nov 13, 2019 at 4:46 PM Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
> But it is not totally
> unreasonable to have lots of partitions, and as we improve the system,
> more and more people will want to.

Yep.

--
Robert Haas
EnterpriseDB: http://www.enterprisedb.com
The Enterprise PostgreSQL Company
On 11/13/19 4:45 PM, Alvaro Herrera wrote:
> But it is not totally
> unreasonable to have lots of partitions, and as we improve the system,
> more and more people will want to.

+1

This patch still applies, but there seems to be some disagreement on how to proceed. Any thoughts?

Regards,
--
-David
david@pgmasters.net
On 2020-Mar-24, David Steele wrote:

> This patch still applies but there seems to be some disagreement on
> how to proceed.

Actually, I don't think there's any disagreement regarding the patch I last posted. (There was disagreement on the previous patches, which were very different.) Tom suggested to look at the heuristics used for RECOVER_RELATION_BUILD_MEMORY, and the patch does exactly that.

It would be great if Kato Sho can try the original test case with my latest patch (the one in https://postgr.es/m/20191113214544.GA16060@alvherre.pgsql ) and let us know if it improves things.

The patch as posted generates these warnings in my current GCC that it didn't when I checked last, but they're harmless -- if/when I push, it'll be without the parens.

/pgsql/source/master/src/backend/utils/cache/relcache.c:1064:21: warning: equality comparison with extraneous parentheses [-Wparentheses-equality]
        if ((relp->relkind == RELKIND_PARTITIONED_TABLE)
             ~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1064:21: note: remove extraneous parentheses around the comparison to silence this warning
        if ((relp->relkind == RELKIND_PARTITIONED_TABLE)
            ~              ^                           ~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1064:21: note: use '=' to turn this equality comparison into an assignment
        if ((relp->relkind == RELKIND_PARTITIONED_TABLE)
                           ^~
                           =
/pgsql/source/master/src/backend/utils/cache/relcache.c:1242:33: warning: equality comparison with extraneous parentheses [-Wparentheses-equality]
        if ((relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
             ~~~~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1242:33: note: remove extraneous parentheses around the comparison to silence this warning
        if ((relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
            ~                          ^                            ~
/pgsql/source/master/src/backend/utils/cache/relcache.c:1242:33: note: use '=' to turn this equality comparison into an assignment
        if ((relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
                                       ^~
                                       =
2 warnings generated.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
> On 24 Mar 2020, at 16:26, Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
> It would be great if Kato Sho can try the original test case with my latest
> patch (the one in https://postgr.es/m/20191113214544.GA16060@alvherre.pgsql )
> and let us know if it improves things.

Hi! Are you able to test Alvaro's latest patch to see if that solves the originally reported problem, so that we can reach closure on this item during the commitfest?

cheers ./daniel
On Tuesday, July 14, 2020 11:29 PM, Daniel Gustafsson wrote:
> Are you able to test Alvaro's latest patch to see if that solves the originally reported problem, so that we can reach
> closure on this item during the commitfest?

Sorry for the very late reply; I missed this mail. And thanks for writing the patches.

I am starting the test now. I'll report the result before the end of August.

Regards,
Sho Kato
On Wednesday, August 5, 2020 9:43 AM I wrote:
> I'll report the result before the end of August.

I tested v2-0001-build-partdesc-memcxt.patch at 9a9db08ae4 and it is OK.

First, I executed ALTER TABLE ADD CONSTRAINT FOREIGN KEY on the table which has 8k partitions. This query execution completed in about 22 hours without OOM.

Second, I confirmed the reduction in memory context usage. Since running with 8k partitions takes too long, I confirmed with 1k partitions. I used gdb and called MemoryContextStats(TopMemoryContext) at addFkRecurseReferencing(). The CacheMemoryContext size becomes smaller, so I think it is working as expected. The results are as follows.

- before applying patch

TopMemoryContext: 418896 total in 18 blocks; 91488 free (13 chunks); 327408 used
  pgstat TabStatusArray lookup hash table: 65536 total in 4 blocks; 16808 free (7 chunks); 48728 used
  TopTransactionContext: 4194304 total in 10 blocks; 1045728 free (18 chunks); 3148576 used
  TableSpace cache: 8192 total in 1 blocks; 2048 free (0 chunks); 6144 used
  Type information cache: 24624 total in 2 blocks; 2584 free (0 chunks); 22040 used
  Operator lookup cache: 24576 total in 2 blocks; 10712 free (4 chunks); 13864 used
  RowDescriptionContext: 8192 total in 1 blocks; 6880 free (0 chunks); 1312 used
  MessageContext: 8192 total in 1 blocks; 3064 free (0 chunks); 5128 used
  Operator class cache: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  smgr relation table: 32768 total in 3 blocks; 16768 free (8 chunks); 16000 used
  TransactionAbortContext: 32768 total in 1 blocks; 32504 free (0 chunks); 264 used
  Portal hash: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  TopPortalContext: 8192 total in 1 blocks; 7648 free (0 chunks); 544 used
    PortalContext: 9621216 total in 1179 blocks; 13496 free (13 chunks); 9607720 used:
  Relcache by OID: 16384 total in 2 blocks; 3424 free (3 chunks); 12960 used
  CacheMemoryContext: 4243584 total in 12 blocks; 1349808 free (12 chunks); 2893776 used
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_trigger_tgconstraint_index
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_trigger_oid_index
    index info: 2048 total in 2 blocks; 352 free (1 chunks); 1696 used: pg_inherits_relid_seqno_index
    partition descriptor: 65344 total in 12 blocks; 7336 free (4 chunks); 58008 used: accounts
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_inherits_parent_index
    partition key: 1024 total in 1 blocks; 160 free (0 chunks); 864 used: accounts
    ...
    index info: 2048 total in 2 blocks; 736 free (2 chunks); 1312 used: pg_database_oid_index
    index info: 2048 total in 2 blocks; 736 free (2 chunks); 1312 used: pg_authid_rolname_index
  WAL record construction: 49776 total in 2 blocks; 6344 free (0 chunks); 43432 used
  PrivateRefCount: 8192 total in 1 blocks; 2584 free (0 chunks); 5608 used
  MdSmgr: 8192 total in 1 blocks; 5528 free (0 chunks); 2664 used
  LOCALLOCK hash: 131072 total in 5 blocks; 26376 free (15 chunks); 104696 used
  Timezones: 104128 total in 2 blocks; 2584 free (0 chunks); 101544 used
  ErrorContext: 8192 total in 1 blocks; 7928 free (3 chunks); 264 used
Grand total: 19322960 bytes in 1452 blocks; 2743560 free (186 chunks); 16579400 used

- after applying patch

TopMemoryContext: 418896 total in 18 blocks; 91488 free (13 chunks); 327408 used
  pgstat TabStatusArray lookup hash table: 65536 total in 4 blocks; 16808 free (7 chunks); 48728 used
  TopTransactionContext: 4194304 total in 10 blocks; 1045728 free (18 chunks); 3148576 used
  RowDescriptionContext: 8192 total in 1 blocks; 6880 free (0 chunks); 1312 used
  MessageContext: 8192 total in 1 blocks; 3064 free (0 chunks); 5128 used
  Operator class cache: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  smgr relation table: 32768 total in 3 blocks; 16768 free (8 chunks); 16000 used
  TransactionAbortContext: 32768 total in 1 blocks; 32504 free (0 chunks); 264 used
  Portal hash: 8192 total in 1 blocks; 512 free (0 chunks); 7680 used
  TopPortalContext: 8192 total in 1 blocks; 7648 free (0 chunks); 544 used
    PortalContext: 9621216 total in 1179 blocks; 13496 free (13 chunks); 9607720 used:
  Relcache by OID: 16384 total in 2 blocks; 3424 free (3 chunks); 12960 used
  CacheMemoryContext: 2113600 total in 10 blocks; 556240 free (10 chunks); 1557360 used
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_trigger_tgconstraint_index
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_trigger_oid_index
    index info: 2048 total in 2 blocks; 352 free (1 chunks); 1696 used: pg_inherits_relid_seqno_index
    partition descriptor: 65344 total in 12 blocks; 7336 free (4 chunks); 58008 used: accounts
    index info: 2048 total in 2 blocks; 736 free (0 chunks); 1312 used: pg_inherits_parent_index
    partition key: 1024 total in 1 blocks; 160 free (0 chunks); 864 used: accounts
    ...
    index info: 2048 total in 2 blocks; 736 free (2 chunks); 1312 used: pg_database_oid_index
    index info: 2048 total in 2 blocks; 736 free (2 chunks); 1312 used: pg_authid_rolname_index
  WAL record construction: 49776 total in 2 blocks; 6344 free (0 chunks); 43432 used
  PrivateRefCount: 8192 total in 1 blocks; 2584 free (0 chunks); 5608 used
  MdSmgr: 8192 total in 1 blocks; 6360 free (0 chunks); 1832 used
  LOCALLOCK hash: 131072 total in 5 blocks; 26376 free (15 chunks); 104696 used
  Timezones: 104128 total in 2 blocks; 2584 free (0 chunks); 101544 used
  ErrorContext: 8192 total in 1 blocks; 7928 free (3 chunks); 264 used
Grand total: 17131488 bytes in 1441 blocks; 1936008 free (234 chunks); 15195480 used

Finally, I ran make check and all tests passed. So, I'll change this patch's status to Ready for Committer.

Regards,
Sho Kato
Hi Alvaro,

On Thu, Aug 6, 2020 at 4:25 PM kato-sho@fujitsu.com <kato-sho@fujitsu.com> wrote:
> On Wednesday, August 5, 2020 9:43 AM I wrote:
> > I'll report the result before the end of August.
>
> I tested v2-0001-build-partdesc-memcxt.patch at 9a9db08ae4 and it is OK.

Is this patch meant for HEAD or for back-patching? I ask because v13 got this:

commit 5b9312378e2f8fb35ef4584aea351c3319a10422
Author: Tom Lane <tgl@sss.pgh.pa.us>
Date:   Wed Dec 25 14:43:13 2019 -0500

    Load relcache entries' partitioning data on-demand, not immediately.

which prevents a partitioned table's PartitionDesc from being rebuilt repeatedly, as would happen before this commit in Kato-san's case, because it moves RelationBuildPartitionDesc out of the relcache flush code path. So, the OOM situation that Kato-san originally reported should not occur as of v13.

--
Amit Langote
EnterpriseDB: http://www.enterprisedb.com
On 2020-Aug-19, Amit Langote wrote:

Hello

> On Thu, Aug 6, 2020 at 4:25 PM kato-sho@fujitsu.com
> <kato-sho@fujitsu.com> wrote:
> > On Wednesday, August 5, 2020 9:43 AM I wrote:
> > > I'll report the result before the end of August.
> >
> > I tested v2-0001-build-partdesc-memcxt.patch at 9a9db08ae4 and it is OK.
>
> Is this patch meant for HEAD or for back-patching? I ask because v13 got this:
>
> commit 5b9312378e2f8fb35ef4584aea351c3319a10422
> Author: Tom Lane <tgl@sss.pgh.pa.us>
> Date:   Wed Dec 25 14:43:13 2019 -0500
>
>     Load relcache entries' partitioning data on-demand, not immediately.
>
> which prevents a partitioned table's PartitionDesc from being rebuilt
> repeatedly, as would happen before this commit in Kato-san's case,
> because it moves RelationBuildPartitionDesc out of the relcache flush
> code path.

Hmm, so this is a problem only in v11 and v12? It seems that putting the patch in master *only* is pointless. OTOH v11 had other severe performance drawbacks with lots of partitions, so it might not be needed there.

I admit I'm hesitant to carry code in only one or two stable branches that exists nowhere else. But maybe the problem is serious enough in those branches (which will still live for quite a few years) that we should get it there.

OTOH it could be argued that the coding in master is not all that great anyway (given the willingness for memory to be leaked), so it should apply to all three branches.

--
Álvaro Herrera                https://www.2ndQuadrant.com/
PostgreSQL Development, 24x7 Support, Remote DBA, Training & Services
Hi,

On Thu, Aug 20, 2020 at 3:06 AM Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
> On 2020-Aug-19, Amit Langote wrote:
> > Is this patch meant for HEAD or for back-patching? I ask because v13
> > got commit 5b9312378e2f ("Load relcache entries' partitioning data
> > on-demand, not immediately."), which prevents a partitioned table's
> > PartitionDesc from being rebuilt repeatedly in Kato-san's case.
>
> Hmm, so this is a problem only in v11 and v12? It seems that putting
> the patch in master *only* is pointless. OTOH v11 had other severe
> performance drawbacks with lots of partitions, so it might not be needed
> there.
>
> I admit I'm hesitant to carry code in only one or two stable branches
> that exists nowhere else. But maybe the problem is serious enough in
> those branches (which will still live for quite a few years) that we
> should get it there.
>
> OTOH it could be argued that the coding in master is not all that great
> anyway (given the willingness for memory to be leaked), so it should
> apply to all three branches.

Fwiw, I am fine with applying the memory-leak fix in all branches down to v12 if we are satisfied with the implementation. That said, I don't offhand know of any real-world use case beside Kato-san's that's affected by this leak. Kato-san's case is creating a foreign key referencing a partitioned table with many partitions, something that's only supported from v12. (You may have noticed that the leak that occurs when rebuilding the referencing table's PartitionDesc accumulates while addFkRecurseReferenced is looping over the referenced table's partitions.)

--
Amit Langote
EnterpriseDB: http://www.enterprisedb.com
On Thu, Aug 20, 2020 at 10:50 AM Amit Langote <amitlangote09@gmail.com> wrote:
> On Thu, Aug 20, 2020 at 3:06 AM Alvaro Herrera <alvherre@2ndquadrant.com> wrote:
> > On 2020-Aug-19, Amit Langote wrote:
> > > On Thu, Aug 6, 2020 at 4:25 PM kato-sho@fujitsu.com
> > > <kato-sho@fujitsu.com> wrote:
> > > > On Wednesday, August 5, 2020 9:43 AM I wrote:
> > > > > I'll report the result before the end of August.
> > > >
> > > > I tested v2-0001-build-partdesc-memcxt.patch at 9a9db08ae4 and it is OK.
>
> Fwiw, I am fine with applying the memory-leak fix in all branches down
> to v12 if we are satisfied with the implementation.

I have revised the above patch slightly to introduce a variable for the condition of whether to use a temporary memory context.

--
Amit Langote
EnterpriseDB: http://www.enterprisedb.com
Attachment
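The variable Amit mentions presumably just folds the partitioned-table case into the existing heuristic, along these lines; this is a guess at the shape of the code inside RelationBuildDesc, not the literal patch text:

    /*
     * Sketch: recover relcache-build scratch memory either when the
     * debugging heuristic demands it (CLOBBER_CACHE_ALWAYS builds define
     * RECOVER_RELATION_BUILD_MEMORY as 1) or when the relation is a
     * partitioned table, whose descriptor build allocates heavily.
     */
    bool        recover_build_memory;

    recover_build_memory = RECOVER_RELATION_BUILD_MEMORY ||
        relp->relkind == RELKIND_PARTITIONED_TABLE;

    if (recover_build_memory)
    {
        /* ... allocate subsidiary data in a droppable temp context ... */
    }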
Amit Langote <amitlangote09@gmail.com> writes:
>> Fwiw, I am fine with applying the memory-leak fix in all branches down
>> to v12 if we are satisfied with the implementation.

> I have revised the above patch slightly to introduce a variable for
> the condition of whether to use a temporary memory context.

This CF entry has been marked "ready for committer", which I find inappropriate since there doesn't seem to be any consensus about whether we need it.

I tried running the original test case under HEAD. I do not see any visible memory leak, which I think indicates that 5b9312378 or some other fix has taken care of the leak since the original report. However, after waiting awhile and noting that the ADD FOREIGN KEY wasn't finishing, I poked into its progress with a debugger and observed that each iteration of RI_Initial_Check() was taking about 15 seconds. I presume we have to do that for each partition, which leads to the estimate that it'll take 34 hours to finish this ... and that's with no data in the partitions, god help me if there'd been a lot.

Some quick "perf" work says that most of the time seems to be getting spent in the planner, in get_eclass_for_sort_expr(). So this is likely just a variant of performance issues we've seen before. (I do wonder why we're not able to prune the join to just the matching PK partition, though.)

Anyway, the long and the short of it is that this test case is far larger than anything anyone could practically use in HEAD, let alone in released branches. I can't get excited about back-patching a fix to a memory leak if that's just going to allow people to hit other performance-killing issues.

In short, I don't see a reason why we need this patch in any branch, so I recommend rejecting it. If we did think we needed a leak fix in the back branches, back-porting 5b9312378 would likely be a saner way to proceed.

regards, tom lane
On Fri, Sep 4, 2020 at 12:05 AM Tom Lane <tgl@sss.pgh.pa.us> wrote:
>
> I tried running the original test case under HEAD. I do not see
> any visible memory leak, which I think indicates that 5b9312378 or
> some other fix has taken care of the leak since the original report.
> However, after waiting awhile and noting that the ADD FOREIGN KEY
> wasn't finishing, I poked into its progress with a debugger and
> observed that each iteration of RI_Initial_Check() was taking about
> 15 seconds. I presume we have to do that for each partition,
> which leads to the estimate that it'll take 34 hours to finish this
> ... and that's with no data in the partitions, god help me if
> there'd been a lot.
>
> Some quick "perf" work says that most of the time seems to be
> getting spent in the planner, in get_eclass_for_sort_expr().
> So this is likely just a variant of performance issues we've
> seen before. (I do wonder why we're not able to prune the
> join to just the matching PK partition, though.)

Consider this example:

postgres=# create table t1 (a int, b int, CHECK (a between 100 and 150));
CREATE TABLE
postgres=# create table part(a int, b int) partition by range(a);
CREATE TABLE
postgres=# create table part_p1 partition of part for values from (0) to (50);
CREATE TABLE
postgres=# create table part_p2 partition of part for values from (50) to (100);
CREATE TABLE
postgres=# create table part_p3 partition of part for values from (100) to (150);
CREATE TABLE
postgres=# create table part_p4 partition of part for values from (150) to (200);
CREATE TABLE
postgres=# explain (costs off) select * from t1 r1, part r2 where r1.a = r2.a;
              QUERY PLAN
--------------------------------------
 Hash Join
   Hash Cond: (r2.a = r1.a)
   ->  Append
         ->  Seq Scan on part_p1 r2_1
         ->  Seq Scan on part_p2 r2_2
         ->  Seq Scan on part_p3 r2_3
         ->  Seq Scan on part_p4 r2_4
   ->  Hash
         ->  Seq Scan on t1 r1
(9 rows)

Given that t1.a cannot have any value less than 100 or greater than 150, no row in t1 will have a joining partner in part_p1 or part_p2, so those two partitions could be pruned. But I think we don't consider the constraints on a table when joining two tables, either to render a join empty or to prune partitions. That would be a good optimization, and it would improve this case as well.

Further to that, I think that when we add a constraint on the partitioned table which translates to constraints on the individual partitions, we should check the entire partitioned relation rather than the individual partitions. If we did that, we wouldn't need to plan a query for every partition. If the foreign key happens to be the partition key, e.g. in a star schema, this would use partitionwise join to further improve query performance. Somewhere in the future, we will be able to repartition the foreign key table by the foreign key and perform a partitionwise join.

--
Best Wishes,
Ashutosh Bapat
I ran into a situation that echoes the original one by Kato at the start of this thread: https://www.postgresql.org/message-id/OSAPR01MB374809E8DE169C8BF2B82CBD9F6B0%40OSAPR01MB3748.jpnprd01.prod.outlook.com

More below.

Tom Lane <tgl(at)sss(dot)pgh(dot)pa(dot)us> writes:
>
> I tried running the original test case under HEAD. I do not see
> any visible memory leak, which I think indicates that 5b9312378 or
> some other fix has taken care of the leak since the original report.
> However, after waiting awhile and noting that the ADD FOREIGN KEY
> wasn't finishing, I poked into its progress with a debugger and
> observed that each iteration of RI_Initial_Check() was taking about
> 15 seconds. I presume we have to do that for each partition,
> which leads to the estimate that it'll take 34 hours to finish this
> ... and that's with no data in the partitions, god help me if
> there'd been a lot.
>
> Some quick "perf" work says that most of the time seems to be
> getting spent in the planner, in get_eclass_for_sort_expr().
> So this is likely just a variant of performance issues we've
> seen before. (I do wonder why we're not able to prune the
> join to just the matching PK partition, though.)

I found that the original example from Kato now finishes in a little over a minute to create the FK constraint in Postgres 14-16. However, in my case I'm using composite partitions, and that is taking 60x as long for an equivalent number of partitions. I must emphasize this is with ZERO rows of data.

I'm using 1,200 partitions in my example to finish a bit faster, because that's my actual use case, and because it is less extreme than 8,000 partitions. If I reach my 1,200 with 80 top-level and 15 leaf-level partitions in a composite hierarchy (80 x 15 = 1,200 still), the speed is very slow. I'm using composite partitions primarily because in my real code I need LIST partitions, which don't support multiple keys, so I worked around that using composite partitions with one LIST key at each level. Other workarounds, like a concatenated single key, were messy for my use case.

I modified Kato's test case to reproduce the issue I'm having and saw some very odd query plan behavior, which is likely part of the issue. The flat version with 1,200 partitions at a single level and no composite partitions finishes in a little over a second, while the 80 x 15 version with composite partitions takes over a minute (60x longer). In my actual database, with many such partitions having FKs, the time compounds: FK creation takes >30 minutes per FK, leading to hours spent just making FKs.

== COMPOSITE PARTITION INTERNAL SELECT PLAN ==

If I cancel the composite FK creation, I see where it stopped, and that gives a clue about the difference in speed. For the composite case, it's this statement, with a plan linked on dalibo showing a massive number of sequential scans and Postgres making some assumptions about 1 row existing.
ERROR: canceling statement due to user request
CONTEXT: SQL statement SELECT fk."aid" FROM ONLY "public"."xhistory_25_12" fk LEFT OUTER JOIN "public"."xaccounts" pk ON (pk."aid" OPERATOR(pg_catalog.=) fk."aid") WHERE pk."aid" IS NULL AND (fk."aid" IS NOT NULL)
SQL state: 57014

PLAN DETAILS: https://explain.dalibo.com/plan/fad72gdacb6727b4#plan

== FLAT PARTITION INTERNAL SELECT PLAN ==

This gives a direct result node and has no complexity at all:

SELECT fk."aid" FROM ONLY "public"."history_23" fk LEFT OUTER JOIN "public"."accounts" pk ON (pk."aid" OPERATOR(pg_catalog.=) fk."aid") WHERE pk."aid" IS NULL AND (fk."aid" IS NOT NULL)

PLAN DETAILS: https://explain.dalibo.com/plan/a83dae9b9569ebcd

Test cases to reproduce easily below:

== FLAT PARTITION FAST FK DDL ==

CREATE DATABASE fastflatfk;

CREATE TABLE accounts (aid INTEGER, bid INTEGER, abalance INTEGER, filler CHAR(84)) PARTITION BY HASH(aid);
CREATE TABLE history (tid INTEGER, bid INTEGER, aid INTEGER, delta INTEGER, mtime TIMESTAMP, filler CHAR(22)) PARTITION BY HASH(aid);

DO $$
DECLARE
    p INTEGER;
BEGIN
    FOR p IN 0..1023 LOOP
        EXECUTE 'CREATE TABLE accounts_' || p || ' PARTITION OF accounts FOR VALUES WITH (modulus 1024, remainder ' || p || ') PARTITION BY HASH(aid);';
        EXECUTE 'CREATE TABLE history_' || p || ' PARTITION OF history FOR VALUES WITH (modulus 1024, remainder ' || p || ') PARTITION BY HASH(aid);';
    END LOOP;
END $$;

ALTER TABLE accounts ADD CONSTRAINT accounts_pk PRIMARY KEY (aid);

-- Query returned successfully in 1 secs 547 msec.
ALTER TABLE history ADD CONSTRAINT history_fk FOREIGN KEY (aid) REFERENCES accounts (aid) ON DELETE CASCADE;

-- run to drop FK before you recreate it
-- ALTER TABLE history DROP CONSTRAINT history_fk

== COMPOSITE PARTITION SLOW FK DDL ==

Now the composite partition version with 80 x 15 partitions, which finishes in a bit over a minute (60x the time):

CREATE DATABASE slowcompfk;

-- Create the parent tables for xaccounts and xhistory
CREATE TABLE xaccounts (aid INTEGER, bid INTEGER, abalance INTEGER, filler CHAR(84)) PARTITION BY HASH(aid);
CREATE TABLE xhistory (tid INTEGER, bid INTEGER, aid INTEGER, delta INTEGER, mtime TIMESTAMP, filler CHAR(22)) PARTITION BY HASH(aid);

-- Generate SQL for creating 80 partitions for xaccounts
DO $$
DECLARE
    p INTEGER;
BEGIN
    FOR p IN 0..79 LOOP
        EXECUTE 'CREATE TABLE xaccounts_' || p || ' PARTITION OF xaccounts FOR VALUES WITH (modulus 80, remainder ' || p || ') PARTITION BY HASH(aid);';
    END LOOP;
END $$;

-- Generate SQL for creating 15 sub-partitions within each partition for xaccounts
DO $$
DECLARE
    main_partition INTEGER;
    sub_partition INTEGER;
BEGIN
    FOR main_partition IN 0..79 LOOP
        FOR sub_partition IN 0..14 LOOP
            EXECUTE 'CREATE TABLE xaccounts_' || main_partition || '_' || sub_partition || ' PARTITION OF xaccounts_' || main_partition || ' FOR VALUES WITH (modulus 15, remainder ' || sub_partition || ');';
        END LOOP;
    END LOOP;
END $$;

-- Generate SQL for creating 80 partitions for xhistory
DO $$
DECLARE
    p INTEGER;
BEGIN
    FOR p IN 0..79 LOOP
        EXECUTE 'CREATE TABLE xhistory_' || p || ' PARTITION OF xhistory FOR VALUES WITH (modulus 80, remainder ' || p || ') PARTITION BY HASH(aid);';
    END LOOP;
END $$;

-- Generate SQL for creating 15 sub-partitions within each partition for xhistory
DO $$
DECLARE
    main_partition INTEGER;
    sub_partition INTEGER;
BEGIN
    FOR main_partition IN 0..79 LOOP
        FOR sub_partition IN 0..14 LOOP
            EXECUTE 'CREATE TABLE xhistory_' || main_partition || '_' || sub_partition || ' PARTITION OF xhistory_' || main_partition || ' FOR VALUES WITH (modulus 15, remainder ' || sub_partition || ');';
        END LOOP;
    END LOOP;
END $$;

ALTER TABLE xaccounts ADD CONSTRAINT xaccounts_pk PRIMARY KEY (aid);

-- Query returned successfully in 1 min 18 secs.
ALTER TABLE xhistory ADD CONSTRAINT xhistory_fk FOREIGN KEY (aid) REFERENCES xaccounts (aid) ON DELETE CASCADE;

-- run to drop FK before you recreate it
-- ALTER TABLE xhistory DROP CONSTRAINT xhistory_fk