From 1f962d31d9f71fafe729b5a25396cdce112b7646 Mon Sep 17 00:00:00 2001 From: Dilip Kumar Date: Thu, 15 May 2025 17:39:58 +0530 Subject: [PATCH v1 3/4] Provide support for global Index Scan Path In previous patches we have added support for creating the global index. Now in this patch we provide support in the planner to choose global index scan and index-only scan paths for the append rel. Currently we do not have support for selecting a bitmap scan using the global index. We may do that in the future, and if we need to do that we will need to change the executor such that we can build a separate tid bitmap for each leaf relation while scanning the global index and then do the bitmap heap scan one partition at a time based on the bitmap. We also do not support the parallel index scan using the global index. There is nothing blocking it as such, but this is still a TODO. Open Items - In table_slot_callbacks(), a partitioned table can now generate tuples via a global index scan, so we need a proper slot instead of just assigning a virtual slot. Perhaps this handling should be done through an AM callback? 
--- src/backend/access/index/genam.c | 19 ++ src/backend/access/index/indexam.c | 245 ++++++++++++++++++++++- src/backend/access/nbtree/nbtree.c | 10 +- src/backend/access/nbtree/nbtsearch.c | 71 +++++-- src/backend/catalog/partition.c | 4 - src/backend/commands/explain.c | 12 +- src/backend/executor/nodeIndexonlyscan.c | 25 ++- src/backend/executor/nodeIndexscan.c | 16 +- src/backend/optimizer/path/allpaths.c | 12 ++ src/backend/optimizer/path/indxpath.c | 40 +++- src/backend/optimizer/plan/planmain.c | 4 +- src/backend/optimizer/plan/planner.c | 139 ++++++++++++- src/backend/optimizer/util/appendinfo.c | 60 +++++- src/backend/optimizer/util/plancat.c | 43 ++-- src/backend/optimizer/util/var.c | 1 + src/backend/parser/parse_utilcmd.c | 1 + src/backend/utils/adt/selfuncs.c | 4 + src/backend/utils/cache/plancache.c | 15 ++ src/bin/psql/describe.c | 15 +- src/include/access/genam.h | 6 + src/include/access/nbtree.h | 3 + src/include/access/relscan.h | 8 +- src/include/nodes/pathnodes.h | 21 ++ src/include/nodes/plannodes.h | 3 + src/include/optimizer/appendinfo.h | 2 + 25 files changed, 725 insertions(+), 54 deletions(-) diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index c2b80669aa..13bd1e90b7 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -126,6 +126,25 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_hitup = NULL; scan->xs_hitupdesc = NULL; + /* + * Set a flag to indicate a global index scan and create a cache for + * partition ID to relation OID lookup. This is necessary because a global + * index stores the partition ID along with each tuple, and when fetching a + * tuple, we need to convert that partition ID into a relation OID. For + * more details, refer to the comments above the PartitionId typedef. 
+ */ + if (RelationIsGlobalIndex(indexRelation)) + { + scan->xs_global_index = true; + scan->xs_global_index_cache = + create_globalindex_partition_cache(CurrentMemoryContext); + } + else + { + scan->xs_global_index = false; + scan->xs_global_index_cache = NULL; + } + return scan; } diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 3aa1fc92df..4e18d8150d 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -104,11 +104,35 @@ do { \ CppAsString(pname), RelationGetRelationName(scan->indexRelation)); \ } while(0) +/* + * Lookup table from relation oid to the relation descriptor and + * IndexFetchTableData structure. Because only once we should call + * table_index_fetch_begin() for each partition but in scan->xs_heapfetch we + * will overwrite with the current partition so if we come back to the old + * partition which we already have scanned once then we should use the same + * xs_heapfetch and that we can get from the cache. + */ +typedef struct GlobalIndexPartitionCacheData +{ + MemoryContext pdir_mcxt; + HTAB *pdir_hash; +} GlobalIndexPartitionCacheData; + +typedef struct GlobalIndexPartitionCacheEntry +{ + Oid reloid; + Relation relation; + IndexFetchTableData *heapfetch; +} GlobalIndexPartitionCacheEntry; + static IndexScanDesc index_beginscan_internal(Relation indexRelation, int nkeys, int norderbys, Snapshot snapshot, ParallelIndexScanDesc pscan, bool temp_snap); static inline void validate_relation_kind(Relation r); - +static GlobalIndexPartitionCacheEntry *globalindex_partition_entry_lookup( + GlobalIndexPartitionCache pdir, + Oid relid); +static void globalindex_partition_cache_reset(GlobalIndexPartitionCache pdir); /* ---------------------------------------------------------------- * index_ interface functions @@ -270,12 +294,29 @@ index_beginscan(Relation heapRelation, * Save additional parameters into the scandesc. Everything else was set * up by RelationGetIndexScan. 
*/ - scan->heapRelation = heapRelation; scan->xs_snapshot = snapshot; scan->instrument = instrument; - /* prepare to fetch index matches from table */ - scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + /* + * For global index do not set the heapRelation and xs_heapfetch because + * while scanning the index we might get tids belongs to different + * partitions so we will initialize these fields when we actually fetch the + * tid from the index as that time we will know the relation oid from where + * we need to fetch the tid. + */ + if (scan->xs_global_index) + { + scan->heapRelation = NULL; + scan->xs_heapfetch = NULL; + } + else + { + scan->heapRelation = heapRelation; + + /* prepare to fetch index matches from table */ + scan->xs_heapfetch = table_index_fetch_begin(heapRelation); + } + return scan; } @@ -365,7 +406,23 @@ index_rescan(IndexScanDesc scan, Assert(norderbys == scan->numberOfOrderBys); /* Release resources (like buffer pins) from table accesses */ - if (scan->xs_heapfetch) + if (scan->xs_global_index) + { + /* + * For the global index, also reset the xs_global_index_cache. + * Essentially, the global index will have multiple entries of + * xs_heapfetch corresponding to each partition. These entries will be + * reset inside globalindex_partition_cache_reset(). Here, we can + * simply set xs_heapfetch and heapRelation to NULL in the scan + * descriptor. For more details, refer to the comments inside + * index_beginscan(). 
+ */ + scan->heapRelation = NULL; + scan->xs_heapfetch = NULL; + if (scan->xs_global_index_cache) + globalindex_partition_cache_reset(scan->xs_global_index_cache); + } + else if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); scan->kill_prior_tuple = false; /* for safety */ @@ -386,7 +443,18 @@ index_endscan(IndexScanDesc scan) CHECK_SCAN_PROCEDURE(amendscan); /* Release resources (like buffer pins) from table accesses */ - if (scan->xs_heapfetch) + if (scan->xs_global_index) + { + /* + * For global index also reset the cache, interanlly this will + * deallocate the index fetch handle for each partition. + */ + if (scan->xs_global_index_cache) + globalindex_partition_cache_destroy(scan->xs_global_index_cache); + scan->heapRelation = NULL; + scan->xs_heapfetch = NULL; + } + else if (scan->xs_heapfetch) { table_index_fetch_end(scan->xs_heapfetch); scan->xs_heapfetch = NULL; @@ -442,7 +510,18 @@ index_restrpos(IndexScanDesc scan) CHECK_SCAN_PROCEDURE(amrestrpos); /* release resources (like buffer pins) from table accesses */ - if (scan->xs_heapfetch) + if (scan->xs_global_index) + { + /* + * For global index also reset the cache, interanlly this will reset + * the index fetch handle for each partition. + */ + if (scan->xs_global_index_cache) + globalindex_partition_cache_reset(scan->xs_global_index_cache); + scan->heapRelation = NULL; + scan->xs_heapfetch = NULL; + } + else if (scan->xs_heapfetch) table_index_fetch_reset(scan->xs_heapfetch); scan->kill_prior_tuple = false; /* for safety */ @@ -742,6 +821,15 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot * * the index. */ Assert(ItemPointerIsValid(&scan->xs_heaptid)); + + /* + * For global index we need to get the heapoid of the parittion + * relation from the scan descriptor stored by index scan and fetch the + * tuple from that relation. 
+ */ + if (scan->xs_global_index) + global_indexscan_setup_partrel(scan); + if (index_fetch_heap(scan, slot)) return true; } @@ -1085,3 +1173,146 @@ index_opclass_options(Relation indrel, AttrNumber attnum, Datum attoptions, return build_local_reloptions(&relopts, attoptions, validate); } + +/* + * Helper function for index_getnext_slot() and IndexOnlyNext for setting up + * a proper scan->heapRelation and scan->xs_heapfetch during global index scan + * as global index will return tids which belongs to different partitions. + */ +void +global_indexscan_setup_partrel(IndexScanDesc scan) +{ + Oid relid; + GlobalIndexPartitionCacheEntry *entry; + + relid = scan->xs_heapoid; + + /* + * During a global index scan, we might encounter index entries that belong + * to different partitions, which could be interleaved. Each time we get + * a new index tuple, we need to verify if the scan->heapRelation matches + * the relid of that tuple. If it does not, we fetch the corresponding + * entry from the cache and store it in the scan descriptor. + */ + if (scan->heapRelation == NULL) + { + entry = globalindex_partition_entry_lookup( + scan->xs_global_index_cache, relid); + + scan->heapRelation = entry->relation; + scan->xs_heapfetch = entry->heapfetch; + } + else if (scan->heapRelation && + relid != RelationGetRelid(scan->heapRelation)) + { + table_index_fetch_reset(scan->xs_heapfetch); + + entry = globalindex_partition_entry_lookup( + scan->xs_global_index_cache, relid); + scan->heapRelation = entry->relation; + scan->xs_heapfetch = entry->heapfetch; + } +} + +/* + * create_globalindex_partition_cache - Create index scan partition cache + * + * For more details about this cache refer comments atop + * GlobalIndexPartitionCacheData structure. 
+ */ +GlobalIndexPartitionCache +create_globalindex_partition_cache(MemoryContext mcxt) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(mcxt); + GlobalIndexPartitionCache pdir; + HASHCTL ctl; + + MemSet(&ctl, 0, sizeof(HASHCTL)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(GlobalIndexPartitionCacheEntry); + ctl.hcxt = mcxt; + + pdir = palloc(sizeof(GlobalIndexPartitionCacheData)); + pdir->pdir_mcxt = mcxt; + pdir->pdir_hash = hash_create("globalIndex partitionId cache", 256, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + MemoryContextSwitchTo(oldcontext); + return pdir; +} + +/* + * globalindex_partition_entry_lookup + * + * Lookup the relation descriptor and index heap fetch handle for the given + * relid. If the entry is not found, it will open the relation, initialize the + * index fetch on that relation, and store it in the cache for subsequent + * references. + */ +static GlobalIndexPartitionCacheEntry * +globalindex_partition_entry_lookup(GlobalIndexPartitionCache pdir, Oid relid) +{ + GlobalIndexPartitionCacheEntry *pde; + bool found; + Relation part_rel; + + Assert(OidIsValid(relid)); + Assert(pdir); + pde = hash_search(pdir->pdir_hash, &relid, HASH_FIND, &found); + if (found) + return pde; + else + { + pde = hash_search(pdir->pdir_hash, &relid, HASH_ENTER, &found); + part_rel = relation_open(relid, AccessShareLock); + pde->relation = part_rel; + pde->heapfetch = table_index_fetch_begin(part_rel); + } + + return pde; +} + +/* + * globalindex_partition_entry_lookup - destory the cache + * + * This will destory the GlobalIndexPartitionCache and also deallocate index + * fetch for each cache entry whereever it was initialized. 
+ */ +void +globalindex_partition_cache_destroy(GlobalIndexPartitionCache pdir) +{ + HASH_SEQ_STATUS status; + GlobalIndexPartitionCacheEntry *pde; + + hash_seq_init(&status, pdir->pdir_hash); + while ((pde = hash_seq_search(&status)) != NULL) + { + if (pde->heapfetch) + { + table_index_fetch_end(pde->heapfetch); + pde->heapfetch = NULL; + } + + relation_close(pde->relation, NoLock); + } +} + +/* + * globalindex_partition_entry_lookup - reset the cache + * + * This will reset the GlobalIndexPartitionCache and also reset the index + * fetch for each cache entry if it was initialized. + */ +static void +globalindex_partition_cache_reset(GlobalIndexPartitionCache pdir) +{ + HASH_SEQ_STATUS status; + GlobalIndexPartitionCacheEntry *entry; + + hash_seq_init(&status, pdir->pdir_hash); + while ((entry = hash_seq_search(&status))) + { + if (entry->heapfetch) + table_index_fetch_reset(entry->heapfetch); + } +} diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index c3960784eb..e310ddcea6 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -228,7 +228,15 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) BTScanOpaque so = (BTScanOpaque) scan->opaque; bool res; - Assert(scan->heapRelation != NULL); + /* + * When working with global indexes, the scan's heap relation + * (scan->heapRelation) is not set beforehand. Instead, it's populated by + * the index scan interfaces, dynamically determined based on the TID being + * processed. This is because global index tuples explicitly carry the heap + * OID (along with the TID) to identify the originating heap relation. 
+ */ + Assert(RelationIsGlobalIndex(scan->indexRelation) || + scan->heapRelation != NULL); /* btree indexes are never lossy */ scan->xs_recheck = false; diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 36544ecfd5..44841394df 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -35,13 +35,14 @@ static int _bt_binsrch_posting(BTScanInsert key, Page page, static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool firstpage); static void _bt_saveitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, IndexTuple itup); + OffsetNumber offnum, IndexTuple itup, Oid heapOid); static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, ItemPointer heapTid, - IndexTuple itup); + IndexTuple itup, Oid heapOid); static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset); + ItemPointer heapTid, int tupleOffset, + Oid heapOid); static inline void _bt_returnitem(IndexScanDesc scan, BTScanOpaque so); static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); static bool _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, @@ -1608,6 +1609,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, bool arrayKeys; int itemIndex, indnatts; + Oid heapOid; /* save the page/buffer block number, along with its sibling links */ page = BufferGetPage(so->currPos.buf); @@ -1718,6 +1720,27 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, iid); Assert(!BTreeTupleIsPivot(itup)); + /* + * For global index we also need to fetch the relation oid in order + * to know from which relation we need to fetch tuple. 
+ */ + if (RelationIsGlobalIndex(scan->indexRelation)) + { + heapOid = BTreeTupleGetPartitionRelid(scan->indexRelation, itup); + + /* + * If the partition is already detcahed then we will get an + * InvalidOid so ignore such tuples. + */ + if (!OidIsValid(heapOid)) + { + offnum = OffsetNumberNext(offnum); + continue; + } + } + else + heapOid = InvalidOid; + pstate.offnum = offnum; passes_quals = _bt_checkkeys(scan, &pstate, arrayKeys, itup, indnatts); @@ -1743,7 +1766,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, if (!BTreeTupleIsPosting(itup)) { /* Remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(so, itemIndex, offnum, itup, heapOid); itemIndex++; } else @@ -1757,14 +1780,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, tupleOffset = _bt_setuppostingitems(so, itemIndex, offnum, BTreeTupleGetPostingN(itup, 0), - itup); + itup, heapOid); itemIndex++; /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { _bt_savepostingitem(so, itemIndex, offnum, BTreeTupleGetPostingN(itup, i), - tupleOffset); + tupleOffset, heapOid); itemIndex++; } } @@ -1883,6 +1906,24 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, itup = (IndexTuple) PageGetItem(page, iid); Assert(!BTreeTupleIsPivot(itup)); + /* + * For global index we also need to fetch the partition id in order + * to know from which relation we need to fetch tuple. We might + * get an InvalidOid if the partition is already detcahed so ignore + * such tuples. 
+ */ + if (RelationIsGlobalIndex(scan->indexRelation)) + { + heapOid = BTreeTupleGetPartitionRelid(scan->indexRelation, itup); + if (!OidIsValid(heapOid)) + { + offnum = OffsetNumberNext(offnum); + continue; + } + } + else + heapOid = InvalidOid; + pstate.offnum = offnum; if (arrayKeys && offnum == minoff && pstate.forcenonrequired) { @@ -1931,7 +1972,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, { /* Remember it */ itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(so, itemIndex, offnum, itup, heapOid); } else { @@ -1951,14 +1992,14 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, tupleOffset = _bt_setuppostingitems(so, itemIndex, offnum, BTreeTupleGetPostingN(itup, 0), - itup); + itup, heapOid); /* Remember additional TIDs */ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) { itemIndex--; _bt_savepostingitem(so, itemIndex, offnum, BTreeTupleGetPostingN(itup, i), - tupleOffset); + tupleOffset, heapOid); } } } @@ -2002,12 +2043,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum, /* Save an index item into so->currPos.items[itemIndex] */ static void _bt_saveitem(BTScanOpaque so, int itemIndex, - OffsetNumber offnum, IndexTuple itup) + OffsetNumber offnum, IndexTuple itup, Oid heapOid) { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; Assert(!BTreeTupleIsPivot(itup) && !BTreeTupleIsPosting(itup)); + currItem->heapOid = heapOid; currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; if (so->currTuples) @@ -2032,12 +2074,13 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, */ static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, IndexTuple itup) + ItemPointer heapTid, IndexTuple itup, Oid heapOid) { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; Assert(BTreeTupleIsPosting(itup)); + currItem->heapOid = heapOid; currItem->heapTid = *heapTid; currItem->indexOffset = offnum; if 
(so->currTuples) @@ -2070,10 +2113,11 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum, */ static inline void _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, - ItemPointer heapTid, int tupleOffset) + ItemPointer heapTid, int tupleOffset, Oid heapOid) { BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + currItem->heapOid = heapOid; currItem->heapTid = *heapTid; currItem->indexOffset = offnum; @@ -2100,6 +2144,9 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so) Assert(so->currPos.itemIndex <= so->currPos.lastItem); /* Return next item, per amgettuple contract */ + /* For global index we must have a valid heap oid. */ + Assert(!scan->xs_global_index || OidIsValid(currItem->heapOid)); + scan->xs_heapoid = currItem->heapOid; scan->xs_heaptid = currItem->heapTid; if (so->currTuples) scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 472a096206..48bd2066a1 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -44,10 +44,6 @@ static void get_partition_ancestors_worker(Relation inhRel, Oid relid, * * If the partition is in the process of being detached, an error is thrown, * unless even_if_detached is passed as true. - * - * Note: Because this function assumes that the relation whose OID is passed - * as an argument will have precisely one parent, it should only be called - * when it is known that the relation is a partition. 
*/ Oid get_partition_parent(Oid relid, bool even_if_detached) diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 7e2792ead7..0721135200 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1442,10 +1442,18 @@ ExplainNode(PlanState *planstate, List *ancestors, pname = sname = "Gather Merge"; break; case T_IndexScan: - pname = sname = "Index Scan"; + if (get_rel_relkind(((IndexScan *) plan)->indexid) == + RELKIND_GLOBAL_INDEX) + pname = sname = "Global Index Scan"; + else + pname = sname = "Index Scan"; break; case T_IndexOnlyScan: - pname = sname = "Index Only Scan"; + if (get_rel_relkind(((IndexScan *) plan)->indexid) == + RELKIND_GLOBAL_INDEX) + pname = sname = "Global Index Only Scan"; + else + pname = sname = "Index Only Scan"; break; case T_BitmapIndexScan: pname = sname = "Bitmap Index Scan"; diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f464cca950..f85962b88a 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -43,6 +43,7 @@ #include "storage/bufmgr.h" #include "storage/predicate.h" #include "utils/builtins.h" +#include "utils/lsyscache.h" #include "utils/rel.h" @@ -124,6 +125,14 @@ IndexOnlyNext(IndexOnlyScanState *node) CHECK_FOR_INTERRUPTS(); + /* + * For global index we need to get the heapoid of the parittion + * relation from the scan descriptor stored by index scan in order to + * check the visibility map of that relation. + */ + if (scandesc->xs_global_index) + global_indexscan_setup_partrel(scandesc); + /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. 
In any case, @@ -534,6 +543,7 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) TupleDesc tupDesc; int indnkeyatts; int namecount; + const TupleTableSlotOps *tts_cb; /* * create state structure @@ -569,14 +579,25 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags) ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc, &TTSOpsVirtual); + /* + * FIXME: Global index scans on partitioned tables require + * TTSOpsBufferHeapTuple, but partitioned tables normally get TTSOpsVirtual + * (no TableAM). We currently hack this by assuming partitions with global + * indexes are Heap AM. Proper TableAM integration for partitioned tables + * is needed for slot allocation. + */ + if (get_rel_relkind(node->indexid) == RELKIND_GLOBAL_INDEX) + tts_cb = &TTSOpsBufferHeapTuple; + else + tts_cb = table_slot_callbacks(currentRelation); + /* * We need another slot, in a format that's suitable for the table AM, for * when we need to fetch a tuple from the table for rechecking visibility. */ indexstate->ioss_TableSlot = ExecAllocTableSlot(&estate->es_tupleTable, - RelationGetDescr(currentRelation), - table_slot_callbacks(currentRelation)); + RelationGetDescr(currentRelation), tts_cb); /* * Initialize result type and projection info. 
The node's targetlist will diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c index 7fcaa37fe6..6cd041330d 100644 --- a/src/backend/executor/nodeIndexscan.c +++ b/src/backend/executor/nodeIndexscan.c @@ -911,6 +911,7 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) IndexScanState *indexstate; Relation currentRelation; LOCKMODE lockmode; + const TupleTableSlotOps *tts_cb; /* * create state structure @@ -935,12 +936,23 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags) indexstate->ss.ss_currentRelation = currentRelation; indexstate->ss.ss_currentScanDesc = NULL; /* no heap scan here */ + /* + * FIXME: Global index scans on partitioned tables require + * TTSOpsBufferHeapTuple, but partitioned tables normally get TTSOpsVirtual + * (no TableAM). We currently hack this by assuming partitions with global + * indexes are Heap AM. Proper TableAM integration for partitioned tables + * is needed for slot allocation. + */ + if (get_rel_relkind(node->indexid) == RELKIND_GLOBAL_INDEX) + tts_cb = &TTSOpsBufferHeapTuple; + else + tts_cb = table_slot_callbacks(currentRelation); + /* * get the scan type from the relation descriptor. */ ExecInitScanTupleSlot(estate, &indexstate->ss, - RelationGetDescr(currentRelation), - table_slot_callbacks(currentRelation)); + RelationGetDescr(currentRelation), tts_cb); /* * Initialize result type and projection. diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 6cc6966b06..230a98f221 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -1211,6 +1211,12 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, } } + /* + * We need to check the index predicate for the parent relation, as the + * parent relation may have global index scan paths. 
+ */ + check_index_predicates(root, rel); + if (has_live_children) { /* @@ -1303,6 +1309,12 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, /* Add paths to the append relation. */ add_paths_to_append_rel(root, rel, live_childrels); + + /* + * Partiotioned relation may have global indexes so lets consider index + * scan paths. + */ + create_index_paths(root, rel); } diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 601354ea3e..8fef652d4a 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -21,6 +21,7 @@ #include "access/sysattr.h" #include "catalog/pg_am.h" #include "catalog/pg_amop.h" +#include "catalog/pg_index_partitions.h" #include "catalog/pg_operator.h" #include "catalog/pg_opfamily.h" #include "catalog/pg_type.h" @@ -246,6 +247,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) IndexClauseSet jclauseset; IndexClauseSet eclauseset; ListCell *lc; + bool ispartitioned = IS_PARTITIONED_REL(rel); /* Skip the whole mess if no indexes */ if (rel->indexlist == NIL) @@ -259,6 +261,22 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) { IndexOptInfo *index = (IndexOptInfo *) lfirst(lc); + /* + * For partitioned relations, we can only consider global index scan + * paths. And for non partitioned relation ignore the indirect + * global indexes. + */ + if ((ispartitioned && index->idxkind != INDEX_GLOBAL_DIRECT) || + (!ispartitioned && index->idxkind != INDEX_LOCAL)) + continue; + + /* + * For non partitioned table we should not get the global index info. + * Check comments in get_relation_info() where we are adding + * IndexOptInfo nodes. 
+ */ + Assert(ispartitioned || index->idxkind != INDEX_GLOBAL_DIRECT); + /* Protect limited-size array in IndexClauseSets */ Assert(index->nkeycolumns <= INDEX_MAX_KEYS); @@ -2228,6 +2246,7 @@ check_index_only(RelOptInfo *rel, IndexOptInfo *index) { bool result; Bitmapset *attrs_used = NULL; + Bitmapset *rowidvar = NULL; Bitmapset *index_canreturn_attrs = NULL; ListCell *lc; int i; @@ -2248,6 +2267,21 @@ check_index_only(RelOptInfo *rel, IndexOptInfo *index) */ pull_varattnos((Node *) rel->reltarget->exprs, rel->relid, &attrs_used); + /* + * FIXME: Ugly hack to avoid global index only scan during update/delete. + * In normal case it is avoided because reltarget will have junkattribute + * which would not match with index_canreturn_attrs. But with global index + * we are creating this scan on parent table so we would have extra + * ROWID_VAR but that would not get caught while calling pull_varattnos + * with rel->relid so we are searching here with sepecific ROWID_VAR. + */ + if (rel->nparts != 0) + { + pull_varattnos((Node *) rel->reltarget->exprs, ROWID_VAR, &rowidvar); + if (rowidvar != NULL) + return false; + } + /* * Add all the attributes used by restriction clauses; but consider only * those clauses not implied by the index predicate, since ones that are @@ -2276,9 +2310,11 @@ check_index_only(RelOptInfo *rel, IndexOptInfo *index) /* * For the moment, we just ignore index expressions. It might be nice - * to do something with them, later. + * to do something with them, later. For global index we also add + * an internal partition id attribute so just ignore that as we don't + * need to return that attribute from index. 
*/ - if (attno == 0) + if (attno == 0 || attno == PartitionIdAttributeNumber) continue; if (index->canreturn[i]) diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index 5467e094ca..922b938f0b 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -20,6 +20,7 @@ */ #include "postgres.h" +#include "catalog/pg_inherits.h" #include "optimizer/appendinfo.h" #include "optimizer/clauses.h" #include "optimizer/optimizer.h" @@ -28,7 +29,8 @@ #include "optimizer/paths.h" #include "optimizer/placeholder.h" #include "optimizer/planmain.h" - +#include "storage/lmgr.h" +#include "storage/lockdefs.h" /* * query_planner diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 549aedcfa9..b63e9c47c1 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -22,6 +22,7 @@ #include "access/parallel.h" #include "access/sysattr.h" #include "access/table.h" +#include "catalog/partition.h" #include "catalog/pg_aggregate.h" #include "catalog/pg_inherits.h" #include "catalog/pg_proc.h" @@ -58,6 +59,7 @@ #include "parser/parsetree.h" #include "partitioning/partdesc.h" #include "rewrite/rewriteManip.h" +#include "storage/lmgr.h" #include "utils/backend_status.h" #include "utils/lsyscache.h" #include "utils/rel.h" @@ -267,7 +269,7 @@ static bool group_by_has_partkey(RelOptInfo *input_rel, static int common_prefix_cmp(const void *a, const void *b); static List *generate_setop_child_grouplist(SetOperationStmt *op, List *targetlist); - +static void lock_additional_rel(PlannerInfo *root); /***************************************************************************** * @@ -581,6 +583,7 @@ standard_planner(Query *parse, const char *query_string, int cursorOptions, result->utilityStmt = parse->utilityStmt; result->stmt_location = parse->stmt_location; result->stmt_len = parse->stmt_len; + result->lockrelOids = glob->lockRelOids; 
result->jitFlags = PGJIT_NONE; if (jit_enabled && jit_above_cost >= 0 && @@ -1176,6 +1179,13 @@ subquery_planner(PlannerGlobal *glob, Query *parse, PlannerInfo *parent_root, */ SS_identify_outer_params(root); + /* + * Prepare a list of additional relation OIDs to be locked if there is any + * global index on the result relation. Also lock those OIDs, for more + * details refer function header comments. + */ + lock_additional_rel(root); + /* * If any initPlans were created in this query level, adjust the surviving * Paths' costs and parallel-safety flags to account for them. The @@ -7748,12 +7758,13 @@ apply_scanjoin_target_to_paths(PlannerInfo *root, bool rel_is_partitioned = IS_PARTITIONED_REL(rel); PathTarget *scanjoin_target; ListCell *lc; + List *global_index_path_list = NIL; /* This recurses, so be paranoid. */ check_stack_depth(); /* - * If the rel is partitioned, we want to drop its existing paths and + * If the rel is partitioned, we want to drop its existing append paths and * generate new ones. This function would still be correct if we kept the * existing paths: we'd modify them to generate the correct target above * the partitioning Append, and then they'd compete on cost with paths @@ -7770,9 +7781,57 @@ apply_scanjoin_target_to_paths(PlannerInfo *root, * stanza. Hence, zap the main pathlist here, then allow * generate_useful_gather_paths to add path(s) to the main list, and * finally zap the partial pathlist. + * + * Note: All the partitioned rel paths which are build by appending child + * rel paths will be rebuilt again so we need to preserve the global index + * paths which are directly created on the partitioned relation. */ if (rel_is_partitioned) + { + List *newtarget = NIL; + PathTarget *index_scanjoin_target; + + /* + * Preprocess the scanjoin_targets and replace ROWID_VAR with the + * partitioned rel's varno, TODO - explain the reasoning here. 
+ */ + foreach(lc, scanjoin_targets) + { + PathTarget *target = lfirst_node(PathTarget, lc); + + target = copy_pathtarget(target); + target->exprs = (List *) + adjust_appendrel_rowid_vars(root, (Node *) target->exprs, + rel->relid); + newtarget = lappend(newtarget, target); + } + /* Extract SRF-free scan/join target. */ + index_scanjoin_target = linitial_node(PathTarget, newtarget); + + /* + * As explained in above comments, skip all paths other than the + * global index paths as other paths will be built again. So process + * the global index paths and apply the index_scanjoin_target to them. + */ + foreach(lc, rel->pathlist) + { + Path *path = (Path *) lfirst(lc); + Path *newpath; + + if (nodeTag(path) != T_IndexPath) + continue; + + newpath = (Path *) create_projection_path(root, rel, path, + index_scanjoin_target); + global_index_path_list = lappend(global_index_path_list, newpath); + } + + /* + * For now set the rel->pathlist to NIL and once we have regenerated + * the append paths add the other paths back to the list. + */ rel->pathlist = NIL; + } /* * If the scan/join target is not parallel-safe, partial paths cannot @@ -7935,6 +7994,9 @@ apply_scanjoin_target_to_paths(PlannerInfo *root, /* Build new paths for this relation by appending child paths. */ add_paths_to_append_rel(root, rel, live_children); + + if (global_index_path_list) + rel->pathlist = list_concat(rel->pathlist, global_index_path_list); } /* @@ -8248,3 +8310,76 @@ generate_setop_child_grouplist(SetOperationStmt *op, List *targetlist) return grouplist; } + + +/* + * lock_additional_rel + * Lock additional relations required in presence of a global index and + * also add those Oids to PlannerGlobal so that they can be re-acquired + * when reusing a cached plan. + * + * During DML operations on tables with global indexes, it's necessary to + * lock the entire partition tree up to the partitioned relation that holds + * the global index.
+ */ +static void +lock_additional_rel(PlannerInfo *root) +{ + Query *parse = root->parse; + RelOptInfo *rel; + ListCell *lc; + List *lockreloids = NIL; + + /* Nothing to do if there is no result relation. */ + if (parse->resultRelation <= 0) + return; + + /* + * Fetch the RelOptInfo of the result relation. If we haven't built it + * already then do it now. + */ + rel = find_base_rel_noerr(root, parse->resultRelation); + if (rel == NULL) + { + RangeTblEntry *rte = root->simple_rte_array[parse->resultRelation]; + + /* + * If we don't have global index on the result relation then we don't + * need to do anything. + */ + if (!get_rel_has_globalindex(rte->relid)) + return; + + rel = build_simple_rel(root, parse->resultRelation, NULL); + } + + /* + * Loop through all the indexes of the result relation and if it is a + * global index then lock all the inheritors under the relation on which + * this global index is created. Also store the list of all the OIDs + * in PlannerGlobal. + */ + foreach(lc, rel->indexlist) + { + IndexOptInfo *index = (IndexOptInfo *) lfirst(lc); + List *childrel = NIL; + + if (index->idxkind == INDEX_LOCAL) + continue; + + if (list_member_oid(lockreloids, index->indrelid)) + continue; + + /* + * Acquire lock on top level parent on which the global index is + * created and also lock all its inheritors. 
*/ + LockRelationOid(index->indrelid, RowExclusiveLock); + lockreloids = lappend_oid(lockreloids, index->indrelid); + childrel = find_all_inheritors(index->indrelid, RowExclusiveLock, + NULL); + lockreloids = list_concat(lockreloids, childrel); + } + + root->glob->lockRelOids = + list_concat_unique_oid(root->glob->lockRelOids, lockreloids); +} diff --git a/src/backend/optimizer/util/appendinfo.c b/src/backend/optimizer/util/appendinfo.c index 5b3dc0d865..2ad52cb497 100644 --- a/src/backend/optimizer/util/appendinfo.c +++ b/src/backend/optimizer/util/appendinfo.c @@ -32,6 +32,7 @@ typedef struct { PlannerInfo *root; int nappinfos; + int varno; AppendRelInfo **appinfos; } adjust_appendrel_attrs_context; @@ -41,7 +42,8 @@ static void make_inh_translation_list(Relation oldrelation, AppendRelInfo *appinfo); static Node *adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context); - +static Node *adjust_appendrel_rowid_vars_mutator(Node *node, + adjust_appendrel_attrs_context *context); /* * make_append_rel_info @@ -529,6 +531,62 @@ adjust_appendrel_attrs_mutator(Node *node, return expression_tree_mutator(node, adjust_appendrel_attrs_mutator, context); } +/* + * Replace ROWID_VAR with the varno. + * + * This is similar to the adjust_appendrel_attrs(), except here instead of + * preparing the scantarget for the appendrel we are preparing for the + * partitioned rel, so varno of the partitioned rel is passed as input and we + * need to replace the ROWID_VAR with the input varno. + */ +Node * +adjust_appendrel_rowid_vars(PlannerInfo *root, Node *node, int varno) +{ + adjust_appendrel_attrs_context context; + + context.root = root; + context.nappinfos = 0; + context.varno = varno; + + /* Should never be translating a Query tree.
*/ + Assert(node == NULL || !IsA(node, Query)); + + return adjust_appendrel_rowid_vars_mutator(node, &context); +} + +static Node * +adjust_appendrel_rowid_vars_mutator(Node *node, + adjust_appendrel_attrs_context *context) +{ + if (node == NULL) + return NULL; + if (IsA(node, Var)) + { + Var *var = (Var *) copyObject(node); + + if (var->varno == ROWID_VAR) + { + RowIdentityVarInfo *ridinfo = (RowIdentityVarInfo *) + list_nth(context->root->row_identity_vars, var->varattno - 1); + + /* Substitute the Var given in the RowIdentityVarInfo */ + var = copyObject(ridinfo->rowidvar); + + /* Replace the ROWID_VAR with the varno of the partitioned rel. */ + var->varno = context->varno; + /* identity vars shouldn't have nulling rels */ + Assert(var->varnullingrels == NULL); + /* varnosyn in the RowIdentityVarInfo is probably wrong */ + var->varnosyn = 0; + var->varattnosyn = 0; + } + + return (Node *) var; + } + return expression_tree_mutator(node, adjust_appendrel_rowid_vars_mutator, + (void *) context); +} + /* * adjust_appendrel_attrs_multilevel * Apply Var translations from an appendrel parent down to a child. diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index c716f9a6fe..576a7f97f4 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -35,6 +35,7 @@ #include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" +#include "nodes/pathnodes.h" #include "nodes/supportnodes.h" #include "optimizer/cost.h" #include "optimizer/optimizer.h" @@ -268,15 +269,6 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, continue; } - /* - * TODO: Global index scan paths are not yet supported. - */ - if (RelationIsGlobalIndex(indexRelation)) - { - index_close(indexRelation, NoLock); - continue; - } - /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. 
See @@ -293,7 +285,13 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info = makeNode(IndexOptInfo); + /* Set a flag to indicate this is a global index. */ + if (RelationIsGlobalIndex(indexRelation)) + info->idxkind = (index->indrelid == relationObjectId) ? + INDEX_GLOBAL_DIRECT : INDEX_GLOBAL_INDIRECT; + info->indexoid = index->indexrelid; + info->indrelid = index->indrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; @@ -333,15 +331,28 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, info->amoptionalkey = amroutine->amoptionalkey; info->amsearcharray = amroutine->amsearcharray; info->amsearchnulls = amroutine->amsearchnulls; - info->amcanparallel = amroutine->amcanparallel; info->amhasgettuple = (amroutine->amgettuple != NULL); - info->amhasgetbitmap = amroutine->amgetbitmap != NULL && - relation->rd_tableam->scan_bitmap_next_tuple != NULL; info->amcanmarkpos = (amroutine->ammarkpos != NULL && amroutine->amrestrpos != NULL); info->amcostestimate = amroutine->amcostestimate; Assert(info->amcostestimate != NULL); + /* + * TODO: Currently parallel and bitmap scans are not supported + * for the global indexes. + */ + if (info->idxkind != INDEX_LOCAL) + { + info->amcanparallel = false; + info->amhasgetbitmap = false; + } + else + { + info->amcanparallel = amroutine->amcanparallel; + info->amhasgetbitmap = amroutine->amgetbitmap != NULL && + relation->rd_tableam->scan_bitmap_next_tuple != NULL; + } + /* Fetch index opclass options */ info->opclassoptions = RelationGetIndexAttOptions(indexRelation, true); @@ -1932,7 +1943,13 @@ build_index_tlist(PlannerInfo *root, IndexOptInfo *index, /* simple column */ const FormData_pg_attribute *att_tup; - if (indexkey < 0) + /* + * If the attribute number is PartitionIdAttributeNumber then + * directly assign to the predefined partitionid_attr constant. 
+ */ + if (indexkey == PartitionIdAttributeNumber) + att_tup = &partitionid_attr; + else if (indexkey < 0) att_tup = SystemAttributeDefinition(indexkey); else att_tup = TupleDescAttr(heapRelation->rd_att, indexkey - 1); diff --git a/src/backend/optimizer/util/var.c b/src/backend/optimizer/util/var.c index 8065237a18..3fd7bc949f 100644 --- a/src/backend/optimizer/util/var.c +++ b/src/backend/optimizer/util/var.c @@ -21,6 +21,7 @@ #include "postgres.h" #include "access/sysattr.h" +#include "catalog/pg_index_partitions.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/optimizer.h" diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index d354f44e66..1dc7fd2ae4 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -4266,6 +4266,7 @@ transformPartitionCmd(CreateStmtContext *cxt, PartitionCmd *cmd) RelationGetRelationName(parentRel)))); break; case RELKIND_INDEX: + case RELKIND_GLOBAL_INDEX: /* the index must be partitioned */ ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index ce6a626eba..7d3082a54b 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -6500,6 +6500,8 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata, /* Ignore non-ordering indexes */ if (index->sortopfamily == NULL) continue; + if (index->idxkind != INDEX_LOCAL) + continue; /* * Ignore partial indexes --- we only want stats that cover the entire @@ -6720,6 +6722,8 @@ get_actual_variable_endpoint(Relation heapRel, InitNonVacuumableSnapshot(SnapshotNonVacuumable, GlobalVisTestFor(heapRel)); + Assert(!RelationIsGlobalIndex(indexRel)); + index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, NULL, 1, 0); diff --git a/src/backend/utils/cache/plancache.c b/src/backend/utils/cache/plancache.c index 89a1c79e98..412628872c 100644 --- 
a/src/backend/utils/cache/plancache.c +++ b/src/backend/utils/cache/plancache.c @@ -1928,6 +1928,21 @@ AcquireExecutorLocks(List *stmt_list, bool acquire) else UnlockRelationOid(rte->relid, rte->rellockmode); } + + /* + * Loop through the lockrelOids derived based on the result relations + * and acquire lock on all the relations. We may store the lockmode as + * well along with the oid but we can directly use RowExclusiveLock + * because these are derived from result relations and result relations + * are locked in this mode. + */ + foreach_oid(relid, plannedstmt->lockrelOids) + { + if (acquire) + LockRelationOid(relid, RowExclusiveLock); + else + UnlockRelationOid(relid, RowExclusiveLock); + } } } diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index 778ec2815c..8624ece5d7 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -1923,7 +1923,8 @@ describeOneTableDetails(const char *schemaname, attgenerated_col = cols++; } if (tableinfo.relkind == RELKIND_INDEX || - tableinfo.relkind == RELKIND_PARTITIONED_INDEX) + tableinfo.relkind == RELKIND_PARTITIONED_INDEX || + tableinfo.relkind == RELKIND_GLOBAL_INDEX) { if (pset.sversion >= 110000) { @@ -2308,7 +2309,8 @@ describeOneTableDetails(const char *schemaname, } if (tableinfo.relkind == RELKIND_INDEX || - tableinfo.relkind == RELKIND_PARTITIONED_INDEX) + tableinfo.relkind == RELKIND_PARTITIONED_INDEX || + tableinfo.relkind == RELKIND_GLOBAL_INDEX) { /* Footer information about an index */ PGresult *result; @@ -2412,7 +2414,8 @@ describeOneTableDetails(const char *schemaname, /* * If it's a partitioned index, we'll print the tablespace below */ - if (tableinfo.relkind == RELKIND_INDEX) + if (tableinfo.relkind == RELKIND_INDEX || + tableinfo.relkind == RELKIND_GLOBAL_INDEX) add_tablespace_footer(&cont, tableinfo.relkind, tableinfo.tablespace, true); } @@ -3666,6 +3669,7 @@ add_tablespace_footer(printTableContent *const cont, char relkind, relkind == RELKIND_INDEX || relkind ==
RELKIND_PARTITIONED_TABLE || relkind == RELKIND_PARTITIONED_INDEX || + relkind == RELKIND_GLOBAL_INDEX || relkind == RELKIND_TOASTVALUE) { /* @@ -4055,6 +4059,7 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys " WHEN " CppAsString2(RELKIND_FOREIGN_TABLE) " THEN '%s'" " WHEN " CppAsString2(RELKIND_PARTITIONED_TABLE) " THEN '%s'" " WHEN " CppAsString2(RELKIND_PARTITIONED_INDEX) " THEN '%s'" + " WHEN " CppAsString2(RELKIND_GLOBAL_INDEX) " THEN '%s'" " END as \"%s\",\n" " pg_catalog.pg_get_userbyid(c.relowner) as \"%s\"", gettext_noop("Schema"), @@ -4068,6 +4073,7 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys gettext_noop("foreign table"), gettext_noop("partitioned table"), gettext_noop("partitioned index"), + gettext_noop("global index"), gettext_noop("Type"), gettext_noop("Owner")); cols_so_far = 4; @@ -4148,7 +4154,8 @@ listTables(const char *tabtypes, const char *pattern, bool verbose, bool showSys appendPQExpBufferStr(&buf, CppAsString2(RELKIND_MATVIEW) ","); if (showIndexes) appendPQExpBufferStr(&buf, CppAsString2(RELKIND_INDEX) "," - CppAsString2(RELKIND_PARTITIONED_INDEX) ","); + CppAsString2(RELKIND_PARTITIONED_INDEX) "," + CppAsString2(RELKIND_GLOBAL_INDEX) ","); if (showSeq) appendPQExpBufferStr(&buf, CppAsString2(RELKIND_SEQUENCE) ","); if (showSystem || pattern) diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 5b2ab181b5..ec032ceda6 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -15,6 +15,8 @@ #define GENAM_H #include "access/htup.h" +#include "access/itup.h" +#include "access/relscan.h" #include "access/sdir.h" #include "access/skey.h" #include "nodes/tidbitmap.h" @@ -265,6 +267,10 @@ extern SysScanDesc systable_beginscan_ordered(Relation heapRelation, extern HeapTuple systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction); extern void systable_endscan_ordered(SysScanDesc sysscan); +extern Relation 
globalindex_partition_rel_lookup(GlobalIndexPartitionCache pdir, Oid relid); +extern void globalindex_partition_cache_destroy(GlobalIndexPartitionCache pdir); +extern GlobalIndexPartitionCache create_globalindex_partition_cache(MemoryContext mcxt); +extern void global_indexscan_setup_partrel(IndexScanDesc scan); extern void systable_inplace_update_begin(Relation relation, Oid indexId, bool indexOK, diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index cf7ddb0131..435a74749a 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1009,6 +1009,9 @@ typedef BTVacuumPostingData *BTVacuumPosting; typedef struct BTScanPosItem /* what we remember about each match */ { + Oid heapOid; /* Oid of the partition relation , only valid for + global indexes because global index can hold tuples + from multiple partitions */ ItemPointerData heapTid; /* TID of referenced heap item */ OffsetNumber indexOffset; /* index item's location within page */ LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */ diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c..8d0925d504 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -125,6 +125,8 @@ typedef struct IndexFetchTableData struct IndexScanInstrumentation; +typedef struct GlobalIndexPartitionCacheData *GlobalIndexPartitionCache; + /* * We use the same IndexScanDescData structure for both amgettuple-based * and amgetbitmap-based index scans. 
Some fields are only relevant in @@ -168,7 +170,9 @@ typedef struct IndexScanDescData struct TupleDescData *xs_itupdesc; /* rowtype descriptor of xs_itup */ HeapTuple xs_hitup; /* index data returned by AM, as HeapTuple */ struct TupleDescData *xs_hitupdesc; /* rowtype descriptor of xs_hitup */ - + Oid xs_heapoid; /* Oid of the partition relation, only valid + for global indexes because global index can + hold tuples from multiple partitions */ ItemPointerData xs_heaptid; /* result */ bool xs_heap_continue; /* T if must keep walking, potential * further results */ @@ -189,6 +193,8 @@ typedef struct IndexScanDescData /* parallel index scan information, in shared memory */ struct ParallelIndexScanDescData *parallel_scan; + bool xs_global_index; + GlobalIndexPartitionCache xs_global_index_cache; } IndexScanDescData; /* Generic structure for parallel scans */ diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 6567759595..fbae020a4c 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -153,6 +153,9 @@ typedef struct PlannerGlobal /* type OIDs for PARAM_EXEC Params */ List *paramExecTypes; + /* additional relation OIDs to be locked for global index */ + List *lockRelOids; + /* highest PlaceHolderVar ID assigned */ Index lastPHId; @@ -856,6 +859,13 @@ typedef enum RelOptKind RELOPT_OTHER_UPPER_REL, } RelOptKind; +typedef enum IndexKind +{ + INDEX_LOCAL, + INDEX_GLOBAL_DIRECT, + INDEX_GLOBAL_INDIRECT +} IndexKind; + /* * Is the given relation a simple relation i.e a base or "other" member * relation? @@ -1143,6 +1153,14 @@ struct IndexOptInfo Oid indexoid; /* tablespace of index (not table) */ Oid reltablespace; + + /* + * OID of the relation on which the index is created, for normal index we + * have RelOptInfo reference to identify that relation but for global index + * we explicitly need it as the global index might have been defined on + * some upper level parent relations.
+ */ + Oid indrelid; /* back-link to index's table; don't print, else infinite recursion */ RelOptInfo *rel pg_node_attr(read_write_ignore); @@ -1206,6 +1224,9 @@ struct IndexOptInfo */ List *indrestrictinfo; + /* whether the index is local or direct global or indirect global */ + IndexKind idxkind; + /* true if index predicate matches query */ bool predOK; /* true if a unique index */ diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 4f59e30d62..c07a8f14fc 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -122,6 +122,9 @@ typedef struct PlannedStmt /* OIDs of relations the plan depends on */ List *relationOids; + /* OIDs of relation to be locked */ + List *lockrelOids; + /* other dependencies, as PlanInvalItems */ List *invalItems; diff --git a/src/include/optimizer/appendinfo.h b/src/include/optimizer/appendinfo.h index d06f93b726..f8fd66c657 100644 --- a/src/include/optimizer/appendinfo.h +++ b/src/include/optimizer/appendinfo.h @@ -22,6 +22,8 @@ extern AppendRelInfo *make_append_rel_info(Relation parentrel, Index parentRTindex, Index childRTindex); extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, int nappinfos, AppendRelInfo **appinfos); +extern Node *adjust_appendrel_rowid_vars(PlannerInfo *root, Node *node, + int varno); extern Node *adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, RelOptInfo *childrel, RelOptInfo *parentrel); -- 2.49.0